In [275]:
import pandas as pd

# Location of the raw CSV export.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only runs on
# this machine as written — consider a path relative to the repository.
csv_path = "/Users/jacob/Documents/GitHub/projects-2024-jacobogmads/Jacob/Data project/data.csv"

# The first line of the file is skipped, so the line after it supplies the column names.
# NOTE(review): non-ASCII letters in the labels come out garbled with this encoding —
# confirm the file's actual encoding.
df = pd.read_csv(csv_path, skiprows=[0], encoding='ISO-8859-1')
df.head()

Unnamed: 0,Unnamed: 1,.1,2001M01,2001M02,2001M03,2001M04,2001M05,2001M06,2001M07,2001M08,...,2023M05,2023M06,2023M07,2023M08,2023M09,2023M10,2023M11,2023M12,2024M01,2024M02
0,Indeks,,,,,,,,,,...,,,,,,,,,,
1,,07.2.1 Reservedele og tilbehï¿½r,77.5,77.9,78.3,78.3,78.3,78.2,78.4,78.7,...,109.9,110.2,110.7,110.8,110.8,111.2,111.0,111.2,111.9,112.1
2,,07.2.2 Brï¿½ndstof,71.5,73.9,72.3,75.5,79.8,75.9,73.8,73.1,...,128.7,129.9,128.4,137.3,139.6,135.7,130.0,124.9,131.1,135.0
3,,07.2.3 Vedligeholdelse og reparation af person...,63.2,63.3,63.2,63.7,63.8,63.9,64.2,64.7,...,121.8,121.9,122.0,122.0,122.1,122.2,122.4,122.4,123.5,126.0
4,,07.3.1.1 Personbefordring med tog,..,..,..,..,..,..,..,..,...,112.2,112.2,112.2,112.2,112.2,112.2,112.2,112.2,112.2,121.8


# Cleaning the data

We translate the Danish category labels into English. We create a dictionary that maps each original label to its English name, and then apply the replacement.

In [276]:
# Lookup table from the Danish category labels to English names. The keys are written
# exactly as the labels appear after loading, including the garbled non-ASCII characters.
# NOTE(review): the bus key has no space after the code "07.3.2.1", unlike every other
# key — confirm it matches the label in the data, otherwise that row stays untranslated.
var_dict = {
    '07.2.1 Reservedele og tilbehï¿½r': 'Spare parts and accessories',
    '07.2.2 Brï¿½ndstof': 'Fuel',
    '07.2.3 Vedligeholdelse og reparation af personlige transportmidler': 'Maintenance and repair of personal transportation equipment',
    '07.3.1.1 Personbefordring med tog': 'Passenger transport by train',
    '07.3.1.2 Personbefordring med metro': 'Passenger transport by metro',
    '07.3.2.1Personbefordring med bus': 'Passenger transport by bus',
    '07.3.2.2 Personbefordring med taxi og lejet bil med fï¿½rer': 'Passenger transport by taxi and rented car with driver',
    '07.3.3.1 Indenrigsflyvning': 'Domestic flights',
    '07.3.4 Personbefordring med fï¿½rge': 'Passenger transport by ferry',
    '07.3.4.1 Personbefordring ad sï¿½vejen': 'Passenger transport by sea',
    'ï¿½ndring i forhold til mï¿½neden fï¿½r (pct.)': 'Change compared to the previous month (pct.)',
    'ï¿½ndring i forhold til samme mï¿½ned ï¿½ret fï¿½r (pct.)': 'Change compared to the same month last year (pct.)',
}

# Swap every cell that exactly equals one of the Danish labels for its English name
df = df.replace(var_dict)

We continue by dropping the rows we are not interested in. We then reset the index.

In [277]:
# Keep only the rows at positions 1 to 10: the row at position 0 and everything from
# position 11 onwards are discarded, and the remaining rows are renumbered from 0.
df = df.iloc[1:11].reset_index(drop=True)

Now we rename the column that holds the category labels to `Category`.

In [278]:
# Name the column at position 1 'Category'.
# rename() builds a new column index. Writing into df.columns.values instead modifies the
# index's backing array behind pandas' back: an Index is meant to be immutable, so label
# lookups such as df['Category'] can be left stale.
df = df.rename(columns={df.columns[1]: 'Category'})

# Display a zero-row slice to check the resulting column names
df.head(0)

Unnamed: 0,Unnamed: 1,Category,2001M01,2001M02,2001M03,2001M04,2001M05,2001M06,2001M07,2001M08,...,2023M05,2023M06,2023M07,2023M08,2023M09,2023M10,2023M11,2023M12,2024M01,2024M02


We now want to average the monthly values within each year, so that they become comparable with the rest of our data. To do so, we have to do a bit of manipulation. First we need to ensure that our column names are correctly formatted.

In [279]:
# Strip leading/trailing spaces from column names
# Normalise the column labels in one pass: trim surrounding whitespace, then apply
# title case (the form in which 'Category' is expected).
df.columns = df.columns.str.strip().str.title()

We then replace ".." with NaN so that missing values are handled properly when we aggregate and average the observations.

In [280]:
# Replace '..' with NaN to properly handle missing values during aggregation
# Cells containing '..' are turned into pd.NA so that they count as missing values
# when the observations are aggregated.
df = df.replace(to_replace='..', value=pd.NA)

Now we make the conversion to long format.

In [281]:
# Convert the DataFrame from wide to long format to easily manipulate the dates and values
# Reshape wide -> long: one row per (Category, original column label) pair, with the
# former column label stored in 'Date' and the cell content stored in 'Value'.
df_long = df.melt(id_vars=["Category"], var_name="Date", value_name="Value")

# Force 'Value' to a numeric dtype; anything that cannot be parsed becomes NaN
df_long = df_long.assign(Value=pd.to_numeric(df_long["Value"], errors="coerce"))

We now convert the dates (the former column names) from the `yyyyMmm` format to a proper datetime format.

In [282]:
# Convert 'Date' from the custom format 'YYYYMmm' to datetime, correcting the format
# Parse the 'YYYYMmm' labels (e.g. '2001M01') as timestamps; labels that do not fit
# the pattern turn into NaT instead of raising.
parsed_dates = pd.to_datetime(df_long['Date'], format='%YM%m', errors='coerce')

# Store the parsed dates and discard the rows whose label was not a valid date
df_long = df_long.assign(Date=parsed_dates).dropna(subset=['Date'])

We finally group by category and year, and calculate the mean for each group.

In [283]:


# Calendar year of each observation, used as the second grouping key
obs_year = df_long['Date'].dt.year

# Average 'Value' within every (Category, year) group. After reset_index the year sits
# in a column that is still called 'Date'.
# NOTE(review): the monthly columns end at 2024M02, so the 2024 figure averages only the
# months present in the file and is not a full-year mean.
yearly_groups = df_long.groupby(['Category', obs_year])['Value']
df_yearly_mean = yearly_groups.mean().reset_index()

print(df_yearly_mean)


                        Category  Date       Value
0               Domestic flights  2001         NaN
1               Domestic flights  2002         NaN
2               Domestic flights  2003         NaN
3               Domestic flights  2004         NaN
4               Domestic flights  2005         NaN
..                           ...   ...         ...
235  Spare parts and accessories  2020  102.358333
236  Spare parts and accessories  2021  102.000000
237  Spare parts and accessories  2022  105.591667
238  Spare parts and accessories  2023  110.283333
239  Spare parts and accessories  2024  112.000000

[240 rows x 3 columns]
