# Importing packages and loading data

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display

from dstapi import DstApi # install with `pip install git+https://github.com/alemartinello/dstapi`
import pandas_datareader # install with `pip install pandas-datareader

%load_ext autoreload 
%autoreload 2
from jacob_data_script import *

df = pd.read_csv("/Users/jacob/Documents/GitHub/projects-2024-jacobogmads/Jacob/Data project/data.csv", encoding='ISO-8859-1', skiprows=[0])
df.head(20)

Unnamed: 0,Unnamed: 1,.1,2001M01,2001M02,2001M03,2001M04,2001M05,2001M06,2001M07,2001M08,...,2023M05,2023M06,2023M07,2023M08,2023M09,2023M10,2023M11,2023M12,2024M01,2024M02
0,Indeks,,,,,,,,,,...,,,,,,,,,,
1,,00 Forbrugerprisindekset i alt,76.7,77.3,77.7,78.0,78.4,78.3,78.1,78.1,...,116.0,116.4,118.5,117.7,117.4,117.7,117.3,116.7,117.8,118.4
2,,07.2 Drift af personlige transportmidler,68.9,69.9,69.3,70.8,72.7,71.0,70.3,70.2,...,122.5,123.2,123.0,126.4,127.5,126.1,124.4,122.5,125.3,127.1
3,,07.3.1.1 Personbefordring med tog,..,..,..,..,..,..,..,..,...,112.2,112.2,112.2,112.2,112.2,112.2,112.2,112.2,112.2,121.8
4,,07.3.1.2 Personbefordring med metro,..,..,..,..,..,..,..,..,...,116.4,116.4,116.4,116.4,116.4,116.4,116.4,116.4,116.4,126.8
5,,07.3.2.1Personbefordring med bus,..,..,..,..,..,..,..,..,...,111.9,111.9,111.9,111.7,111.7,111.7,111.7,111.7,111.7,120.5
6,,07.3.2.2 Personbefordring med taxi og lejet bil med fører,..,..,..,..,..,..,..,..,...,131.3,131.3,131.3,131.3,131.3,131.7,131.7,133.0,133.0,133.0
7,,07.3.3.1 Indenrigsflyvning,..,..,..,..,..,..,..,..,...,81.0,84.9,88.4,86.8,92.4,92.0,97.3,97.2,92.2,99.6
8,,07.3.4.1 Personbefordring ad søvejen,..,..,..,..,..,..,..,..,...,111.8,124.6,131.4,127.2,119.9,117.3,105.6,106.8,110.9,109.7
9,"Prisindeksene for april 2020 - juni 2021 er mere usikre end normalt,",,,,,,,,,,...,,,,,,,,,,


# Cleaning the data

We rename the indexes. We create a dictionary with the proper names, and then perform the remapping.

In [2]:
# Define your mapping of Danish names to more meaningful names or translations
# Example:
var_dict = {
     '00 Forbrugerprisindekset i alt': 'General Consumer Price Index',
     '07.2 Drift af personlige transportmidler': 'Passenger transport by personal transportation',
     '07.2.1 Reservedele og tilbehï¿½r': 'Spare parts and accessories',
     '07.2.2 Brï¿½ndstof': 'Fuel',
     '07.2.3 Vedligeholdelse og reparation af personlige transportmidler': 'Maintenance and repair of personal transportation equipment',
     '07.3.1.1 Personbefordring med tog': 'Passenger transport by train',
     '07.3.1.2 Personbefordring med metro': 'Passenger transport by metro',
     '07.3.2.1Personbefordring med bus': 'Passenger transport by bus',
     '07.3.2.2 Personbefordring med taxi og lejet bil med fører': 'Passenger transport by taxi and rented car with driver',
     '07.3.3.1 Indenrigsflyvning': 'Personal transport by domestic flights',
     '07.3.4 Personbefordring med fï¿½rge': 'Passenger transport by ferry',
     '07.3.4.1 Personbefordring ad søvejen': 'Passenger transport by sea',
     'ï¿½ndring i forhold til mï¿½neden fï¿½r (pct.)': 'Change compared to the previous month (pct.)',
     'ï¿½ndring i forhold til samme mï¿½ned ï¿½ret fï¿½r (pct.)': 'Change compared to the same month last year (pct.)'
 }

# Rename the indexes
df.replace(var_dict, inplace=True)

We continue by droppping rows which we are not interested in. We then reset the index.

In [3]:
df = df.drop(df.index[9:])
df = df.drop(df.index[0])
df.reset_index(inplace = True, drop = True)
df

Unnamed: 0,Unnamed: 1,.1,2001M01,2001M02,2001M03,2001M04,2001M05,2001M06,2001M07,2001M08,...,2023M05,2023M06,2023M07,2023M08,2023M09,2023M10,2023M11,2023M12,2024M01,2024M02
0,,General Consumer Price Index,76.7,77.3,77.7,78.0,78.4,78.3,78.1,78.1,...,116.0,116.4,118.5,117.7,117.4,117.7,117.3,116.7,117.8,118.4
1,,Passenger transport by personal transportation,68.9,69.9,69.3,70.8,72.7,71.0,70.3,70.2,...,122.5,123.2,123.0,126.4,127.5,126.1,124.4,122.5,125.3,127.1
2,,Passenger transport by train,..,..,..,..,..,..,..,..,...,112.2,112.2,112.2,112.2,112.2,112.2,112.2,112.2,112.2,121.8
3,,Passenger transport by metro,..,..,..,..,..,..,..,..,...,116.4,116.4,116.4,116.4,116.4,116.4,116.4,116.4,116.4,126.8
4,,Passenger transport by bus,..,..,..,..,..,..,..,..,...,111.9,111.9,111.9,111.7,111.7,111.7,111.7,111.7,111.7,120.5
5,,Passenger transport by taxi and rented car with driver,..,..,..,..,..,..,..,..,...,131.3,131.3,131.3,131.3,131.3,131.7,131.7,133.0,133.0,133.0
6,,Personal transport by domestic flights,..,..,..,..,..,..,..,..,...,81.0,84.9,88.4,86.8,92.4,92.0,97.3,97.2,92.2,99.6
7,,Passenger transport by sea,..,..,..,..,..,..,..,..,...,111.8,124.6,131.4,127.2,119.9,117.3,105.6,106.8,110.9,109.7


Now we rename our index-column to Category.

In [4]:
df.columns.values[1] = 'Category'
df.iloc[[]]

Unnamed: 0,Unnamed: 1,Category,2001M01,2001M02,2001M03,2001M04,2001M05,2001M06,2001M07,2001M08,...,2023M05,2023M06,2023M07,2023M08,2023M09,2023M10,2023M11,2023M12,2024M01,2024M02


We now want to mean the monthly values for each year, so they become comparable with the rest of our data. To do so, we have to do a bit of manipulation. First we need to ensure, that our column names are correctly formatted.

In [5]:
# Strip leading/trailing spaces from column names
df.columns = df.columns.str.strip()

# Ensure column names are in the expected case, here assuming title case for 'Category'
df.columns = df.columns.str.title()

We then replace ".." to NaN to properly handle missing values when we aggregate and mean the observations.

In [6]:
# Replace '..' with NaN to properly handle missing values during aggregation
df.replace('..', pd.NA, inplace=True)

Now we make the conversion to long format.

In [7]:
# Convert the DataFrame from wide to long format to easily manipulate the dates and values
df_long = pd.melt(df, id_vars=["Category"], var_name="Date", value_name="Value")

# Ensure 'Value' is numeric and handle any conversion errors by coercing them to NaN
df_long['Value'] = pd.to_numeric(df_long['Value'], errors='coerce')

We now convert the column names from the format from yyyyMmm to a proper datetime format.

In [8]:
# Convert 'Date' from the custom format 'YYYYMmm' to datetime, correcting the format
df_long['Date'] = pd.to_datetime(df_long['Date'], format='%YM%m', errors='coerce')

# Dropping rows where Date conversion resulted in NaT to clean up the data
df_long.dropna(subset=['Date'], inplace=True)

We finally group by category and year, and calculate the mean for each group.

In [9]:


# Group by Category and Year, then calculate mean for each group
df_yearly_mean = df_long.groupby(['Category', df_long['Date'].dt.year])['Value'].mean().reset_index()

print(df_yearly_mean)


                                   Category  Date      Value
0              General Consumer Price Index  2001  78.025000
1              General Consumer Price Index  2002  79.916667
2              General Consumer Price Index  2003  81.575000
3              General Consumer Price Index  2004  82.516667
4              General Consumer Price Index  2005  84.016667
..                                      ...   ...        ...
187  Personal transport by domestic flights  2020  94.691667
188  Personal transport by domestic flights  2021  79.033333
189  Personal transport by domestic flights  2022  75.783333
190  Personal transport by domestic flights  2023  86.950000
191  Personal transport by domestic flights  2024  95.900000

[192 rows x 3 columns]


In [10]:
df_wide = df_yearly_mean.pivot(index='Category', columns='Date', values='Value')

# Since you may have NaNs in your original data as seen in the screenshot, 
# you might want to fill them with a value or forward-fill them
df_wide.fillna(method='ffill', inplace=True)  # forward fill to replace NaNs

# If you want to replace NaN with 0s instead, you can do:
# df_wide.fillna(0, inplace=True)

# Optionally, reset the index if you want 'Date' to be a column and not the index
df_wide.reset_index(inplace=True)

# Output the wide format DataFrame
df_wide.head()

Date,Category,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,General Consumer Price Index,78.025,79.916667,81.575,82.516667,84.016667,85.633333,87.083333,90.058333,91.233333,...,100.0,100.25,101.4,102.225,103.0,103.433333,105.35,113.458333,117.208333,118.1
1,Passenger transport by bus,78.025,79.916667,81.575,82.516667,84.016667,85.633333,87.083333,90.058333,86.883333,...,100.016667,100.375,100.625,101.983333,104.05,105.191667,105.908333,106.283333,111.35,116.1
2,Passenger transport by metro,78.025,79.916667,81.575,82.516667,84.016667,85.633333,87.083333,90.058333,86.883333,...,100.0,100.8,100.8,101.625,105.15,112.875,113.916667,112.533333,116.066667,121.6
3,Passenger transport by personal transportation,69.933333,71.35,72.566667,75.008333,78.758333,82.108333,84.341667,88.9,87.475,...,100.008333,98.441667,101.75,105.041667,105.866667,103.675,111.108333,124.791667,124.516667,126.2
4,Passenger transport by sea,69.933333,71.35,72.566667,75.008333,78.758333,82.108333,84.341667,88.9,78.108333,...,100.008333,98.741667,98.575,98.691667,103.833333,107.55,101.833333,109.866667,115.008333,110.3


The data from DST was originally index to january 2015 as base month. As we meaned the values for each year, 2015 is not equal to 100 anymore. We thus want to re-index the dataframe. We use our index_year function, which has been defined in the script, and which returns the dataframe indexed to a given year, in our case 2015.

In [11]:
df_wide_index2015 = index_to_year(df_wide, 2015)
df_wide_index2015.head()

Date,Category,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,General Consumer Price Index,78.025,79.916667,81.575,82.516667,84.016667,85.633333,87.083333,90.058333,91.233333,...,100.0,100.25,101.4,102.225,103.0,103.433333,105.35,113.458333,117.208333,118.1
1,Passenger transport by bus,78.011998,79.903349,81.561406,82.502916,84.002666,85.619063,87.068822,90.043326,86.868855,...,100.0,100.358274,100.608232,101.966339,104.032661,105.174138,105.890685,106.265622,111.331445,116.080653
2,Passenger transport by metro,78.025,79.916667,81.575,82.516667,84.016667,85.633333,87.083333,90.058333,86.883333,...,100.0,100.8,100.8,101.625,105.15,112.875,113.916667,112.533333,116.066667,121.6
3,Passenger transport by personal transportation,69.927506,71.344055,72.56062,75.002083,78.751771,82.101492,84.334639,88.892592,87.467711,...,100.0,98.433464,101.741522,105.032914,105.857845,103.666361,111.099075,124.781268,124.506291,126.189484
4,Passenger transport by sea,69.927506,71.344055,72.56062,75.002083,78.751771,82.101492,84.334639,88.892592,78.101825,...,100.0,98.733439,98.566786,98.683443,103.824681,107.541038,101.824848,109.857512,114.99875,110.290809


And we save our dataframe as a long format as well in order to plot it easily for descriptive statistics, as well as renaming our dataframe to match the original name from DST.

In [12]:
pris111_wide = df_wide_index2015
pris111_long = pd.melt(pris111_wide, id_vars=["Category"], var_name="Date", value_name="Value")
pris111_long

Unnamed: 0,Category,Date,Value
0,General Consumer Price Index,2001,78.025000
1,Passenger transport by bus,2001,78.011998
2,Passenger transport by metro,2001,78.025000
3,Passenger transport by personal transportation,2001,69.927506
4,Passenger transport by sea,2001,69.927506
...,...,...,...
187,Passenger transport by personal transportation,2024,126.189484
188,Passenger transport by sea,2024,110.290809
189,Passenger transport by taxi and rented car with driver,2024,133.033258
190,Passenger transport by train,2024,116.951270


Our data for transportation prices has now been cleaned, ready to be used. 

# Descriptive statistics

We use our plot_e function to plot, and examine the data.

In [13]:
# interactive plot using widgets and the defined plot_e function
widgets.interact(plot_a, 
    df=widgets.fixed(pris111_long),
    category1=widgets.Dropdown(
        description='Category 1', 
        options=pris111_long['Category'].unique(),
        value=pris111_long['Category'].unique()[0]  # Default to the first unique category value
    ),
    category2=widgets.Dropdown(
        description='Category 2', 
        options=pris111_long['Category'].unique(),
        value=pris111_long['Category'].unique()[1] if len(pris111_long['Category'].unique()) > 1 else pris111_long['Category'].unique()[0]  # Default to the second unique category value if it exists, otherwise the first
    )
)

interactive(children=(Dropdown(description='Category 1', options=('General Consumer Price Index', 'Passenger t…

<function jacob_data_script.plot_e(df, category1, category2)>

### Description of trends

The General Consumer Index shows a consistent upward trend, reflecting a general increase in the consumer price index over time. There's a noticeable acceleration in growth after 2020, indicating a significant rise in consumer prices in the recent years.

Passenger Transport by Bus exhibits a gradual increase with some fluctuations. There's a notable rise after 2020, similar to the general consumer index, suggesting increased costs in bus transport. In specific, we see that the increase in passenger transport by bus rises less than the general consumer price index in the time after 2020.

Passenger Transport by Metro follows a slightly more volatile path than bus transport, with sharper increases and some periods of stability. It also shows a steep increase post-2020, emphasizing a significant jump in metro transport costs. In contrast to the price of transportation by bus, the price of transportation by Metro increases more than the general consumer price, making it relatively more expensive compared to travel by metro compared to the general consumer price index.

Passenger Transport by Personal Transportation shows a unique pattern with more pronounced fluctuations. There's a significant dip around 2016, followed by a rapid increase, particularly sharp after 2020, indicating volatile costs associated with personal transportation. Personal transportation is a category consisting of both cars, bikes, mopeds, motorcycles etc. and further includes the cost of buying, repairing, servicing etc.

Passenger Transport by Sea has a distinct trend with notable dips and recoveries, reflecting the variable costs associated with sea transport. Like others, it shows an upward trend after 2020, but with a notable dip before this recent rise.