In [30]:
import os
import sys

import numpy as np
# pandas is imported explicitly because later cells use `pd`; this cell did not
# import it before, so `pd` was only available if the star import below
# happened to re-export it.
import pandas as pd

# Define the path to the parent directory of data_analysis.
# The trailing slash is required: other cells build file paths with plain string
# concatenation (parent_dir + "data/...").
parent_dir = '/Users/nielsvanwinden/Projects/Projects/Inholland/Scenario_Generator/'

# Add the project's src directory to sys.path. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
src_dir = parent_dir + "src"
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Now the data_analysis module can be imported. The star import is kept so that
# every name the notebook already relies on (e.g. load_macro_economic_data)
# stays in scope.
# NOTE(review): prefer explicit names once the full set of used names is known.
from data_analysis import *

# Source data

In [31]:
# Read the macro-economic observations through the project loader.
df = load_macro_economic_data(parent_dir + "data/macro_economic.csv")

# Fix the column types in a single step:
#   - 'pit_date' is parsed into datetimes (YYYY-MM-DD)
#   - 'macro_code' is cast to str
df = df.assign(
    pit_date=lambda frame: pd.to_datetime(frame['pit_date'], format='%Y-%m-%d'),
    macro_code=lambda frame: frame['macro_code'].astype(str),
)

display(df)

Data loaded successfully from /Users/nielsvanwinden/Projects/Projects/Inholland/Scenario_Generator/data/macro_economic.csv.


Unnamed: 0,macro_code,pit_date,pit_index,pit_value
0,unemployment_rate,1970-12-31,0,0.016
1,unemployment_rate,1971-12-31,1,0.020
2,unemployment_rate,1972-12-31,2,0.029
3,unemployment_rate,1973-12-31,3,0.029
4,unemployment_rate,1974-12-31,4,0.033
...,...,...,...,...
597,inflation,2023-06-30,317,0.064
598,inflation,2023-07-31,318,0.053
599,inflation,2023-08-31,319,0.034
600,inflation,2023-09-30,320,-0.003


In [32]:
# Read the second input file through the same project loader.
inflation_df = load_macro_economic_data(parent_dir +"data/hcip_data.csv")

# Fix the column types in a single step:
#   - 'pit_date' is parsed into datetimes (YYYY-MM-DD)
#   - 'macro_code' is cast to str
#   - 'pit_value' is converted to float; unparseable entries become NaN
inflation_df = inflation_df.assign(
    pit_date=lambda frame: pd.to_datetime(frame['pit_date'], format='%Y-%m-%d'),
    macro_code=lambda frame: frame['macro_code'].astype(str),
    pit_value=lambda frame: pd.to_numeric(frame['pit_value'], errors='coerce').astype(float),
)

Data loaded successfully from /Users/nielsvanwinden/Projects/Projects/Inholland/Scenario_Generator/data/hcip_data.csv.


In [33]:
# Print the column dtypes of both frames, each listing followed by a blank line,
# to check that the shared columns have matching types before merging.
print(inflation_df.dtypes, df.dtypes, sep='\n\n', end='\n\n')

pit_date      datetime64[ns]
pit_value            float64
macro_code            object
dtype: object

macro_code            object
pit_date      datetime64[ns]
pit_index              int64
pit_value            float64
dtype: object



# Merge dataframes

In [34]:
# Append the inflation_df rows to df.
# Rows of df whose macro_code also occurs in inflation_df are dropped first, so
# re-running this cell replaces those rows instead of stacking a second copy.
# inflation_df has no 'pit_index' column, so its rows get NaN there after the
# concat (which also turns 'pit_index' into a float column).
already_merged = df["macro_code"].isin(inflation_df["macro_code"].unique())
df = pd.concat([df[~already_merged], inflation_df], ignore_index=True)
df

Unnamed: 0,macro_code,pit_date,pit_index,pit_value
0,unemployment_rate,1970-12-31,0.0,0.016
1,unemployment_rate,1971-12-31,1.0,0.020
2,unemployment_rate,1972-12-31,2.0,0.029
3,unemployment_rate,1973-12-31,3.0,0.029
4,unemployment_rate,1974-12-31,4.0,0.033
...,...,...,...,...
945,hicp,2024-08-31,,134.220
946,hicp,2024-09-30,,133.010
947,hicp,2024-10-31,,133.850
948,hicp,2024-11-30,,132.470


# Timeseries values

In [42]:
# Add the "time_series_value" column.
# Codes in level_codes keep pit_value unchanged. Codes in relative_change_codes
# get (value - previous value) / previous value, where "previous" is the
# preceding observation of the same macro_code in pit_date order; the first
# observation of each such code therefore stays NaN.
level_codes = ["unemployment_rate", "gdp_growth", "inflation"]
relative_change_codes = ["hpi", "hicp"]

for code in level_codes:
    is_code = df["macro_code"] == code
    df.loc[is_code, "time_series_value"] = df.loc[is_code, "pit_value"]

for code in relative_change_codes:
    is_code = df["macro_code"] == code
    # Order by date before shifting so the result does not depend on the frame's
    # current row order (the frame itself is only sorted in a later cell).
    # mergesort is stable, so rows sharing a date keep their relative order.
    values = df.loc[is_code].sort_values("pit_date", kind="mergesort")["pit_value"]
    previous = values.shift(1)
    # The assignment aligns on the index, so the sorted order need not be undone.
    df.loc[is_code, "time_series_value"] = (values - previous) / previous

In [43]:
# Show only the hpi rows to inspect their time_series_value entries.
hpi_rows = df.loc[df["macro_code"].eq("hpi")]
display(hpi_rows)

Unnamed: 0,macro_code,pit_date,pit_index,pit_value,time_series_value
25,hpi,1995-03-31,0.0,0.414,
26,hpi,1995-06-30,1.0,0.419,0.012077
27,hpi,1995-09-30,2.0,0.431,0.028640
28,hpi,1995-12-31,3.0,0.436,0.011601
34,hpi,1996-03-31,4.0,0.448,0.027523
...,...,...,...,...,...
898,hpi,2022-09-30,110.0,1.886,0.005867
906,hpi,2022-12-31,111.0,1.841,-0.023860
915,hpi,2023-03-31,112.0,1.812,-0.015752
923,hpi,2023-06-30,113.0,1.778,-0.018764


In [44]:
# Order the rows by date, then by macro code, and renumber the index from 0.
df = (
    df
    .sort_values(by=['pit_date', 'macro_code'], ascending=[True, True])
    .reset_index(drop=True)
)

# Print the non-null count per column for three of the series.
for code in ("hpi", "unemployment_rate", "gdp_growth"):
    print(df[df["macro_code"] == code].count())

macro_code           115
pit_date             115
pit_index            115
pit_value            115
time_series_value    114
dtype: int64
macro_code           54
pit_date             54
pit_index            54
pit_value            54
time_series_value    54
dtype: int64
macro_code           111
pit_date             111
pit_index            111
pit_value            111
time_series_value    111
dtype: int64


In [45]:
# Save df to dataset.csv (semicolon-separated, without the index column).
# The path is built from parent_dir, like the input paths, so the project
# location is defined in a single place.
df.to_csv(parent_dir + "data/dataset.csv", index=False, sep=';')

In [46]:
# Show the frame as left by the preceding cells (the same data that was written to dataset.csv).
display(df)

Unnamed: 0,macro_code,pit_date,pit_index,pit_value,time_series_value
0,unemployment_rate,1970-12-31,0.0,0.016,0.016000
1,unemployment_rate,1971-12-31,1.0,0.020,0.020000
2,unemployment_rate,1972-12-31,2.0,0.029,0.029000
3,unemployment_rate,1973-12-31,3.0,0.029,0.029000
4,unemployment_rate,1974-12-31,4.0,0.033,0.033000
...,...,...,...,...,...
945,hicp,2024-08-31,,134.220,0.003664
946,hicp,2024-09-30,,133.010,-0.009015
947,hicp,2024-10-31,,133.850,0.006315
948,hicp,2024-11-30,,132.470,-0.010310
