In [1]:
import pandas as pd
from datetime import datetime

In [2]:
# Load the "filled" variant of the CMSA counts from the shared data directory.
cmsa_df_filled = pd.read_csv(filepath_or_buffer='../data/cmsa_small_filled.csv')

In [3]:
# Load the "fill_zero" variant of the CMSA counts from the shared data directory.
cmsa_df_zero = pd.read_csv(filepath_or_buffer='../data/cmsa_small_fill_zero.csv')

In [4]:
# Load the vacation table (provides `vacation_dummy` in the merged output).
vacation_df = pd.read_csv(filepath_or_buffer='../data/vacation.csv')

In [5]:
# Load the holiday table (provides `holiday_dummy` in the merged output).
holiday_df = pd.read_csv(filepath_or_buffer='../data/holiday.csv')

In [6]:
# Load the daily COVID stringency index (columns: `Day`, `stringency_index`).
covid_string = pd.read_csv(filepath_or_buffer='../data/covid-stringency.csv')

In [7]:
# Extend the stringency series by two extra days, repeating the last value in
# the file (63.89). The 2022-01-02 row presumably exists so that the later
# 15-minute resample, which stops at the last timestamp present, still covers
# every slot of 2022-01-01 up to `end_date` -- TODO confirm.
#
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0,
# so both rows are added with a single `pd.concat` instead.
new_row = {'Day': '2022-01-01', 'stringency_index': 63.89}
new_row2 = {'Day': '2022-01-02', 'stringency_index': 63.89}
covid_string = pd.concat(
    [covid_string, pd.DataFrame([new_row, new_row2])],
    ignore_index=True,
)

In [8]:
# Inspect the stringency table after the extra rows were appended.
covid_string

Unnamed: 0,Day,stringency_index
0,2020-09-01,50.93
1,2020-09-02,50.93
2,2020-09-03,50.93
3,2020-09-04,50.93
4,2020-09-05,50.93
...,...,...
484,2021-12-29,63.89
485,2021-12-30,63.89
486,2021-12-31,63.89
487,2022-01-01,63.89


In [9]:
def date_time_col(df):
    """Return a copy of ``df`` with ``datetime`` as Amsterdam local-time strings.

    The ``datetime`` column is parsed as UTC, converted to the
    ``Europe/Amsterdam`` timezone, and rendered as ``'%Y-%m-%d %H:%M:%S'``
    strings (the UTC offset is dropped by the formatting). The result is
    sorted ascending by that string column.

    Unlike the previous implementation, the frame passed in is left
    untouched: no temporary ``datetime_utc`` column is added to it and its
    ``datetime`` column is not overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``datetime`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, ``datetime`` holding local-time
        strings, rows ordered by ``datetime``.
    """
    local_time = pd.to_datetime(df["datetime"], utc=True).dt.tz_convert("Europe/Amsterdam")
    # `assign` builds a new frame, so the caller's DataFrame is not mutated.
    converted = df.assign(datetime=local_time.dt.strftime('%Y-%m-%d %H:%M:%S'))
    # The zero-padded format makes lexicographic order follow wall-clock order.
    return converted.sort_values(by="datetime", ascending=True)

In [10]:
# Normalise the `datetime` column of both calendar tables to Amsterdam
# local-time strings, sorted ascending.
vacation_df, holiday_df = map(date_time_col, (vacation_df, holiday_df))

In [11]:
# Render each `Day` as a midnight timestamp string in the same
# '%Y-%m-%d %H:%M:%S' layout used by the other tables, then rename the column
# to `datetime` so all frames share one key. No timezone conversion is applied
# here: the dates are only parsed (as UTC) and re-formatted.
covid_string = covid_string.assign(
    Day=pd.to_datetime(covid_string["Day"], utc=True).dt.strftime('%Y-%m-%d %H:%M:%S')
).rename(columns={"Day": "datetime"})

In [12]:
# Check the renamed `datetime` column and the reformatted timestamps.
covid_string

Unnamed: 0,datetime,stringency_index
0,2020-09-01 00:00:00,50.93
1,2020-09-02 00:00:00,50.93
2,2020-09-03 00:00:00,50.93
3,2020-09-04 00:00:00,50.93
4,2020-09-05 00:00:00,50.93
...,...,...
484,2021-12-29 00:00:00,63.89
485,2021-12-30 00:00:00,63.89
486,2021-12-31 00:00:00,63.89
487,2022-01-01 00:00:00,63.89


In [13]:
# Use `datetime` as the row index of every frame so they can later be aligned
# on it when concatenated column-wise.
(
    cmsa_df_filled,
    cmsa_df_zero,
    vacation_df,
    holiday_df,
    covid_string,
) = (
    frame.set_index('datetime')
    for frame in (cmsa_df_filled, cmsa_df_zero, vacation_df, holiday_df, covid_string)
)

In [14]:
# Preview the first rows of the filled counts, now indexed by `datetime`.
cmsa_df_filled.head()

Unnamed: 0_level_0,GAWW-11,GAWW-12,GAWW-14
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-09-01 00:00:00,104.0,113.0,32.0
2020-09-01 00:15:00,73.0,109.0,30.0
2020-09-01 00:30:00,84.0,88.0,27.0
2020-09-01 00:45:00,95.0,99.0,26.0
2020-09-01 01:00:00,63.0,84.0,43.0


In [15]:
# Convert the string index of both count frames into a real DatetimeIndex
# (needed for the time-based comparisons and alignment further down).
cmsa_df_filled.index, cmsa_df_zero.index = map(
    pd.DatetimeIndex, (cmsa_df_filled.index, cmsa_df_zero.index)
)

In [16]:
# Same conversion for the auxiliary tables: string index -> DatetimeIndex,
# which `resample` requires.
vacation_df.index, holiday_df.index, covid_string.index = map(
    pd.DatetimeIndex, (vacation_df.index, holiday_df.index, covid_string.index)
)

In [17]:
# Inclusive bounds of the analysis window; the end is the last 15-minute slot
# of 2022-01-01.
start_date, end_date = '2020-09-01 00:00:00', '2022-01-01 23:45:00'

In [18]:
# Row/column count of the filled counts frame, as a reference for the merge below.
cmsa_df_filled.shape

(46848, 3)

In [19]:
# Upsample the auxiliary tables to a 15-minute grid and forward-fill the
# values. `limit=96` caps the fill at 96 consecutive slots, i.e. 24 hours at
# 15-minute spacing, so a value is never carried further than that.
holiday_df, vacation_df, covid_string = (
    frame.resample('15min').ffill(limit=96)
    for frame in (holiday_df, vacation_df, covid_string)
)

In [20]:
# Keep only the rows whose timestamp lies inside [start_date, end_date]
# (both bounds inclusive).
vacation_df, holiday_df, covid_string = (
    frame[(frame.index >= start_date) & (frame.index <= end_date)]
    for frame in (vacation_df, holiday_df, covid_string)
)

In [21]:
# Column-wise merge on the shared datetime index: filled counts plus the
# vacation, holiday and stringency columns.
cmsa_df_filled_merged = pd.concat(
    objs=[cmsa_df_filled, vacation_df, holiday_df, covid_string],
    axis="columns",
)

In [22]:
# Column-wise merge on the shared datetime index: zero-filled counts plus the
# vacation, holiday and stringency columns.
cmsa_df_fill_zero_merged = pd.concat(
    objs=[cmsa_df_zero, vacation_df, holiday_df, covid_string],
    axis="columns",
)

In [23]:
# Inspect the merged frame built from the filled counts.
cmsa_df_filled_merged

Unnamed: 0_level_0,GAWW-11,GAWW-12,GAWW-14,vacation_dummy,holiday_dummy,stringency_index
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-09-01 00:00:00,104.0,113.0,32.0,0,0,50.93
2020-09-01 00:15:00,73.0,109.0,30.0,0,0,50.93
2020-09-01 00:30:00,84.0,88.0,27.0,0,0,50.93
2020-09-01 00:45:00,95.0,99.0,26.0,0,0,50.93
2020-09-01 01:00:00,63.0,84.0,43.0,0,0,50.93
...,...,...,...,...,...,...
2022-01-01 22:45:00,48.0,22.0,21.0,1,1,63.89
2022-01-01 23:00:00,28.0,16.0,21.0,1,1,63.89
2022-01-01 23:15:00,27.0,26.0,21.0,1,1,63.89
2022-01-01 23:30:00,29.0,19.0,15.0,1,1,63.89


In [24]:
# Inspect the merged frame built from the zero-filled counts.
cmsa_df_fill_zero_merged

Unnamed: 0_level_0,GAWW-11,GAWW-12,GAWW-14,vacation_dummy,holiday_dummy,stringency_index
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-09-01 00:00:00,104.0,113.0,32.0,0,0,50.93
2020-09-01 00:15:00,73.0,109.0,30.0,0,0,50.93
2020-09-01 00:30:00,84.0,88.0,27.0,0,0,50.93
2020-09-01 00:45:00,95.0,99.0,26.0,0,0,50.93
2020-09-01 01:00:00,63.0,84.0,43.0,0,0,50.93
...,...,...,...,...,...,...
2022-01-01 22:45:00,48.0,22.0,21.0,1,1,63.89
2022-01-01 23:00:00,28.0,16.0,21.0,1,1,63.89
2022-01-01 23:15:00,27.0,26.0,21.0,1,1,63.89
2022-01-01 23:30:00,29.0,19.0,15.0,1,1,63.89


In [25]:
# Persist the merged (filled) frame, datetime index included, next to the inputs.
cmsa_df_filled_merged.to_csv(path_or_buf='../data/cmsa_df_filled_merged.csv')

In [26]:
# Persist the merged (zero-filled) frame, datetime index included, next to the inputs.
cmsa_df_fill_zero_merged.to_csv(path_or_buf='../data/cmsa_df_fill_zero_merged.csv')