In [None]:
import pandas as pd
import numpy as np
import calendar
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from statsmodels.tsa.stattools import adfuller

**Cases and Deaths Data**

In [None]:
# Load the OWID extract and restrict it to the study window.
# NOTE(review): the window filter compares the raw index as strings, which assumes
# ISO `YYYY-MM-DD` dates so that string order equals date order — TODO confirm.
data = pd.read_csv("owid-covid-data_Ireland.csv", index_col = "date")
data = data[(data.index >= "2020-03-01") & (data.index <= "2021-11-01")]

# Take an explicit copy: the column assignments below must write to an independent
# frame, not to a filtered slice of `data`.
ireland_data = data[data['location'] == "Ireland"].copy()
ireland_data.index = pd.DatetimeIndex(ireland_data.index)

# Day-on-day change of the cumulative totals expressed as a share of the population.
# The first row has no predecessor, so its difference is set to 0.
ireland_data['delta_cases_per_capita'] = ireland_data['total_cases'].divide(ireland_data['population']).diff().fillna(0)
ireland_data['delta_deaths_per_capita'] = ireland_data['total_deaths'].divide(ireland_data['population']).diff().fillna(0)

In [None]:
# Quick look at the last 100 observations of daily new cases.
ireland_data['new_cases'].tail(100).plot()

In [None]:
# Back-fill gaps in the cumulative vaccination series with the next reported value.
# The result is assigned back to the frame: calling fillna(..., inplace=True) on a
# column selection acts on a temporary object and is not guaranteed to update
# `ireland_data` (it does not under pandas copy-on-write).
ireland_data['total_vaccinations'] = ireland_data['total_vaccinations'].bfill()
ireland_data['people_vaccinated'] = ireland_data['people_vaccinated'].bfill()

In [None]:
# Express both cumulative vaccination series as a share of the population.
ireland_data['total_vaccinations_per_capita'] = ireland_data['total_vaccinations'] / ireland_data['population']
ireland_data['people_vaccinated_per_capita'] = ireland_data['people_vaccinated'] / ireland_data['population']

In [None]:
# Daily count of newly vaccinated people: first difference of the cumulative
# `people_vaccinated` series (first row set to 0).
# This was previously derived from `total_vaccinations_per_capita`, i.e. from doses
# rather than people, and already per capita, so the later division by population
# scaled it twice.
ireland_data['new_people_vaccinated'] = ireland_data['people_vaccinated'].diff().fillna(0)

In [None]:
# Inspect the reported daily vaccination counts.
ireland_data.loc[:, 'new_vaccinations']

In [None]:
# Scale the daily vaccination series by population.
ireland_data['new_vaccinations_per_capita'] = ireland_data['new_vaccinations'] / ireland_data['population']
ireland_data['new_people_vaccinated_per_capita'] = ireland_data['new_people_vaccinated'] / ireland_data['population']

In [None]:
# Back-fill missing daily per-capita values with the next available observation.
# Assigned back explicitly; an in-place fillna on a column selection is not
# guaranteed to modify `ireland_data`.
ireland_data['new_vaccinations_per_capita'] = ireland_data['new_vaccinations_per_capita'].bfill()
ireland_data['new_people_vaccinated_per_capita'] = ireland_data['new_people_vaccinated_per_capita'].bfill()

In [None]:
# Replace any remaining gaps in the daily per-capita series with 0.
# Assigned back explicitly; an in-place fillna on a column selection is not
# guaranteed to modify `ireland_data`.
ireland_data['new_vaccinations_per_capita'] = ireland_data['new_vaccinations_per_capita'].fillna(0)
ireland_data['new_people_vaccinated_per_capita'] = ireland_data['new_people_vaccinated_per_capita'].fillna(0)

In [None]:
# Log first-differences of the cumulative per-capita series (first row set to 0).
log_total_vaccinations = np.log(ireland_data['total_vaccinations_per_capita'])
log_people_vaccinated = np.log(ireland_data['people_vaccinated_per_capita'])

ireland_data['log_new_vaccinations_per_capita'] = log_total_vaccinations.diff().fillna(0)
ireland_data['log_new_people_vaccinated_per_capita'] = log_people_vaccinated.diff().fillna(0)

In [None]:
def _adf_statistic(series, decimals):
    """Return the augmented Dickey-Fuller test statistic of `series`, rounded to `decimals` places."""
    return round(adfuller(series)[0], decimals)


# Levels
total_vax = _adf_statistic(ireland_data['total_vaccinations_per_capita'], 3)
total_ppl_vaxxed = _adf_statistic(ireland_data['people_vaccinated_per_capita'], 3)

# First differences
new_vax_t = _adf_statistic(ireland_data['new_vaccinations_per_capita'], 4)
new_ppl_vaxxed_t = _adf_statistic(ireland_data['new_people_vaccinated_per_capita'], 3)

# Log first differences
log_new_vax_t = _adf_statistic(ireland_data['log_new_vaccinations_per_capita'].fillna(0), 4)
log_new_ppl_vaxxed_t = _adf_statistic(ireland_data['log_new_people_vaccinated_per_capita'].fillna(0), 3)

In [None]:
# NOTE(review): AnchoredText is not used in this cell; the import is retained in case
# another cell relies on the name.
from matplotlib.offsetbox import AnchoredText

# Every panel is drawn from this date onwards.
plot_start = '2021-01-18'

fig, ax = plt.subplots(nrows = 3, gridspec_kw={"wspace": 0.2, 'hspace': 0}, sharex = True)

# Panel 1: cumulative per-capita series (levels).
ireland_data['total_vaccinations_per_capita'].loc[plot_start:].plot(ax = ax[0],label = f"Total Vaccinations. ADF Test Statistic: {total_vax}", color = 'purple')
ireland_data['people_vaccinated_per_capita'].loc[plot_start:].plot(ax = ax[0],label = f"Total People Vaccinated. ADF Test Statistic: {total_ppl_vaxxed}", color = 'darkcyan')
ax[0].legend(fontsize = 12, loc = 2)
ax[0].set_title("Raw vaccination ratios, first-differences and log-differences", fontsize = 15)

fig.set_size_inches(20, 18)

# Panel 2: first differences.
ireland_data['new_vaccinations_per_capita'].loc[plot_start:].plot(ax = ax[1],label = f"New Vaccinations. ADF Test Statistic: {new_vax_t}", color = 'purple')
ireland_data['people_vaccinated_per_capita'].loc[plot_start:].diff().fillna(0).plot(ax = ax[1],label = f"New People Vaccinated. ADF Test Statistic: {new_ppl_vaxxed_t}", color = 'darkcyan')
ax[1].legend(fontsize = 13, loc = 2)
ax[0].set_ylabel("Total", fontsize = 14)
ax[1].set_ylabel("New (Difference)", fontsize = 14)
ax[0].set_xlabel("Date", fontsize = 13)
ax[1].set_xlabel("Date", fontsize = 13)

# Panel 3: log first differences, with the people-vaccinated series on a secondary y-axis.
# The purple series is the vaccinations (doses) log-difference, so its legend entry now
# says so; it previously repeated the "People Vaccinated" label of the other series.
ireland_data['log_new_vaccinations_per_capita'].loc[plot_start:].plot(ax = ax[2], label = f"Log First Difference New Vaccinations. ADF Test Statistic: {log_new_vax_t}***", color = 'purple')
secondary_ax = ax[2].twinx()
ireland_data['log_new_people_vaccinated_per_capita'].loc[plot_start:].plot(ax = secondary_ax, label = f"Log First Difference People Vaccinated. ADF Test Statistic: {log_new_ppl_vaxxed_t}***", color = 'darkcyan')
ax[0].grid(linestyle = '--')
ax[1].grid(linestyle = '--')
ax[2].grid(linestyle = '--')
ax[2].legend(fontsize = 13, loc = 2)
secondary_ax.set_ylabel("Log First Difference People Vaccinated", fontsize = 14)

secondary_ax.legend(fontsize =12, loc = 1)
ax[2].set_ylabel("Log First Difference New Vaccinations", fontsize = 14)

In [None]:
fig, ax = plt.subplots(ncols = 1)
ax.set_ylabel(r"$\Delta$ cases per capita", fontsize = 14)

# Linear time index (0, 1, 2, ...) scaled by the mean daily change / 300 to sketch a trend line.
ireland_data['trend'] = np.arange(0, len(ireland_data))
mean_delta_cases = ireland_data['delta_cases_per_capita'].mean()
plausible_trend = ireland_data['trend'] * mean_delta_cases / 300

plausible_trend.plot(ax = ax, color = 'lightcoral', label = 'Plausible trend')
ireland_data['delta_cases_per_capita'].plot(ax = ax, color = 'mediumseagreen', label = r'$\Delta C_t$')

ax.legend(loc = 2, fontsize = 14)
fig.set_size_inches(17, 6)
ax.set_xlabel("Date", fontsize = 14)
ax.set_title("Plausible overall trend in new cases")
ax.grid(linestyle = '--', alpha = 0.5)

In [None]:
# Rows where the daily change in cases per capita is negative.
negative_delta = ireland_data['delta_cases_per_capita'] < 0
ireland_data[negative_delta]

In [None]:
# (target day, source day) pairs: each target day's delta is overwritten with the
# source day's value. Order matters: 2021-09-16 copies the value that 2021-09-15
# has just received from 2021-09-14.
carry_forward_pairs = [
    ('2021-06-30', '2021-06-29'),
    ('2021-09-02', '2021-09-01'),
    ('2021-09-15', '2021-09-14'),
    ('2021-09-16', '2021-09-15'),
]
for target_day, source_day in carry_forward_pairs:
    ireland_data.loc[target_day, 'delta_cases_per_capita'] = ireland_data.loc[source_day, 'delta_cases_per_capita']

**Assembling the Final Dataset**

In [None]:
# Start the modelling frame from Ireland's daily change in cases per capita.
cases_dataset = pd.DataFrame(ireland_data['delta_cases_per_capita'])

# Log first-differences of the cumulative per-capita vaccination series (first row 0).
# These are the same series that the ADF tests are run on earlier in the notebook.
# The previous version took the log-difference of the raw totals and then divided it
# by population, which only rescales a growth rate by a very large constant.
cases_dataset['log_new_vaccines_per_capita'] = np.log(ireland_data['total_vaccinations_per_capita']).diff().fillna(0)
cases_dataset['log_new_people_vaccinated_per_capita'] = np.log(ireland_data['people_vaccinated_per_capita']).diff().fillna(0)

**Country by Country Data**

In [None]:
# Columns kept from each comparison country's frame.
columns_of_interest = [
    'delta_cases_per_capita',
    'delta_deaths_per_capita',
]

In [None]:
def process_data(country, columns_of_interest, start = "2020-03-01", end = "2022-03-01"):
    """Load one country's OWID extract and return its per-capita daily changes.

    Reads ``owid-covid-data_{country}.csv`` from the working directory, keeps the rows
    for `country` whose date lies in ``[start, end]`` (inclusive), and derives the
    day-on-day change in cumulative cases and deaths per capita.

    Parameters
    ----------
    country : str
        Value matched against the ``location`` column; also used in the file name and
        as the suffix of the returned column names.
    columns_of_interest : list of str
        Columns to keep in the result (original or derived).
    start, end : str or datetime-like, optional
        Inclusive bounds of the date window.

    Returns
    -------
    pandas.DataFrame
        Date-indexed frame containing `columns_of_interest`, each renamed to
        ``"<column>_<country>"``. The first row of each differenced column is NaN.
    """
    data = pd.read_csv(f"owid-covid-data_{country}.csv", index_col = "date")
    data.index = pd.DatetimeIndex(data.index)

    in_window = (data.index >= pd.Timestamp(start)) & (data.index <= pd.Timestamp(end))
    is_country = (data['location'] == country).to_numpy()

    # Copy so that the derived columns are written to an independent frame and not
    # to a filtered slice of `data`.
    country_data = data[in_window & is_country].copy()

    population = country_data['population']
    country_data['cases_per_capita'] = country_data['total_cases'] / population
    country_data['delta_cases_per_capita'] = country_data['cases_per_capita'].diff()
    country_data['deaths_per_capita'] = country_data['total_deaths'] / population
    country_data['delta_deaths_per_capita'] = country_data['deaths_per_capita'].diff()

    return country_data.loc[:, columns_of_interest].add_suffix(f'_{country}')

In [None]:
# Build one per-capita frame per comparison country, in a fixed order.
UK_data, Germany_data, France_data = (
    process_data(country_name, columns_of_interest)
    for country_name in ("United Kingdom", "Germany", "France")
)
country_data = [UK_data, Germany_data, France_data]

In [None]:
# Copy every column of every country frame into the modelling frame
# (assignment aligns each series on the date index).
for country_frame in country_data:
    for column_name, column_values in country_frame.items():
        cases_dataset[column_name] = column_values

**Non-Pharmaceutical Intervention Data**

In [None]:
# Read the OxCGRT policy-response export (path is relative to the working directory).
policy_data = pd.read_csv("OxCGRT_latest_responses.csv")

In [None]:
# Keep Ireland's policy records. The explicit copy makes the date-column assignments
# below write to an independent frame rather than to a slice of `policy_data`.
Irish_policies = policy_data[policy_data['CountryName'] == "Ireland"].copy()

# Parse the YYYYMMDD-encoded start/end dates into datetimes.
Irish_policies['StartDate'] = pd.to_datetime(Irish_policies['StartDate'].astype(str), format='%Y%m%d')
Irish_policies['EndDate'] = pd.to_datetime(Irish_policies['EndDate'].astype(str), format='%Y%m%d')

In [None]:
# Distinct policy types whose string form begins with "C".
closure_notices = list(
    filter(
        lambda policy_type: str(policy_type)[0] == "C",
        Irish_policies['PolicyType'].unique(),
    )
)

In [None]:
# Records whose policy type contains "C2".
# na=False treats missing policy types as non-matches; without it the mask would
# contain NaN and could not be used for boolean indexing.
workplace_closures = Irish_policies[Irish_policies['PolicyType'].str.contains("C2", na = False)]

In [None]:
# Initialise the lockdown indicator to 0 for every row.
cases_dataset['full_lockdown'] = 0

In [None]:
# Flag every day that falls inside a workplace-closure record with PolicyValue == 3.
# A boolean mask over the existing index is used instead of assigning one date at a
# time with .loc[date, ...]: the latter silently appends a new row whenever a policy
# date is not already in `cases_dataset`.
full_closures = workplace_closures[workplace_closures['PolicyValue'] == 3]

for start_date_closure, end_date_closure in zip(full_closures['StartDate'], full_closures['EndDate']):
    in_closure = (cases_dataset.index >= start_date_closure) & (cases_dataset.index <= end_date_closure)
    cases_dataset.loc[in_closure, "full_lockdown"] = 1


In [None]:
# Lockdown indicator shifted down by 30 rows; the leading gap is filled with 0.
lockdown_lag_30 = cases_dataset['full_lockdown'].shift(periods = 30)
cases_dataset['full_lockdown.l30'] = lockdown_lag_30.fillna(value = 0)

In [None]:
# Lockdown indicator shifted down by 45 rows; the leading gap is filled with 0.
lockdown_lag_45 = cases_dataset['full_lockdown'].shift(periods = 45)
cases_dataset['full_lockdown.l45'] = lockdown_lag_45.fillna(value = 0)

**Weather Data**

In [None]:
# Read the weather observations (path is relative to the working directory).
weather = pd.read_csv("Weather.csv")

In [None]:
# Convert the `date` column to datetimes.
weather['date'] = pd.DatetimeIndex(weather['date'] )

In [None]:
# Index the weather frame by date.
weather = weather.set_index("date")

In [None]:
# Display the weather frame for inspection.
weather

In [None]:
# Restrict the weather observations to 2020-03-01 .. 2021-11-01 (inclusive).
on_or_after_start = weather.index >= "2020-03-01"
on_or_before_end = weather.index <= "2021-11-01"
weather = weather[on_or_after_start & on_or_before_end]

In [None]:
# Display the weather frame again for inspection.
weather

In [None]:
# Append the weather columns to the modelling frame, aligning rows on the index.
cases_dataset = pd.concat(
    [cases_dataset, weather],
    axis = "columns",
)

**Day of the Week and Season Encoding**

In [None]:
def integer_encoding(df, series_name):
    """Return a copy of `df` with column `series_name` replaced by integer codes.

    Each distinct label is assigned a code 0, 1, 2, ... in order of first appearance
    in the column. Only `series_name` is changed, and `df` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the categorical column.
    series_name : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded column.
    """
    # Distinct labels, in order of first appearance
    unique_labels = df[series_name].unique()

    # label -> 0, 1, 2, ..., n - 1
    encoding_dict = {label: code for code, label in enumerate(unique_labels)}

    # Map the column directly instead of using DataFrame.replace: mapping gives an
    # integer column without relying on replace() downcasting an object column.
    encoded = df.copy()
    encoded[series_name] = df[series_name].map(encoding_dict)
    return encoded

In [None]:
# Weekday name of every row, taken from the index of the frame the column is written
# to. These were previously computed from `ireland_data.index` and assigned by
# position, which is only correct while both frames have identical indexes.
day_of_week = list(cases_dataset.index.day_name())
cases_dataset['day_of_the_week'] = day_of_week

# Replace the weekday names with integer codes (numbered by first appearance).
cases_dataset = integer_encoding(cases_dataset, 'day_of_the_week')

In [None]:
def season_of_date(date):
    """Return the season name for `date`.

    Boundaries (inclusive, the same in every year):
      * spring: 21 March     - 20 June
      * summer: 21 June      - 22 September
      * autumn: 23 September - 20 December
      * winter: every other day

    Parameters
    ----------
    date : datetime-like
        Any object exposing ``month`` and ``day`` attributes (e.g. ``pandas.Timestamp``).

    Returns
    -------
    str
        One of ``'spring'``, ``'summer'``, ``'autumn'``, ``'winter'``.
    """
    # Compare (month, day) tuples instead of building three date ranges per call from
    # day-first date strings. This also classifies timestamps with a time-of-day
    # component by their calendar day.
    month_day = (date.month, date.day)
    if (3, 21) <= month_day <= (6, 20):
        return 'spring'
    if (6, 21) <= month_day <= (9, 22):
        return 'summer'
    if (9, 23) <= month_day <= (12, 20):
        return 'autumn'
    return 'winter'

In [None]:
# Label every row with its season by applying season_of_date to each index value.
cases_dataset['season'] = cases_dataset.index.map(season_of_date)

In [None]:
# Pass the `season` column through integer_encoding and keep the returned frame.
cases_dataset = integer_encoding(cases_dataset, "season")

In [None]:
# Replace every remaining missing value in the modelling frame with 0.
cases_dataset = cases_dataset.fillna(0)

In [None]:
# Linear time trend: 1 for the first row up to len(cases_dataset) for the last.
n_rows = len(cases_dataset)
cases_dataset['trend'] = np.arange(n_rows) + 1

In [None]:
# Keep an independent copy of the case-delta series.
original_cases = cases_dataset['delta_cases_per_capita'].copy()

In [None]:
# Write the assembled frame (including its index) to CSV in the working directory.
cases_dataset.to_csv("Combined_Dataset.csv")