In [None]:
import pandas as pd
import numpy as np
import calendar
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from statsmodels.tsa.stattools import adfuller

In [None]:
# Bounds of the study window; the cells below keep rows with start_date <= date <= end_date.
start_date = "2021-11-01"
end_date = "2022-03-01"

**Cases and Deaths Data**

In [None]:
# Load the OWID extract and keep only the study window. The index holds the raw
# "date" strings at this point, so the bounds are compared as text
# (assumes ISO YYYY-MM-DD dates, for which text order equals date order - TODO confirm).
data = pd.read_csv("owid-covid-data_Ireland.csv", index_col = "date")
data = data[(data.index >= start_date) & (data.index <= end_date)]

# Take an explicit copy: the columns added below (and in later cells) would
# otherwise be written onto a filtered slice of `data`, which pandas reports as
# a SettingWithCopyWarning that the global warnings filter is hiding.
ireland_data = data[data['location'] == "Ireland"].copy()
ireland_data.index = pd.DatetimeIndex(ireland_data.index)

# Day-over-day change of the cumulative totals divided by population.
# The first row has no predecessor, so its change is set to 0.
ireland_data['delta_cases_per_capita'] = ireland_data['total_cases'].divide(ireland_data['population']).diff().fillna(0)
ireland_data['delta_deaths_per_capita'] = ireland_data['total_deaths'].divide(ireland_data['population']).diff().fillna(0)

In [None]:
# Plot the day-over-day change of the daily per-capita case increments
# (i.e. a second difference of the cumulative series); the first value is set to 0.
ireland_data['delta_cases_per_capita'].diff().fillna(0).plot()

In [None]:
# Overlay the daily per-capita case increments on a straight line through the
# origin, to judge by eye whether the series carries an overall trend.
fig, ax = plt.subplots(ncols = 1, figsize = (17, 6))
ax.set_ylabel(r"$\Delta$ cases per capita", fontsize = 14)

# Integer time index 0..n-1, stored on the frame itself.
ireland_data['trend'] = np.arange(0, len(ireland_data))

# Illustrative line: the time index scaled by (mean daily increment / 300).
(ireland_data['trend'] * ireland_data['delta_cases_per_capita'].mean() / 300).plot(
    ax = ax, color = 'lightcoral', label = 'Plausible trend'
)
ireland_data['delta_cases_per_capita'].plot(
    ax = ax, color = 'mediumseagreen', label = r'$\Delta C_t$'
)

# Decorations are applied after plotting so they are not overridden by the plot calls.
ax.set_title("Plausible overall trend in new cases")
ax.set_xlabel("Date", fontsize = 14)
ax.grid(linestyle = '--', alpha = 0.5)
ax.legend(loc = 2, fontsize = 14)

In [None]:
# Show the rows whose daily per-capita case change is negative.
ireland_data[ireland_data['delta_cases_per_capita']< 0]

**Building the Final Dataset**

In [None]:
# Start the modelling frame from the daily deaths-per-capita series, then add
# the daily cases-per-capita series alongside it.
cases_dataset = ireland_data['delta_deaths_per_capita'].to_frame()
cases_dataset['delta_cases_per_capita'] = ireland_data['delta_cases_per_capita']

# One-row lags of both series; the first row has no predecessor and is set to 0.
cases_dataset['delta_deaths_per_capita.l1'] = cases_dataset['delta_deaths_per_capita'].shift(periods = 1).fillna(0)
cases_dataset['delta_cases_per_capita.l1'] = cases_dataset['delta_cases_per_capita'].shift(periods = 1).fillna(0)

# Day-over-day change in the log of each cumulative vaccination count
# (first row set to 0), divided by population.
cases_dataset['log_new_vaccines_per_capita'] = (
    np.log(ireland_data['total_vaccinations']).diff().fillna(0) / ireland_data['population']
)
cases_dataset['log_new_people_vaccinated_per_capita'] = (
    np.log(ireland_data['people_vaccinated']).diff().fillna(0) / ireland_data['population']
)

**Country by Country Data**

In [None]:
# Columns kept from each country's frame by process_data.
columns_of_interest = ['delta_cases_per_capita', 'delta_deaths_per_capita']

In [None]:
def process_data(country, columns_of_interest):
    """Load one country's OWID extract and return selected per-capita columns.

    Reads ``owid-covid-data_{country}.csv``, keeps the rows whose date lies
    between the module-level ``start_date`` and ``end_date`` (inclusive) and
    whose ``location`` equals ``country``, then derives cumulative and
    day-over-day per-capita cases and deaths.

    Parameters
    ----------
    country : str
        Used for the file name, the ``location`` filter and the column suffix.
    columns_of_interest : list of str
        Column labels to keep in the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by a ``DatetimeIndex``; every column name carries the suffix
        ``_{country}``. The delta columns are not filled, so their first row
        is NaN.
    """
    raw = pd.read_csv(f"owid-covid-data_{country}.csv", index_col = "date")

    # The window filter runs on the raw date strings, before the index is
    # converted to timestamps.
    in_window = (raw.index >= start_date) & (raw.index <= end_date)
    windowed = raw[in_window]
    windowed.index = pd.DatetimeIndex(windowed.index)

    selected = windowed[windowed['location'] == country]
    cases_per_capita = selected['total_cases'] / selected['population']
    deaths_per_capita = selected['total_deaths'] / selected['population']

    enriched = selected.assign(
        cases_per_capita = cases_per_capita,
        delta_cases_per_capita = cases_per_capita.diff(),
        deaths_per_capita = deaths_per_capita,
        delta_deaths_per_capita = deaths_per_capita.diff(),
    )
    return enriched.loc[:, columns_of_interest].add_suffix(f'_{country}')

In [None]:
# Build the per-country frames in the same order as before: UK, Germany, France.
UK_data, Germany_data, France_data = (
    process_data(name, columns_of_interest)
    for name in ("United Kingdom", "Germany", "France")
)
country_data = [UK_data, Germany_data, France_data]

In [None]:
# Copy every per-country column into the modelling frame; each assignment
# matches rows by index label.
for country in country_data:
    for column in country.columns:
        cases_dataset[column] = country[column]

**Non-Pharmaceutical Intervention Data**

In [None]:
# Load the policy-response records (all countries); filtered to Ireland in the next cell.
policy_data = pd.read_csv("OxCGRT_latest_responses.csv")

In [None]:
# Keep only the Irish policy records. Copy explicitly so the date conversions
# below write to an independent frame instead of a filtered slice of
# `policy_data` (which triggers a SettingWithCopyWarning hidden by the global
# warnings filter).
Irish_policies = policy_data[policy_data['CountryName'] == "Ireland"].copy()

# Convert both bounds from YYYYMMDD values to timestamps.
# NOTE(review): if a column contains gaps pandas may have read it as float, in
# which case astype(str) yields text such as "20211101.0" - confirm that this
# still matches the format on the pandas version in use.
Irish_policies['StartDate'] = pd.to_datetime(Irish_policies['StartDate'].astype(str), format='%Y%m%d')
Irish_policies['EndDate'] = pd.to_datetime(Irish_policies['EndDate'].astype(str), format='%Y%m%d')

In [None]:
# Distinct policy-type labels whose text form starts with "C"
# (str() lets non-string entries pass through the check).
closure_notices = [i for i in Irish_policies['PolicyType'].unique() if str(i)[0] == "C"]

In [None]:
# Rows whose policy type contains the literal text "C2".
# na=False makes missing policy types count as non-matching; without it the
# mask would contain NaN and boolean indexing would raise. regex=False states
# that "C2" is a plain substring, not a pattern.
workplace_closures = Irish_policies[Irish_policies['PolicyType'].str.contains("C2", regex = False, na = False)]

In [None]:
# Indicator column, 0 everywhere until the next cell sets it to 1 on closure days.
cases_dataset['full_lockdown'] = 0

In [None]:
# Flag every day already present in the modelling frame that falls inside a
# workplace-closure record with PolicyValue 3.
#
# Rows are selected with a boolean mask over the existing index. Assigning
# through `.loc[<date>, "full_lockdown"]` one day at a time would append a new,
# otherwise-empty row for every closure day that is not in the frame (pandas
# "setting with enlargement"), so closures outside the study window would grow
# the dataset.
for i in workplace_closures[workplace_closures['PolicyValue'] == 3].index:
    start_date_closure = workplace_closures.loc[i, "StartDate"]
    end_date_closure = workplace_closures.loc[i, "EndDate"]

    in_closure = cases_dataset.index >= start_date_closure
    # NOTE(review): a missing EndDate is treated as "still in force" - confirm
    # that this matches how the source marks open-ended policies.
    if pd.notna(end_date_closure):
        in_closure &= cases_dataset.index <= end_date_closure

    cases_dataset.loc[in_closure, "full_lockdown"] = 1


In [None]:
# Lockdown flag lagged by 30 rows; the first 30 rows have no history and are set to 0.
# shift() moves by row position, so this is a 30-day lag only if the index is
# daily, sorted and gap-free - TODO confirm.
cases_dataset['full_lockdown.l30'] = cases_dataset['full_lockdown'].shift(30).fillna(0)

In [None]:
# Lockdown flag lagged by 45 rows; the first 45 rows have no history and are set to 0.
# shift() moves by row position, so this is a 45-day lag only if the index is
# daily, sorted and gap-free - TODO confirm.
cases_dataset['full_lockdown.l45'] = cases_dataset['full_lockdown'].shift(45).fillna(0)

**Weather Data**

In [None]:
# Load the weather observations; the "date" column is parsed in the next cell.
weather = pd.read_csv("Weather.csv")

In [None]:
# Parse the "date" column into datetime values.
weather['date'] = pd.DatetimeIndex(weather['date'] )

In [None]:
# Make the parsed dates the index (in place). Re-running this cell on its own
# fails because the "date" column no longer exists afterwards.
weather.set_index("date", inplace=True)

In [None]:
# Display the weather frame for inspection.
weather

In [None]:
# Keep only the observations inside the study window (both bounds inclusive).
weather = weather[(weather.index >= start_date) & (weather.index <= end_date)]

In [None]:
# Append the weather columns side by side. With the default outer join the
# result contains the union of both indexes; dates missing on one side are NaN there.
cases_dataset = pd.concat([cases_dataset, weather], axis = 1)

**Day of the Week and Season Encoding**

In [None]:
def integer_encoding(df, series_name):
    """Return a copy of ``df`` with one categorical column replaced by integer codes.

    Labels are numbered 0, 1, 2, ... in order of first appearance in the
    column. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode.
    series_name : str
        Name of the column whose labels are replaced.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order, where
        ``series_name`` holds the integer codes.
    """
    # Unique labels, in order of first appearance.
    unique_labels = df[series_name].unique()

    # Translate each label into its position: 0, 1, 2, ..., n - 1.
    encoding_dict = {label: code for code, label in enumerate(unique_labels)}

    # Series.map builds the encoded column directly as integers. The previous
    # nested-dict DataFrame.replace relied on pandas silently downcasting an
    # object column to int, which is deprecated behaviour.
    encoded = df.copy()
    encoded[series_name] = df[series_name].map(encoding_dict)
    return encoded

In [None]:
# Label every row with the weekday name of its index date, then replace the
# names by integer codes via integer_encoding.
day_of_week = [
    calendar.day_name[stamp.weekday()]
    for stamp in pd.to_datetime(cases_dataset.index)
]
cases_dataset['day_of_the_week'] = day_of_week
cases_dataset = integer_encoding(df = cases_dataset, series_name = 'day_of_the_week')

In [None]:
def season_of_date(date):
    """Return the season name for a date.

    Boundaries (inclusive on both ends):
    spring 21 Mar - 20 Jun, summer 21 Jun - 22 Sep, autumn 23 Sep - 20 Dec;
    every other day is winter.

    Only the month and day of ``date`` are used, so a timestamp with a
    time-of-day component is classified by its calendar day. No date strings
    are parsed and no date ranges are built per call.

    Parameters
    ----------
    date : object with ``month`` and ``day`` attributes
        For example ``pandas.Timestamp``, ``datetime.datetime`` or
        ``datetime.date``.

    Returns
    -------
    str
        One of 'spring', 'summer', 'autumn', 'winter'.
    """
    # (month, day) tuples compare lexicographically, i.e. in calendar order.
    month_day = (date.month, date.day)
    if (3, 21) <= month_day <= (6, 20):
        return 'spring'
    if (6, 21) <= month_day <= (9, 22):
        return 'summer'
    if (9, 23) <= month_day <= (12, 20):
        return 'autumn'
    return 'winter'

In [None]:
# Derive the season of every row from its index value
# (the index is expected to hold datetime values).
cases_dataset['season'] = cases_dataset.index.map(season_of_date)

In [None]:
# Replace the season names by integer codes.
cases_dataset = integer_encoding(cases_dataset, "season")

In [None]:
# Replace every remaining missing value in the frame with 0 (in place).
cases_dataset.fillna(0, inplace = True)

In [None]:
# Linear time index 1..n, following the current row order of the frame.
cases_dataset['trend'] = np.arange(1, len(cases_dataset)+1 )

In [None]:
# Independent copy of the daily cases-per-capita column.
original_cases = cases_dataset['delta_cases_per_capita'].copy()

**Combining Everything**

In [None]:
# Keep the last 120 observations. `original_cases` is a Series (a single
# column), so `.iloc` takes exactly one positional indexer; a two-axis indexer
# such as `[-120:, :]` raises IndexingError ("Too many indexers").
combined_dataset = original_cases.iloc[-120:]

In [None]:
# Write the assembled frame, including its index, to CSV.
# NOTE(review): `combined_dataset` from the previous cell is not what gets
# written here - confirm that exporting the full `cases_dataset` is intended.
cases_dataset.to_csv("Combined_testing_Dataset.csv")