# Predicting COVID-19 cases in Ireland

## Import the Libraries

In [None]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import r2_score
from sklearn.neural_network import MLPRegressor
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

### Import the Datasets
- Data for COVID cases in the Republic of Ireland was sourced from Our World in Data
- Data for the prevalence of COVID-19 internet searches was sourced from Google Trends
- Data for COVID cases in Northern Ireland was sourced from the Department of Health for Northern Ireland

In [None]:
# Import Republic of Ireland and Google Trends data
roi_data = pd.read_excel("preparedData.xlsx")

In [None]:
# Import Northern Ireland data
#ni_data = pd.read_excel("ni_smoothed.xlsx")

In [None]:
# Merge the datasets
#data = pd.merge(roi_data, ni_data, left_on='date', right_on='date', how='left')

In [None]:
data = roi_data.copy(deep=True)

In [None]:
data.fillna(0, inplace=True)

In [None]:
#data['new_cases_smoothed'] = data['new_cases_smoothed']+data['newCasesByPublishDateSmoothed']

In [None]:
data.head(5)

In [None]:
data['date'] = pd.to_datetime(data['date'])
data.sort_values(by='date', ascending=True, inplace=True)
data = data.set_index('date')

In [None]:
# Columns removed before modelling: testing/vaccination statistics, static
# country-level indicators and the additional search-term series.
unused_columns = [
    'tests_per_case', 'new_cases', 'new_tests', 'reproduction_rate',
    'positive_rate', 'total_vaccinations', 'stringency_index',
    'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
    'new_vaccinations', 'population',
    'population_density', 'median_age', 'aged_65_older', 'aged_70_older',
    'gdp_per_capita', 'extreme_poverty', 'cardiovasc_death_rate',
    'diabetes_prevalence', 'female_smokers', 'male_smokers',
    'life_expectancy', 'human_development_index',
    'COVID-19 testing: (Ireland)', 'COVID-19 rapid antigen test: (Ireland)',
    'Health Service Executive: (Ireland)', 'Vaccination: (Ireland)',
    'book covid test: (Ireland)_x', 'how many covid cases today: (Ireland)',
    'pcr covid test: (Ireland)', 'close contact covid: (Ireland)',
    'book a covid test: (Ireland)', 'vaccination centre: (Ireland)',
    'pharmacy near me: (Ireland)',
    'Treatment and management of COVID-19: (Ireland)',
    'Hand sanitizer: (Ireland)', 'Face mask: (Ireland)',
    'book covid test: (Ireland)_y', 'covid test dublin: (Ireland)',
    'covid test centre: (Ireland)', 'hse covid vaccine: (Ireland)',
    'hse vaccine portal: (Ireland)', 'hse portal vaccine: (Ireland)',
    'pcr test hse: (Ireland)', 'hse covid test: (Ireland)',
    'hse vaccine registration: (Ireland)',
    'how long will it take to vaccinate ireland: (Ireland)',
]
data = data.drop(columns=unused_columns)

In [None]:
data.describe().transpose()

In [None]:
# Take a copy of dataframe without NaN values
df = data.dropna()

In [None]:
df.tail()

In [None]:
df.loc[:,'ncs'] = df.loc[:,'new_cases_smoothed'].shift(5)
df.loc[:,'ncs_diff'] = df.loc[:,'ncs'].diff()
df.loc[:,'ncs2'] = df.loc[:,'ncs'].shift()
df.loc[:,'ncs_diff2'] = df.loc[:,'ncs2'].diff()
df.loc[:,'ncs3'] = df.loc[:,'ncs2'].shift()
df.loc[:,'ncs_diff3'] = df.loc[:,'ncs3'].diff()
df.loc[:,'ncs4'] = df.loc[:,'ncs3'].shift()
df.loc[:,'ncs_diff4'] = df.loc[:,'ncs4'].diff()
df.loc[:,'ncs5'] = df.loc[:,'ncs4'].shift()
df.loc[:,'ncs_diff5'] = df.loc[:,'ncs5'].diff()
df.loc[:,'ncs6'] = df.loc[:,'ncs5'].shift()
df.loc[:,'ncs_diff6'] = df.loc[:,'ncs6'].diff()
df.loc[:,'ncs7'] = df.loc[:,'ncs6'].shift()
df.loc[:,'ncs_diff7'] = df.loc[:,'ncs7'].diff()

In [None]:
df.head(5)

In [None]:
df.tail(5)

# USE THIS

df.loc[:,'google'] = df.loc[:,'covid: (Ireland)'].shift(5)
df.loc[:,'google_diff'] = df.loc[:,'google'].diff()
df.loc[:,'google2'] = df.loc[:,'google'].shift()
df.loc[:,'google_diff2'] = df.loc[:,'google2'].diff()
df.loc[:,'google3'] = df.loc[:,'google2'].shift()
df.loc[:,'google_diff3'] = df.loc[:,'google3'].diff()
df.loc[:,'google4'] = df.loc[:,'google3'].shift()
df.loc[:,'google_diff4'] = df.loc[:,'google4'].diff()
df.loc[:,'google5'] = df.loc[:,'google4'].shift()
df.loc[:,'google_diff5'] = df.loc[:,'google5'].diff()
df.loc[:,'google6'] = df.loc[:,'google5'].shift()
df.loc[:,'google_diff6'] = df.loc[:,'google6'].diff()
df.loc[:,'google7'] = df.loc[:,'google6'].shift()
df.loc[:,'google_diff7'] = df.loc[:,'google7'].diff()

In [None]:
df = df.dropna()

In [None]:
#df = df.fillna(method='ffill').fillna(method='bfill')

In [None]:
df.head(5)

In [None]:

df.tail(5)

In [None]:
df.shape


In [None]:
n = len(df)

In [None]:
df.columns

In [None]:
variables = df.columns.drop(['covid: (Ireland)', 'ncs', 'ncs_diff', 'ncs2',
       'ncs_diff2', 'ncs3', 'ncs_diff3', 'ncs4', 'ncs_diff4', 'ncs5',
       'ncs_diff5', 'ncs6', 'ncs_diff6', 'ncs7', 'ncs_diff7'])

In [None]:
X_train = df.iloc[0:int(n*0.7)].drop(['new_cases_smoothed'], axis=1)
y_train = df.iloc[0:int(n*0.7)].drop(columns=variables, axis=1)

In [None]:
X_test = df.iloc[int(n*0.7):int(n*0.9)].drop(['new_cases_smoothed'],axis=1)
y_test = df.iloc[int(n*0.7):int(n*0.9)].drop(columns=variables, axis=1)

In [None]:
X_val = df.iloc[int(n*0.9):].drop(['new_cases_smoothed'], axis=1)
y_val = df.iloc[int(n*0.9):].drop(columns=variables, axis=1)

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
X_test.head()

In [None]:
y_test.head()

In [None]:
X_val.head()

In [None]:
y_val.head()

In [None]:
model = MLPRegressor()

In [None]:
param_search = {
    "hidden_layer_sizes": [(1,),(50,),(100,),(150,),(200,),(250,),(300,),(350,)],
    "activation": ["identity", "logistic", "tanh", "relu"],
    "solver": ["lbfgs", "sgd", "adam"],
    "alpha": [0.00005,0.0005, 0.005],
    "learning_rate": ['constant', 'invscaling', 'adaptive']
}

In [None]:
tsvc = TimeSeriesSplit(n_splits=5)
gsearch = GridSearchCV(estimator=model, cv=tsvc, param_grid=param_search, scoring='r2')
gsearch.fit(X_train, y_train)
best_score = gsearch.best_score_
best_model = gsearch.best_estimator_

In [None]:
print(best_model)

In [None]:
print(best_score)

In [None]:
y_pred = best_model.predict(X_test)

print(r2_score(y_test, y_pred))

In [None]:
y_val_pred = best_model.predict(X_val)

In [None]:
print(r2_score(y_val, y_val_pred))

In [None]:
y_test

In [None]:
y_test['predictions'] = y_pred.tolist()


In [None]:
y_test.reset_index(inplace=True)
y_test.plot(x='date', y=['new_cases_smoothed','predictions'], linestyle="dashed", figsize=(10,10))


In [None]:

y_val['predictions'] = y_val_pred.tolist()

In [None]:
y_val.reset_index(inplace=True)

In [None]:
y_val.plot(x='date', y=['new_cases_smoothed', 'predictions'], linestyle="dashed", figsize=(10,10))

In [None]:
data.tail(25)

In [None]:
owid = pd.read_csv("owid-covid-data(1).csv")

In [None]:
trend = pd.read_csv("multiTimeline(4).csv", skiprows=2)

In [None]:
owid.tail()

In [None]:
trend.head()

In [None]:
combi = pd.merge(owid, trend, left_on='date', right_on='Day', how='left')

In [None]:
combi['new_cases_smoothed'].fillna(method='ffill', inplace=True)

In [None]:
combi.tail()

In [None]:
combi.drop(columns=['Day'], inplace=True)

In [None]:
combi.loc[:,'ncs'] = combi.loc[:,'new_cases_smoothed'].shift(5)
combi.loc[:,'ncs_diff'] = combi.loc[:,'ncs'].diff()
combi.loc[:,'ncs2'] = combi.loc[:,'ncs'].shift()
combi.loc[:,'ncs_diff2'] = combi.loc[:,'ncs2'].diff()
combi.loc[:,'ncs3'] = combi.loc[:,'ncs2'].shift()
combi.loc[:,'ncs_diff3'] = combi.loc[:,'ncs3'].diff()
combi.loc[:,'ncs4'] = combi.loc[:,'ncs3'].shift()
combi.loc[:,'ncs_diff4'] = combi.loc[:,'ncs4'].diff()
combi.loc[:,'ncs5'] = combi.loc[:,'ncs4'].shift()
combi.loc[:,'ncs_diff5'] = combi.loc[:,'ncs5'].diff()
combi.loc[:,'ncs6'] = combi.loc[:,'ncs5'].shift()
combi.loc[:,'ncs_diff6'] = combi.loc[:,'ncs6'].diff()
combi.loc[:,'ncs7'] = combi.loc[:,'ncs6'].shift()
combi.loc[:,'ncs_diff7'] = combi.loc[:,'ncs7'].diff()

In [None]:
combi.loc[:,'google'] = combi.loc[:,'COVID: (Ireland)'].shift(5)
combi.loc[:,'google_diff'] = combi.loc[:,'google'].diff()
combi.loc[:,'google2'] = combi.loc[:,'google'].shift()
combi.loc[:,'google_diff2'] = combi.loc[:,'google2'].diff()
combi.loc[:,'google3'] = combi.loc[:,'google2'].shift()
combi.loc[:,'google_diff3'] = combi.loc[:,'google3'].diff()
combi.loc[:,'google4'] = combi.loc[:,'google3'].shift()
combi.loc[:,'google_diff4'] = combi.loc[:,'google4'].diff()
combi.loc[:,'google5'] = combi.loc[:,'google4'].shift()
combi.loc[:,'google_diff5'] = combi.loc[:,'google5'].diff()
combi.loc[:,'google6'] = combi.loc[:,'google5'].shift()
combi.loc[:,'google_diff6'] = combi.loc[:,'google6'].diff()
combi.loc[:,'google7'] = combi.loc[:,'google6'].shift()
combi.loc[:,'google_diff7'] = combi.loc[:,'google7'].diff()

In [None]:
combi.head()

In [None]:
combi.tail(30)

In [None]:
combi['date'] = pd.to_datetime(combi['date'])

In [None]:
dates = combi.iloc[729:743,0]

In [None]:
x_combi_slice = combi.iloc[717:731,:]

In [None]:
x_combi_slice.drop(columns=['new_cases_smoothed','COVID: (Ireland)'], inplace=True)

In [None]:
x_combi_slice.set_index('date', inplace=True)

In [None]:
x_combi_slice

In [None]:
x_combi_slice.fillna(method='ffill', inplace=True)

In [None]:
y_combi_slice = combi.iloc[729:743,1]

In [None]:
y_combi_slice.shape

In [None]:
y_combi_slice = y_combi_slice.to_frame()


In [None]:
y_combi_slice_pred = best_model.predict(x_combi_slice)

In [None]:
print(r2_score(y_combi_slice, y_combi_slice_pred))

In [None]:
y_combi_slice['predictions'] = y_combi_slice_pred.tolist()

In [None]:
y_combi_slice['dates'] = dates.tolist()

In [None]:
y_combi_slice.reset_index(inplace=True)
y_combi_slice.plot(x='dates', y=['new_cases_smoothed','predictions'], linestyle="dashed", figsize=(10,10))