In [1]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('NI.csv', sep=",")

In [3]:
data.head()

Unnamed: 0,date,areaName,areaCode,newCasesByPublishDate,cumCasesByPublishDate,newDeaths28DaysByDeathDate,cumDeaths28DaysByDeathDate,newAntibodyTestsByPublishDate,newLFDTestsBySpecimenDate,newPCRTestsByPublishDate,newPCRTestsByPublishDateRollingSum,hospitalCases,newAdmissions,newPeopleVaccinatedCompleteByVaccinationDate,newVaccinesGivenByPublishDate,VaccineRegisterPopulationByVaccinationDate
0,10/03/2022,Northern Ireland,N92000002,2602.0,646794.0,,,,,,,,,,,
1,09/03/2022,Northern Ireland,N92000002,2683.0,644192.0,2.0,3244.0,,0.0,20295.0,118210.0,483.0,21.0,,1317.0,
2,08/03/2022,Northern Ireland,N92000002,2669.0,641509.0,4.0,3242.0,,0.0,19739.0,117337.0,506.0,37.0,,1037.0,
3,07/03/2022,Northern Ireland,N92000002,5769.0,638840.0,4.0,3238.0,,0.0,19464.0,115735.0,518.0,29.0,,1078.0,
4,06/03/2022,Northern Ireland,N92000002,0.0,633071.0,2.0,3234.0,,0.0,13711.0,117006.0,544.0,25.0,,903.0,


In [4]:
data.drop(columns=['areaName', 'areaCode'], inplace=True)

In [5]:
data['date'] = pd.to_datetime(data['date'])

In [6]:
data.sort_values(by='date', ascending=True, inplace=True)

In [7]:
data = data.set_index('date')

In [8]:
data.head()

Unnamed: 0_level_0,newCasesByPublishDate,cumCasesByPublishDate,newDeaths28DaysByDeathDate,cumDeaths28DaysByDeathDate,newAntibodyTestsByPublishDate,newLFDTestsBySpecimenDate,newPCRTestsByPublishDate,newPCRTestsByPublishDateRollingSum,hospitalCases,newAdmissions,newPeopleVaccinatedCompleteByVaccinationDate,newVaccinesGivenByPublishDate,VaccineRegisterPopulationByVaccinationDate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-01-03,,,,,,,,,11.0,,,,
2020-01-04,103.0,689.0,4.0,45.0,,0.0,,,280.0,45.0,,,
2020-01-05,87.0,3623.0,12.0,387.0,,0.0,,,269.0,19.0,,,
2020-01-06,12.0,4728.0,1.0,532.0,,0.0,,,127.0,3.0,,,
2020-01-07,1.0,5761.0,1.0,552.0,,0.0,,,32.0,0.0,,,


In [9]:
data.shape

(740, 13)

In [10]:
n = len(data)

In [11]:
data.loc[:,'newCasesByPublishDate'] = data.loc[:,'newCasesByPublishDate'].shift(periods=7)

In [12]:
data = data.fillna(0)

In [22]:
data.drop(columns=['cumCasesByPublishDate', 'newDeaths28DaysByDeathDate',
       'cumDeaths28DaysByDeathDate', 'newAntibodyTestsByPublishDate',
       'newLFDTestsBySpecimenDate', 'newPCRTestsByPublishDate',
       'newPCRTestsByPublishDateRollingSum', 'hospitalCases', 'newAdmissions'], inplace=True)

In [23]:
variables = data.columns.drop('newCasesByPublishDate')

In [24]:
X_train = data.iloc[0:int(n*0.7)].drop(['newCasesByPublishDate'], axis=1)
y_train = data.iloc[0:int(n*0.7)].drop(columns=variables, axis=1)

In [25]:
X_test = data.iloc[int(n*0.9):].drop(['newCasesByPublishDate'], axis=1)
y_test = data.iloc[int(n*0.9):].drop(columns=variables, axis=1)

In [26]:
X_val = data.iloc[int(n*0.7):int(n*0.9)].drop(['newCasesByPublishDate'], axis=1)
y_val = data.iloc[int(n*0.7):int(n*0.9)].drop(columns=variables, axis=1)

In [27]:
models = []
models.append(('MN',MLPRegressor(solver='lbfgs')))

In [28]:
results = []
names = []
for name, model in models:
    tsvc = TimeSeriesSplit(n_splits=2)
    cv_results = cross_val_score(model, X_train, y_train, cv=tsvc, scoring='r2')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

MN: -1454.354453 (767.016072)


In [None]:
plt.boxplot(results, labels=names)
plt.title('Algorithm Comparison')
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
model = MLPRegressor()

In [None]:
param_search = {
    "hidden_layer_sizes": [(1,),(50,),(100,),(150,),(200,)],
    "activation": ["identity", "logistic", "tanh", "relu"],
    "solver": ["lbfgs", "sgd", "adam"],
    "alpha": [0.00005,0.0005],
    "learning_rate": ['constant', 'invscaling', 'adaptive']
}

In [None]:
tsvc = TimeSeriesSplit(n_splits=2)
gsearch = GridSearchCV(estimator=model, cv=tsvc, param_grid=param_search, scoring='r2')
gsearch.fit(X_train, y_train)
best_score = gsearch.best_score_
best_model = gsearch.best_estimator_

In [None]:
print(best_model)

In [None]:
print(best_score)

In [None]:
from sklearn.metrics import r2_score

y_pred = best_model.predict(X_test)

print(r2_score(y_test, y_pred))