In [1]:
from sklearn.svm import SVC, LinearSVC
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pyreadr
from IPython.display import display
from coronanet.data import Coronanet
from coronanet.coronanet_pp import Prepro_coronanet
import statsmodels.api as sm 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from mlxtend.plotting import plot_decision_regions
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

# Import table

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/TristanBOOK/coronanet/master/coronanet/data/LMA_Data_10_semaines_and_Pol_fam.csv')


In [4]:
df.head()

Unnamed: 0,ID,Date,PolicyFamily,PolicyType,value,Cases,Cases_New,Type,Source,J-28,J-21,J-14,J-7,J0,J7,J14,J21,J28,J35,J42
0,AD,2020-02-27,E,E3_diff,141399008,0.0,0.0,Deaths,JHU,0,0,0,0,0,0,0,0,3,15,25
1,AD,2020-02-28,E,E3_diff,-141399008,0.0,0.0,Deaths,JHU,0,0,0,0,0,0,0,0,3,16,26
2,AD,2020-03-02,H,H2_diff,1,0.0,0.0,Deaths,JHU,0,0,0,0,0,0,0,1,8,21,29
3,AD,2020-03-11,H,H6_diff,2,0.0,0.0,Deaths,JHU,0,0,0,0,0,0,1,14,23,33,37
4,AD,2020-03-13,C,C3_diff,1,0.0,0.0,Deaths,JHU,0,0,0,0,0,0,3,16,26,35,40


# Policy preprocessing

In [5]:
def handle_policy(x):
    if 'E' in x:
        return 0
    if 'H' in x:
        return 1
    if 'C' in x:
        return 2

In [6]:
df['PolicyFamily_'] = df['PolicyType'].apply(handle_policy)

In [7]:
df.columns

Index(['ID', 'Date', 'PolicyFamily', 'PolicyType', 'value', 'Cases',
       'Cases_New', 'Type', 'Source', 'J-28', 'J-21', 'J-14', 'J-7', 'J0',
       'J7', 'J14', 'J21', 'J28', 'J35', 'J42', 'PolicyFamily_'],
      dtype='object')

In [86]:
df1 = df.copy()

In [87]:
df1 = pd.get_dummies(df, columns=["PolicyFamily_"])

In [88]:
df1.tail()

Unnamed: 0,ID,Date,PolicyFamily,PolicyType,value,Cases,Cases_New,Type,Source,J-28,J-21,J-14,J-7,J0,J7,J14,J21,J28,J35,J42,PolicyFamily__0,PolicyFamily__1,PolicyFamily__2
14577,ZW,2020-10-26,E,E4_diff,3800000,242.0,5.0,Deaths,JHU,228,228,230,232,242,245,254,257,273,276,-10000,1,0,0
14578,ZW,2020-10-26,H,H6_diff,2,242.0,5.0,Deaths,JHU,228,228,230,232,242,245,254,257,273,276,-10000,0,1,0
14579,ZW,2020-10-27,E,E4_diff,-3800000,242.0,0.0,Deaths,JHU,228,229,230,233,242,246,255,260,274,277,-10000,1,0,0
14580,ZW,2020-11-09,C,C1_diff,-1,254.0,1.0,Deaths,JHU,230,232,242,245,254,257,273,276,-10000,-10000,-10000,0,0,1
14581,ZW,2020-11-20,C,C1_diff,1,265.0,0.0,Deaths,JHU,236,242,250,257,265,275,-10000,-10000,-10000,-10000,-10000,0,0,1


In [89]:
df1 = df1[df1['J-7']>0]
df1 = df1[df1['J7']>0]
df1 = df1[df1['J-14']>0]
df1 = df1[df1['J14']>0]
df1 = df1[df1['J-21']>0]
df1 = df1[df1['J21']>0]

In [138]:
df2 = df1.copy()

In [139]:
df2.head()

Unnamed: 0,ID,Date,PolicyFamily,PolicyType,value,Cases,Cases_New,Type,Source,J-28,J-21,J-14,J-7,J0,J7,J14,J21,J28,J35,J42,PolicyFamily__0,PolicyFamily__1,PolicyFamily__2
15,AD,2020-04-18,H,H2_diff,2,35.0,0.0,Deaths,JHU,0,3,17,26,35,40,44,48,51,51,51,0,1,0
16,AD,2020-04-20,C,C2_diff,-1,37.0,1.0,Deaths,JHU,1,8,21,29,37,40,45,48,51,51,51,0,0,1
17,AD,2020-05-09,E,E2_diff,2,48.0,1.0,Deaths,JHU,26,35,40,44,48,51,51,51,51,51,52,1,0,0
18,AD,2020-05-11,C,C8_diff,-1,48.0,0.0,Deaths,JHU,29,37,40,45,48,51,51,51,51,51,52,0,0,1
19,AD,2020-05-27,E,E3_diff,137063808,51.0,0.0,Deaths,JHU,42,46,49,51,51,51,51,52,52,52,52,1,0,0


# Baseline model

## Features selection

In [154]:
df2['dJ_21'] =(df2['J-21']/df2['Cases'])**(1/20)
df2['dJ_14'] =(df2['Cases']/df2['J-14'])**(1/13)
df2['dJ14']=(df2['J14']/df2['Cases'])**(1/13)
df2['dJ7']=(df2['J7']/df2['Cases'])**(1/6)
df2['dJ21']=(df2['J21']/df2['Cases'])**(1/20)

In [161]:
df2.head()

Unnamed: 0,ID,Date,PolicyFamily,PolicyType,value,Cases,Cases_New,Type,Source,J-28,J-21,J-14,J-7,J0,J7,J14,J21,J28,J35,J42,PolicyFamily__0,PolicyFamily__1,PolicyFamily__2,dJ_21,dJ_14,dJ14,dJ7,dJ21
15,AD,2020-04-18,H,H2_diff,2,35.0,0.0,Deaths,JHU,0,3,17,26,35,40,44,48,51,51,51,0,1,0,0.884408,1.057121,1.017759,1.022505,1.015918
16,AD,2020-04-20,C,C2_diff,-1,37.0,1.0,Deaths,JHU,1,8,21,29,37,40,45,48,51,51,51,0,0,1,0.926285,1.044532,1.015171,1.013078,1.013099
17,AD,2020-05-09,E,E2_diff,2,48.0,1.0,Deaths,JHU,26,35,40,44,48,51,51,51,51,51,52,1,0,0,0.984331,1.014124,1.004674,1.010155,1.003036
18,AD,2020-05-11,C,C8_diff,-1,48.0,0.0,Deaths,JHU,29,37,40,45,48,51,51,51,51,51,52,0,0,1,0.98707,1.014124,1.004674,1.010155,1.003036
19,AD,2020-05-27,E,E3_diff,137063808,51.0,0.0,Deaths,JHU,42,46,49,51,51,51,51,52,52,52,52,1,0,0,0.994854,1.003082,1.0,1.0,1.000971


In [172]:
# dJ_14: moyenne geo du nombre de morts entre jour d'application de la la loi et 14 jours avant
X = df2[['ID','PolicyFamily__0','PolicyFamily__1', 'PolicyFamily__2', 'dJ_14']]

# dJ14: moyenne geo du nombre de morts entre jour d'application de la la loi et 14 jours après
y = df2['dJ14']

In [156]:
X.shape

(4436, 4)

## train, test split

In [151]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)

## Linear Regression 

In [152]:
linereg = LinearRegression()
linereg.fit(X_train, y_train)
linereg.score(X_test,y_test)

0.6039395835231646

In [166]:
# dJ_14: moyenne geo du nombre de morts entre jour d'application de la la loi et 14 jours avant
X = df2[['dJ_14']]

# dJ14: moyenne geo du nombre de morts entre jour d'application de la la loi et 14 jours après
y = df2['dJ14']

In [167]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)

In [168]:
linereg = LinearRegression()
linereg.fit(X_train, y_train)
linereg.score(X_test,y_test)

0.6039395835231646

# Model with ID 

0.9909028103837464