In [1]:
import sys 
sys.dont_write_bytecode = True # Stop creating __pycache__ folder
# The flag is set before the local `toolbox` import below so that it is
# already in effect when that module is loaded.

import numpy as np

# Project-local plotting helpers used by the model cells.
from toolbox import scatter_plot
from toolbox import features_importance

# Shared scikit-learn building blocks: every model cell wraps its estimator in
# a Pipeline with a MinMaxScaler and tunes it with GridSearchCV.
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# Loading the data set

In [2]:
import pandas as pd

# Daily updated COVID database (Our World in Data), keyed by country code.
url = 'https://covid.ourworldindata.org/data/owid-covid-data.json'
json = pd.read_json(url)

# Select the 'data' entry of the 'CHE' column and turn its records into a
# table with one row per record.
data = json.loc['data', 'CHE']
df = pd.DataFrame(data)

# Show the most recent rows as the cell output.
df.tail()

Unnamed: 0,date,total_cases,new_cases,total_cases_per_million,new_cases_per_million,stringency_index,new_cases_smoothed,new_deaths_smoothed,new_cases_smoothed_per_million,new_deaths_smoothed_per_million,...,new_tests_smoothed_per_thousand,total_vaccinations,people_vaccinated,total_vaccinations_per_hundred,people_vaccinated_per_hundred,new_vaccinations,new_vaccinations_smoothed,new_vaccinations_smoothed_per_million,people_fully_vaccinated,people_fully_vaccinated_per_hundred
611,2021-10-28,870837.0,1793.0,99918.261,205.726,46.3,1387.714,5.857,159.224,0.672,...,2.739,11048576.0,5713559.0,126.77,65.56,15958.0,15996.0,1835.0,5503938.0,63.15
612,2021-10-29,872558.0,1721.0,100115.725,197.464,46.3,1449.571,7.286,166.321,0.836,...,2.777,11068501.0,5719097.0,127.0,65.62,19925.0,15094.0,1732.0,5518847.0,63.32
613,2021-10-30,872558.0,0.0,100115.725,0.0,,1449.571,7.429,166.321,0.852,...,2.792,11077084.0,5721856.0,127.1,65.65,8583.0,14202.0,1630.0,5524939.0,63.39
614,2021-10-31,872558.0,0.0,100115.725,0.0,,1449.571,7.429,166.321,0.852,...,,11077733.0,5722090.0,127.1,65.65,649.0,14053.0,1612.0,5525410.0,63.4
615,2021-11-01,877098.0,4540.0,100636.636,520.911,,1627.143,7.0,186.695,0.803,...,,,,,,,,,,


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

# Keep the date, the target (new_deaths) and the candidate explanatory columns.
df_used = df[['date', 'new_deaths', 'new_cases', 'stringency_index', 'new_tests', 'icu_patients', 'reproduction_rate']]
# Drop every row in which at least one of the selected columns is missing.
df_used = df_used.dropna()
# Remove rows with negative values because they are not possible.
# The axis is passed by keyword so the row-wise reduction is explicit.
df_used = df_used[df_used.select_dtypes(include=[np.number]).ge(0).all(axis=1)]

# Ideas (not implemented in this cell):
# - Shift the y values to the left to be able to predict the following month.
# - Use KNN to predict the shape of the features, then use these approximated
#   features to approximate the feature to predict.

# The only model input is the row label inherited from `df`, as a column vector.
X = df_used.index.values.reshape(-1, 1)
# Dense grid over the observed index range, used to draw the fitted curve.
X_range = np.linspace(X.min(), X.max(), 300).reshape(-1, 1)
# 30 index values following the last row of `df`.
X_pred = np.arange(len(df), len(df) + 30).reshape(-1, 1) # next month

y = df_used['new_deaths'].values.ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Seeded like train_test_split above: without a random_state the shuffled
# splits, and therefore the cross-validation scores and the selected
# hyperparameters, differ on every run.
cv = ShuffleSplit(test_size=0.2, train_size=0.8, random_state=0)

In [None]:
# `pyplot.express` is not an importable module; the API used below
# (`data.iris()`, `bar(..., color_discrete_sequence=...)`) is plotly.express.
import plotly.express as ex
# Imported here because this cell runs before the first cell that imports
# LinearRegression, which otherwise raises NameError on a fresh kernel.
from sklearn.linear_model import LinearRegression

dff = ex.data.iris()

# Predict petal_width from the remaining columns; species is one-hot encoded
# into columns named 'species=<value>'.
Xi = dff.drop(columns=['petal_width', 'species_id'])
Xi = pd.get_dummies(Xi, columns=['species'], prefix_sep='=')
yi = dff['petal_width']

model = LinearRegression()
model.fit(Xi, yi)

# Label each coefficient by its sign so the bars can be coloured accordingly.
colors = ['Positive' if c > 0 else 'Negative' for c in model.coef_]

fig = ex.bar(
    x=Xi.columns, y=model.coef_, color=colors,
    color_discrete_sequence=['red', 'blue'],
    labels=dict(x='Feature', y='Linear coefficient'),
    title='Weight of each feature for predicting petal width'
)
fig.show()

# Testing machine learning models

## Linear Models


### Ordinary Least Squares

In [10]:
from sklearn.linear_model import LinearRegression

# Min-max scale the single input column, then fit an ordinary least squares model.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("linear", LinearRegression()),
])

# Nothing is tuned for this model, so the grid is empty.
param_grid = dict()

# n_jobs sets the number of CPU cores used (-1 requests all of them).
grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
grid = grid.fit(X_train, y_train)

train_score = grid.score(X_train, y_train)
test_score = grid.score(X_test, y_test)

print(f"Best parameters: {grid.best_params_}")
print(f"Training set score: {train_score:.2f}")
print(f"Test set score: {test_score:.2f}")
print(f"Best cross-validation accuracy: {grid.best_score_:.2f}")

# Plot the train/test points with the fitted curve and the forecast beyond the data.
scatter_plot(
    X_train[:, 0], X_test[:, 0], X_range, X_pred,
    y_train, y_test,
    grid.predict(X_range), grid.predict(X_pred),
    "Linear",
)

1
Best parameters: {}
Training set score: 0.02
Test set score: 0.01
Best cross-validation accuracy: -0.02


### Ridge regression

In [None]:
from sklearn.linear_model import Ridge

# Min-max scaling followed by an L2-regularised linear model.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("ridge", Ridge()),
])

# Regularisation strengths to compare, from 0 up to 100.
param_grid = dict(ridge__alpha=[0, 0.001, 0.01, 0.1, 1, 10, 100])

grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

train_score = grid.score(X_train, y_train)
test_score = grid.score(X_test, y_test)

print(f"Best parameters: {grid.best_params_}")
print(f"Training set score: {train_score:.2f}")
print(f"Test set score: {test_score:.2f}")
print(f"Best cross-validation accuracy: {grid.best_score_:.2f}")

# Plot the train/test points with the fitted curve and the forecast beyond the data.
scatter_plot(
    X_train[:, 0], X_test[:, 0], X_range, X_pred,
    y_train, y_test,
    grid.predict(X_range), grid.predict(X_pred),
    "Ridge",
)

Best parameters: {'ridge__alpha': 10}
Training set score: 0.02
Test set score: 0.00
Best cross-validation accuracy: -0.01


### Lasso

In [None]:
from sklearn.linear_model import Lasso

# Min-max scaling followed by an L1-regularised linear model.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("lasso", Lasso()),
])

# Regularisation strengths to compare.
param_grid = dict(lasso__alpha=[0.001, 0.01, 0.1, 1, 10, 100])

grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

train_score = grid.score(X_train, y_train)
test_score = grid.score(X_test, y_test)

print(f"Best parameters: {grid.best_params_}")
print(f"Training set score: {train_score:.2f}")
print(f"Test set score: {test_score:.2f}")
print(f"Best cross-validation accuracy: {grid.best_score_:.2f}")

# Plot the train/test points with the fitted curve and the forecast beyond the data.
scatter_plot(
    X_train[:, 0], X_test[:, 0], X_range, X_pred,
    y_train, y_test,
    grid.predict(X_range), grid.predict(X_pred),
    "Lasso",
)

Best parameters: {'lasso__alpha': 0.1}
Training set score: 0.02
Test set score: 0.01
Best cross-validation accuracy: 0.00


### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): LogisticRegression is a classifier, while y holds the
# new_deaths values. Presumably each distinct value is treated as its own
# class, so the scores printed below would be classification accuracies and
# not comparable with the regressors' scores - confirm this is intended.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("logistic", LogisticRegression()),
])

# Inverse regularisation strengths to compare.
param_grid = dict(logistic__C=[0.001, 0.01, 0.1, 1, 10, 100])

grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

train_score = grid.score(X_train, y_train)
test_score = grid.score(X_test, y_test)

print(f"Best parameters: {grid.best_params_}")
print(f"Training set score: {train_score:.2f}")
print(f"Test set score: {test_score:.2f}")
print(f"Best cross-validation accuracy: {grid.best_score_:.2f}")

# Plot the train/test points with the fitted curve and the forecast beyond the data.
scatter_plot(
    X_train[:, 0], X_test[:, 0], X_range, X_pred,
    y_train, y_test,
    grid.predict(X_range), grid.predict(X_pred),
    "Logistic",
)

Best parameters: {'logistic__C': 0.001}
Training set score: 0.20
Test set score: 0.18
Best cross-validation accuracy: 0.19


## Naive Bayes Classifiers

### Gaussian

In [None]:
from sklearn.naive_bayes import GaussianNB

# NOTE(review): GaussianNB is a classifier fitted here on the new_deaths
# values; presumably every distinct value becomes a class label - confirm
# this is intended before comparing its scores with the regressors.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("gaussian", GaussianNB()),
])

# Nothing is tuned for this model, so the grid is empty.
param_grid = dict()

grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

train_score = grid.score(X_train, y_train)
test_score = grid.score(X_test, y_test)

print(f"Best parameters: {grid.best_params_}")
print(f"Training set score: {train_score:.2f}")
print(f"Test set score: {test_score:.2f}")
print(f"Best cross-validation accuracy: {grid.best_score_:.2f}")

# Plot the train/test points with the fitted curve and the forecast beyond the data.
scatter_plot(
    X_train[:, 0], X_test[:, 0], X_range, X_pred,
    y_train, y_test,
    grid.predict(X_range), grid.predict(X_pred),
    "Gaussian",
)

Best parameters: {}
Training set score: 0.32
Test set score: 0.18
Best cross-validation accuracy: 0.20


### Multinomial

In [None]:
from sklearn.naive_bayes import MultinomialNB

# NOTE(review): MultinomialNB is a classifier fitted here on the new_deaths
# values; presumably every distinct value becomes a class label - confirm
# this is intended before comparing its scores with the regressors.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("multinomial", MultinomialNB()),
])

# Smoothing parameters to compare.
param_grid = dict(multinomial__alpha=[0.001, 0.01, 0.1, 1, 10, 100])

grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

train_score = grid.score(X_train, y_train)
test_score = grid.score(X_test, y_test)

print(f"Best parameters: {grid.best_params_}")
print(f"Training set score: {train_score:.2f}")
print(f"Test set score: {test_score:.2f}")
print(f"Best cross-validation accuracy: {grid.best_score_:.2f}")

# Plot the train/test points with the fitted curve and the forecast beyond the data.
scatter_plot(
    X_train[:, 0], X_test[:, 0], X_range, X_pred,
    y_train, y_test,
    grid.predict(X_range), grid.predict(X_pred),
    "Multinomial",
)

Best parameters: {'multinomial__alpha': 0.001}
Training set score: 0.20
Test set score: 0.18
Best cross-validation accuracy: 0.22


## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Min-max scaling followed by a k-nearest-neighbours regressor.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("knn", KNeighborsRegressor()),
])

# Neighbourhood sizes to compare: 1 to 10, then 20, 30 and 40.
param_grid = dict(knn__n_neighbors=list(range(1, 11)) + [20, 30, 40])

grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

train_score = grid.score(X_train, y_train)
test_score = grid.score(X_test, y_test)

print(f"Best parameters: {grid.best_params_}")
print(f"Training set score: {train_score:.2f}")
print(f"Test set score: {test_score:.2f}")
print(f"Best cross-validation accuracy: {grid.best_score_:.2f}")

# Plot the train/test points with the fitted curve and the forecast beyond the data.
scatter_plot(
    X_train[:, 0], X_test[:, 0], X_range, X_pred,
    y_train, y_test,
    grid.predict(X_range), grid.predict(X_pred),
    "Knn",
)

Best parameters: {'knn__n_neighbors': 10}
Training set score: 0.98
Test set score: 0.87
Best cross-validation accuracy: 0.97


## Decision Trees


### Simple Decision Trees

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Min-max scaling followed by a single regression tree.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("tree", DecisionTreeRegressor()),
])

# Split criteria and depth limits to compare (None leaves the depth unlimited).
# NOTE(review): the recorded run warned about non-finite scores for 22
# candidates, which would match the 'squared_error' and 'absolute_error'
# criteria - presumably the installed scikit-learn predates these names;
# verify the version.
param_grid = {
    'tree__criterion': ["squared_error", "absolute_error", "poisson", "friedman_mse"],
    'tree__max_depth': list(range(1, 11)) + [None],
}

grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

train_score = grid.score(X_train, y_train)
test_score = grid.score(X_test, y_test)

print(f"Best parameters: {grid.best_params_}")
print(f"Training set score: {train_score:.2f}")
print(f"Test set score: {test_score:.2f}")
print(f"Best cross-validation accuracy: {grid.best_score_:.2f}")

# Plot the train/test points with the fitted curve and the forecast beyond the data.
scatter_plot(
    X_train[:, 0], X_test[:, 0], X_range, X_pred,
    y_train, y_test,
    grid.predict(X_range), grid.predict(X_pred),
    "Decision tree",
)

Best parameters: {'tree__criterion': 'friedman_mse', 'tree__max_depth': 6}
Training set score: 0.98
Test set score: 0.87
Best cross-validation accuracy: 0.96



One or more of the test scores are non-finite: [       nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan 0.10083764 0.36164912
 0.50607561 0.61436152 0.67788138 0.79054004 0.84375275 0.88901787
 0.93678018 0.94206817 0.9520875  0.14516022 0.79618409 0.92556835
 0.94253149 0.95832861 0.96104289 0.95810005 0.95645667 0.95497558
 0.9542155  0.95155633]



### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Min-max scaling followed by a random forest of regression trees.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("forest", RandomForestRegressor()),
])

# Split criteria, depth limits (None = unlimited) and forest sizes to compare.
param_grid = {
    'forest__criterion': ["squared_error", "absolute_error", "poisson"],
    'forest__max_depth': list(range(1, 11)) + [None],
    'forest__n_estimators': [1, 5, 10, 50, 100, 500, 1000],
}

grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

train_score = grid.score(X_train, y_train)
test_score = grid.score(X_test, y_test)

print(f"Best parameters: {grid.best_params_}")
print(f"Training set score: {train_score:.2f}")
print(f"Test set score: {test_score:.2f}")
print(f"Best cross-validation accuracy: {grid.best_score_:.2f}")

# Plot the train/test points with the fitted curve and the forecast beyond the data.
scatter_plot(
    X_train[:, 0], X_test[:, 0], X_range, X_pred,
    y_train, y_test,
    grid.predict(X_range), grid.predict(X_pred),
    "Random forest",
)

# NOTE(review): the forest is fitted on a single input column (the row index),
# whereas df_used has seven columns - confirm features_importance copes with
# that mismatch.
best_forest = grid.best_estimator_.named_steps['forest']
features_importance(best_forest, df_used)

Best parameters: {'forest__criterion': 'poisson', 'forest__max_depth': None, 'forest__n_estimators': 100}
Training set score: 0.99
Test set score: 0.87
Best cross-validation accuracy: 0.96



One or more of the test scores are non-finite: [       nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan   

### Gradient Boosted Trees

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Min-max scaling followed by gradient boosted regression trees.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("gradient_boosting", GradientBoostingRegressor()),
])

# Split criteria, loss functions, ensemble sizes and learning rates to compare.
# NOTE(review): a learning rate of 0 presumably leaves the ensemble at its
# initial prediction - consider whether that candidate is worth evaluating.
param_grid = {
    'gradient_boosting__criterion': ["squared_error", "friedman_mse"],
    'gradient_boosting__loss': ["squared_error", "absolute_error", "huber", "quantile"],
    'gradient_boosting__n_estimators': [1, 5, 10, 50, 100, 500, 1000],
    'gradient_boosting__learning_rate': [0, 0.001, 0.01, 0.1, 1, 10, 100],
}

grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

train_score = grid.score(X_train, y_train)
test_score = grid.score(X_test, y_test)

print(f"Best parameters: {grid.best_params_}")
print(f"Training set score: {train_score:.2f}")
print(f"Test set score: {test_score:.2f}")
print(f"Best cross-validation accuracy: {grid.best_score_:.2f}")

# Plot the train/test points with the fitted curve and the forecast beyond the data.
scatter_plot(
    X_train[:, 0], X_test[:, 0], X_range, X_pred,
    y_train, y_test,
    grid.predict(X_range), grid.predict(X_pred),
    "Gradient boosted trees",
)

Best parameters: {'gradient_boosting__criterion': 'friedman_mse', 'gradient_boosting__learning_rate': 0.1, 'gradient_boosting__loss': 'huber', 'gradient_boosting__n_estimators': 50}
Training set score: 0.98
Test set score: 0.87
Best cross-validation accuracy: 0.97



One or more of the test scores are non-finite: [             nan              nan              nan              nan
              nan              nan              nan              nan
              nan              nan              nan              nan
              nan              nan              nan              nan
              nan              nan              nan              nan
              nan              nan              nan              nan
              nan              nan              nan              nan
              nan              nan              nan              nan
              nan              nan              nan              nan
              nan              nan              nan              nan
              nan              nan              nan              nan
              nan              nan              nan              nan
              nan              nan              nan              nan
              nan              nan              nan    

## Support Vector Regression

In [None]:
from sklearn.svm import SVR

# Min-max scaling followed by a support vector regressor.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("svr", SVR()),
])

# C and gamma are both searched over the same powers of ten.
powers_of_ten = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {
    'svr__C': list(powers_of_ten),
    'svr__gamma': list(powers_of_ten),
}

grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

train_score = grid.score(X_train, y_train)
test_score = grid.score(X_test, y_test)

print(f"Best parameters: {grid.best_params_}")
print(f"Training set score: {train_score:.2f}")
print(f"Test set score: {test_score:.2f}")
print(f"Best cross-validation accuracy: {grid.best_score_:.2f}")

# Plot the train/test points with the fitted curve and the forecast beyond the data.
scatter_plot(
    X_train[:, 0], X_test[:, 0], X_range, X_pred,
    y_train, y_test,
    grid.predict(X_range), grid.predict(X_pred),
    "SVR",
)

Best parameters: {'svr__C': 100, 'svr__gamma': 1000}
Training set score: 0.98
Test set score: 0.87
Best cross-validation accuracy: 0.97


## Multi-Layer Perceptrons (Deep Learning)

In [None]:
from sklearn.neural_network import MLPRegressor

# Min-max scaling followed by a multi-layer perceptron regressor.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("mlp", MLPRegressor()),
])

# Hidden layer sizes, alpha values, activations and solvers to compare;
# max_iter is fixed at 1000 for every candidate.
param_grid = {
    'mlp__hidden_layer_sizes': [1, 10, 100, 1000],
    'mlp__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'mlp__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'mlp__solver': ['lbfgs', 'sgd', 'adam'],
    'mlp__max_iter': [1000],
}

grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

train_score = grid.score(X_train, y_train)
test_score = grid.score(X_test, y_test)

print(f"Best parameters: {grid.best_params_}")
print(f"Training set score: {train_score:.2f}")
print(f"Test set score: {test_score:.2f}")
print(f"Best cross-validation accuracy: {grid.best_score_:.2f}")

# Plot the train/test points with the fitted curve and the forecast beyond the data.
scatter_plot(
    X_train[:, 0], X_test[:, 0], X_range, X_pred,
    y_train, y_test,
    grid.predict(X_range), grid.predict(X_pred),
    "MLP",
)