In [1]:
import pandas as pd
import numpy as np
from statistics import mean
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    KFold,
    cross_validate,
    GridSearchCV
)
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.linear_model import LinearRegression,SGDRegressor,HuberRegressor,Perceptron,BayesianRidge
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.neural_network import MLPRegressor
from sklearn.isotonic import IsotonicRegression

import preprocessing_predictions as prep

date                datetime64[ns]
hospitalizations             int32
dtype: object


In [2]:
# Load the prepared prediction data and split it into model inputs and target.
df = prep.get_prediction_data()

# Feature columns fed to the regressors ('vaccinations' is currently not included).
feature_columns = ["deaths", "cases", "hospitalizations", "temp"]
x_train = df[feature_columns]
y_train = df["StringencyIndexForDisplay"]

# Disabled: alternative training set built from the measures data.
# df_measures = prep.get_measures_data()
# x_train_oxford = df_measures[["deaths", "cases"]]
# y_train_oxford = df_measures["StringencyIndexForDisplay"]



In [3]:
# List the distinct countries in the prepared data, then print the open
# reminders about how the data still needs to be split.
countries = df['CountryName'].unique()
print('Countries in df: ', countries)

reminders = (
    'Australia should be in here!',
    '_' * 90,
    'all rows that concern the last week (and NL) should be removed for training the model...',
    '...and used as data that will be predicted on',
)
for reminder in reminders:
    print(reminder)

Countries in df:  ['Netherlands' 'Israel' 'Australia']
Australia should be in here!
__________________________________________________________________________________________
all rows that concern the last week (and NL) should be removed for training the model...
...and used as data that will be predicted on


In [4]:
# Shared 10-fold splitter (shuffled, fixed seed) passed as `cv` to the
# cross-validation and grid-search calls in this notebook.
folds = KFold(
    n_splits=10,
    shuffle=True,
    random_state=32,
)

In [17]:
# Baseline comparison: cross-validate a range of regressors with their default
# hyperparameters and report which one has the best mean score.
regressors = [
    KNeighborsRegressor(),
    LinearRegression(),
    # NOTE(review): in the recorded run SGDRegressor scored about -2.2e16, i.e.
    # it diverged; presumably because the features are unscaled - TODO confirm.
    SGDRegressor(),
    HuberRegressor(),
    BayesianRidge(),
    DecisionTreeRegressor(),
    GaussianProcessRegressor(),
    PLSRegression(),
    MLPRegressor(),
    AdaBoostRegressor(),
    RandomForestRegressor(),
]

# (regressor, mean negative MAE) for every candidate, in the order listed above.
# The per-regressor mean is deliberately NOT stored in a variable called `mean`:
# that would overwrite `statistics.mean` imported at the top of the notebook and
# break any later cell that calls `mean(...)`.
scores = []
for regressor in regressors:
    regressor_scores = cross_val_score(
        regressor,
        x_train,
        y_train,
        cv=folds,
        scoring='neg_mean_absolute_error',
        error_score='raise',  # fail loudly instead of recording NaN for a broken fit
        n_jobs=-1,
    )
    scores.append((regressor, regressor_scores.mean()))

# The scorer returns negated MAE, so the highest value (closest to 0) is the
# best; max() keeps the first candidate on ties.
best_regressor, best_score = max(scores, key=lambda pair: pair[1])

print(f"Best regressor {best_regressor} MAE mean:", best_score)
for position, (candidate, mean_score) in enumerate(scores, start=1):
    print(f'{position} ', candidate, mean_score)

Best regressor DecisionTreeRegressor() MAE mean: -1.7976522828885044
1  KNeighborsRegressor() -3.641008657363501
2  LinearRegression() -12.245461480619499
3  SGDRegressor() -2.2097479164470108e+16
4  HuberRegressor() -14.828811076289393
5  BayesianRidge() -12.289116243113703
6  DecisionTreeRegressor() -1.7976522828885044
7  GaussianProcessRegressor() -61.15084279957999
8  PLSRegression() -12.521736008018495
9  MLPRegressor() -14.636336657438317
10  AdaBoostRegressor() -8.811670193859527
11  RandomForestRegressor() -1.9285831154315338


In [47]:
# Regression Tree hyperparameter tuning
#
# Candidate values per DecisionTreeRegressor hyperparameter. Only `criterion`,
# `splitter` and `max_depth` are part of the grid searched below; the other
# lists are kept as candidates and hold only values the estimator accepts.
criterion = ["absolute_error"]
splitter = ["best"]
max_depth = list(np.arange(18, 38, 1)) + [None]
# min_samples_split must be an int >= 2 (or a float fraction); 1 and None are rejected.
min_samples_split = list(np.arange(2, 10, 1))
# min_samples_leaf must be an int >= 1 (or a float fraction); None is rejected.
min_samples_leaf = list(np.arange(1, 10, 1))
min_weight_fraction_leaf = [0]
# "auto" was removed for trees in scikit-learn 1.3; for regressors it meant
# "use all features", which is spelled None.
max_features = [None, "sqrt", "log2"]
max_leaf_nodes = np.arange(10, 50, 5)
ccp_alpha = [0]

# Grid actually searched: built from the candidate lists above so the values
# are defined in exactly one place.
parameters = {
    "criterion": criterion,
    "splitter": splitter,
    "max_depth": max_depth,
}
regressor = DecisionTreeRegressor(random_state=32)
regressor = GridSearchCV(
    regressor,
    param_grid=parameters,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=folds,
)
regressor.fit(x_train, y_train)

# One row per parameter combination, best rank first, indexed by that rank.
results = (
    pd.DataFrame(regressor.cv_results_)
    .sort_values(by='rank_test_score')
    .set_index('rank_test_score')
)

SyntaxError: invalid syntax (Temp/ipykernel_18900/1347786305.py, line 12)

In [46]:
# Reduce the tuning results to the parameter set and its score summary,
# write that table to disk (';'-separated) and preview the top rows.
summary_columns = ['params', 'mean_test_score', 'std_test_score']
results = results.loc[:, summary_columns]
results.to_csv('data/regression_tree_tuning.csv', sep=';')
results.head()

Unnamed: 0_level_0,params,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,{'max_depth': 18},-1.762785,0.798242
2,{'max_depth': 36},-1.763253,0.761975
2,{'max_depth': 35},-1.763253,0.761975
2,{'max_depth': 34},-1.763253,0.761975
2,{'max_depth': 33},-1.763253,0.761975


In [44]:
# AdaBoost hyperparameter tuning

# Search grid: 10..24 estimators, 'square' loss, and learning rates generated
# by np.arange(0.8, 1.2, 0.05).
n_estimators = list(np.arange(10, 25, 1))
parameters = {
    'n_estimators': n_estimators,
    'loss': ['square'],
    'learning_rate': np.arange(0.8, 1.2, 0.05),
}

# Grid search over a seeded AdaBoost regressor, scored by negative MAE on the
# shared folds.
regressor = GridSearchCV(
    AdaBoostRegressor(random_state=32),
    param_grid=parameters,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=folds,
)
regressor.fit(x_train, y_train)

# Cross-validation results ordered from best to worst rank.
results = pd.DataFrame(regressor.cv_results_).sort_values(by='rank_test_score')

In [124]:
# AdaBoost, hyperparameters tuned

# Cross-validate AdaBoost with fixed hyperparameters and report the mean and
# standard deviation of the per-fold negative MAE.
regressor = AdaBoostRegressor(
    n_estimators=21,
    learning_rate=0.9,
    loss='square',
    random_state=811,
)
regressor_scores = cross_val_score(
    regressor,
    x_train,
    y_train,
    cv=folds,
    scoring='neg_mean_absolute_error',
    error_score='raise',
)
print("Mean:", regressor_scores.mean())
print("stdev:", regressor_scores.std())

Mean: -4.9099265252002455
stdev: 1.9805838029801786


In [5]:
# Decision Tree and Random Forest evaluated on the
# FULL OXFORD SET, ALL COUNTRIES

# Build the Oxford inputs in this cell so it runs on a fresh kernel; nothing
# else in the notebook defines x_train_oxford / y_train_oxford.
# NOTE(review): the loader call and column names are taken from the
# commented-out lines in the data-loading cell - confirm they are still current.
df_measures = prep.get_measures_data()
x_train_oxford = df_measures[["deaths", "cases"]]
y_train_oxford = df_measures["StringencyIndexForDisplay"]

# Decision Tree
# Seeded so repeated runs give the same scores.
dtr_grid_search = DecisionTreeRegressor(criterion='absolute_error', random_state=32)
scores_dtr = cross_val_score(
    dtr_grid_search,
    x_train_oxford,
    y_train_oxford,
    cv=folds,
    scoring='neg_mean_absolute_error',
    error_score='raise',
)
# The scorer returns negated MAE per fold (not MSE), so label it as such.
print("Neg. MAE dtr:", scores_dtr)
# np.mean instead of statistics.mean: the name `mean` may be rebound by another
# cell, and np.mean works directly on the score array.
print("Mean:", np.mean(scores_dtr))

# Random Forest
rfr_grid_search = RandomForestRegressor(
    n_jobs=-1,
    n_estimators=12,
    min_samples_split=5,
    max_depth=21,
    max_features='log2',
    criterion='absolute_error',
    random_state=32,
)
scores_rfr = cross_val_score(
    rfr_grid_search,
    x_train_oxford,
    y_train_oxford,
    cv=folds,
    scoring='neg_mean_absolute_error',
    error_score='raise',
)
print("Neg. MAE rfr:", scores_rfr)
print("Mean:", np.mean(scores_rfr))

Accuracy rfr: [0.42388131 0.42296422 0.42742043 0.41482642 0.43004378]
Mean: 0.42382723184585913
