In [1]:
import pandas as pd
import numpy as np
from statistics import mean
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    KFold,
    cross_validate,
    GridSearchCV
)
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.linear_model import LinearRegression,SGDRegressor,HuberRegressor,Perceptron,BayesianRidge
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.neural_network import MLPRegressor
from sklearn.isotonic import IsotonicRegression

import preprocessing_predictions as prep

CountryName                                      object
CountryCode                                      object
RegionName                                       object
RegionCode                                       object
Jurisdiction                                     object
date                                     datetime64[ns]
C1_School closing                               float64
C1_Flag                                         float64
C2_Workplace closing                            float64
C2_Flag                                         float64
C3_Cancel public events                         float64
C3_Flag                                         float64
C4_Restrictions on gatherings                   float64
C4_Flag                                         float64
C5_Close public transport                       float64
C5_Flag                                         float64
C6_Stay at home requirements                    float64
C6_Flag                                         

In [2]:
df = prep.get_prediction_data()
x_train = df[["deaths", "cases", "hospitalizations", "temp"]]#, 'vaccinations']]
y_train = df["StringencyIndexForDisplay"]

# df_measures = prep.get_measures_data()
# x_train_oxford = df_measures[["deaths", "cases"]]
# y_train_oxford = df_measures["StringencyIndexForDisplay"]



41.67000000000001
         deaths        cases  hospitalizations       temp
0  18240.714286  3701.285714         23.857143  11.542857


  df_measures = m.get_measures_df_il_nl_nsw()
  mean_latest_week = pd.DataFrame(latest_week.mean(axis=0)).T
  mean_latest_week = pd.DataFrame(latest_week.mean(axis=0)).T


In [3]:
countries = df['CountryName'].unique()
print('Countries in df: ',countries)
print('Australia should be in here!')
print('_'*90)
print('all rows that concern the last week (and NL) should be removed for training the model...')
print('...and used as data that will be predicted on')

Countries in df:  ['Netherlands' 'Israel' 'Australia']
Australia should be in here!
__________________________________________________________________________________________
all rows that concern the last week (and NL) should be removed for training the model...
...and used as data that will be predicted on


In [4]:
folds = KFold(n_splits = 10, shuffle = True, random_state = 32)

In [17]:
regressors = [
    KNeighborsRegressor(),
    LinearRegression(),
    SGDRegressor(),
    HuberRegressor(),
    BayesianRidge(),
    DecisionTreeRegressor(),
    GaussianProcessRegressor(),
    PLSRegression(),
    MLPRegressor(),
    AdaBoostRegressor(),
    RandomForestRegressor()
    
]

best_regressor = ''
best_score = 9999
scores = []

for regressor in regressors:
    regressor_scores = cross_val_score(regressor, x_train, y_train, cv=folds, scoring='neg_mean_absolute_error', error_score='raise', n_jobs=-1)
    mean = regressor_scores.mean()
    scores.append((regressor, mean))

    if abs(mean) < abs(best_score):
        best_regressor = regressor
        best_score = mean

print(f"Best regressor {best_regressor} MAE mean:", best_score)
i=1
for regressor, mean in scores:
    print(f'{i} ', regressor, mean)
    i+=1

Best regressor DecisionTreeRegressor() MAE mean: -1.7976522828885044
1  KNeighborsRegressor() -3.641008657363501
2  LinearRegression() -12.245461480619499
3  SGDRegressor() -2.2097479164470108e+16
4  HuberRegressor() -14.828811076289393
5  BayesianRidge() -12.289116243113703
6  DecisionTreeRegressor() -1.7976522828885044
7  GaussianProcessRegressor() -61.15084279957999
8  PLSRegression() -12.521736008018495
9  MLPRegressor() -14.636336657438317
10  AdaBoostRegressor() -8.811670193859527
11  RandomForestRegressor() -1.9285831154315338


In [50]:
# Regression Tree hyperparameter tuning
criterion = ["absolute_error"]
splitter = ["best"]
max_depth = list(np.arange(18, 38,1))  + [None]
min_samples_split = list(np.arange(1, 10, 1)) + [None]
min_samples_leaf = list(np.arange(1, 10, 1)) + [None]
min_weight_fraction_leaf = [0]
max_features = ["auto", "sqrt", "log2"]
max_leaf_nodes = np.arange(10, 50, 5)
ccp_alpha = [0]
parameters = {
    'criterion': ["absolute_error"],
    'splitter': ["best"],
    "max_depth": [20,18,19],
    'min_samples_leaf':min_samples_leaf,
    'min_samples_split':min_samples_split
}
regressor = DecisionTreeRegressor(random_state=32)
regressor = GridSearchCV(regressor,param_grid=parameters, scoring='neg_mean_absolute_error', n_jobs=-1, cv=folds)
regressor.fit(x_train,y_train)
results = pd.DataFrame(regressor.cv_results_).sort_values(by='rank_test_score').set_index('rank_test_score')

840 fits failed out of a total of 3000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Tim\Development\INFOMDSS\.env\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Tim\Development\INFOMDSS\.env\lib\site-packages\sklearn\tree\_classes.py", line 1315, in fit
    super().fit(
  File "c:\Users\Tim\Development\INFOMDSS\.env\lib\site-packages\sklearn\tree\_classes.py", line 250, in fit
    raise ValueError(
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

-------------------------------

In [49]:
results = results[['params','mean_test_score', 'std_test_score']]
results.to_csv('data/regression_tree_tuning.csv',sep=';')
results.head()

Unnamed: 0_level_0,params,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"{'criterion': 'absolute_error', 'max_depth': 2...",-1.54232,0.518872
2,"{'criterion': 'absolute_error', 'max_depth': 2...",-1.595603,0.499648
3,"{'criterion': 'absolute_error', 'max_depth': 3...",-1.598271,0.497715
4,"{'criterion': 'absolute_error', 'max_depth': 1...",-1.602004,0.486508
5,"{'criterion': 'absolute_error', 'max_depth': 2...",-1.60282,0.4947


In [58]:
list(results.head(1)['params'].values)

[{'criterion': 'absolute_error',
  'max_depth': 20,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'splitter': 'best'}]

In [44]:
# AdaBoost hyperparameter tuning

n_estimators = list(np.arange(10,25,1))
parameters = {'n_estimators': n_estimators, 'loss':['square'],'learning_rate':np.arange(0.8,1.2,0.05)}
regressor = AdaBoostRegressor(random_state=32)
regressor = GridSearchCV(regressor,param_grid=parameters, scoring='neg_mean_absolute_error', n_jobs=-1, cv=folds)
regressor.fit(x_train,y_train)
results = pd.DataFrame(regressor.cv_results_).sort_values(by='rank_test_score')

In [124]:
# AdaBoost, hyperparameters tuned

regressor = AdaBoostRegressor(random_state=811, loss='square',n_estimators=21,learning_rate=0.9)
regressor_scores = cross_val_score(regressor, x_train, y_train, cv=folds,scoring='neg_mean_absolute_error',error_score='raise')
print("Mean:", np.mean(regressor_scores))
print("stdev:", np.std(regressor_scores))

Mean: -4.9099265252002455
stdev: 1.9805838029801786


In [3]:
df = prep.get_latest_stringency_nl()
print(df.dtypes)
print(df)