In [1]:
import pandas as pd
import numpy as np
from statistics import mean
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    KFold,
    cross_validate,
    GridSearchCV
)
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.linear_model import LinearRegression,SGDRegressor,HuberRegressor,Perceptron,BayesianRidge
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.neural_network import MLPRegressor
from sklearn.isotonic import IsotonicRegression

import preprocessing_predictions as prep

In [2]:
# Frame used for the main model comparison and AdaBoost tuning cells.
df = prep.get_prediction_data()
# Later cells build x_train_oxford / y_train_oxford from df_measures, so it
# must be loaded here; with this line commented out a fresh
# "Restart & Run All" stops with NameError at the first use of df_measures.
# NOTE(review): presumably this was disabled because the load is slow -
# confirm, and cache the result if so rather than commenting it out.
df_measures = prep.get_measures_data()

In [131]:
# Quick sanity check on which countries are present in the prediction frame,
# followed by reminder notes about the train/predict split.
countries = df['CountryName'].unique()
print('Countries in df: ',countries)
for note in (
    'Australia should be in here!',
    '_'*90,
    'all rows that concern the last week (and NL) should be removed for training the model...',
    '...and used as data that will be predicted on',
):
    print(note)

Countries in df:  ['Israel' 'Netherlands']
Australia should be in here!
__________________________________________________________________________________________
all rows that concern the last week (and NL) should be removed for training the model...
...and used as data that will be predicted on


In [3]:
# Shared cross-validation splitter: 10 shuffled folds with a fixed seed so
# every model below is scored on the same splits.
folds = KFold(
    n_splits=10,
    shuffle=True,
    random_state=32,
)

In [4]:
# Features / target taken from the prediction frame.
feature_columns = ["deaths", "cases", "hospitalizations", "vaccinations", "temp"]
x_train = df[feature_columns]
y_train = df["StringencyIndexForDisplay"]

# Features / target taken from the measures frame (two features only).
oxford_feature_columns = ["deaths", "cases"]
x_train_oxford = df_measures[oxford_feature_columns]
y_train_oxford = df_measures["StringencyIndexForDisplay"]



In [5]:
# Candidate models compared by cross-validated mean absolute error. All use
# default hyperparameters except the random forest.
regressors = [
    KNeighborsRegressor(),
    LinearRegression(),
    SGDRegressor(),
    HuberRegressor(),
    BayesianRidge(),
    DecisionTreeRegressor(),
    GaussianProcessRegressor(),
    PLSRegression(),
    MLPRegressor(),
    AdaBoostRegressor(),
    RandomForestRegressor(n_jobs=-1,n_estimators=12,min_samples_split=5,max_depth=21,max_features='log2',criterion='absolute_error'),
]

# 'neg_mean_absolute_error' yields negated MAE values (<= 0), so the best
# model is the one with the HIGHEST mean score. Starting from -inf avoids a
# magic sentinel and guarantees the first candidate is always accepted.
best_regressor = ''
best_score = -np.inf

for regressor in regressors:
    regressor_scores = cross_val_score(regressor, x_train, y_train, cv=folds, scoring='neg_mean_absolute_error', error_score='raise', n_jobs=-1)
    # Deliberately NOT named `mean`: notebook variables are kernel-global, and
    # rebinding `mean` would shadow `statistics.mean` imported at the top,
    # breaking later cells that call mean(...).
    mean_score = regressor_scores.mean()
    if mean_score > best_score:
        best_regressor = regressor
        best_score = mean_score

print(f"{best_regressor} MAE mean:", best_score)

AdaBoostRegressor() MAE mean: -5.171669929265951


In [44]:
# AdaBoost hyperparameter tuning: grid search over the number of estimators
# and the learning rate, scored by negated MAE on the `folds` splitter.

n_estimators = list(np.arange(10, 25, 1))
parameters = {
    'n_estimators': n_estimators,
    'loss': ['square'],
    'learning_rate': np.arange(0.8, 1.2, 0.05),
}
regressor = GridSearchCV(
    AdaBoostRegressor(random_state=32),
    param_grid=parameters,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=folds,
)
regressor.fit(x_train, y_train)
# One row per parameter combination, best-ranked combination first.
results = pd.DataFrame(regressor.cv_results_).sort_values(by='rank_test_score')

In [45]:
# Show the five best-ranked parameter combinations with their CV score spread.
summary_columns = [
    'param_loss',
    'param_n_estimators',
    'param_learning_rate',
    'mean_test_score',
    'std_test_score',
]
print(results[summary_columns].head())

   param_loss param_n_estimators param_learning_rate  mean_test_score  \
41     square                 21                 0.9        -4.876255   
40     square                 20                 0.9        -4.898224   
98     square                 18                 1.1        -4.904362   
43     square                 23                 0.9        -4.911939   
39     square                 19                 0.9        -4.921986   

    std_test_score  
41        2.120645  
40        1.810617  
98        2.115379  
43        2.190320  
39        2.628689  


In [124]:
# AdaBoost, hyperparameters tuned
# Re-score the configuration found by the grid search on the same folds.

regressor = AdaBoostRegressor(
    n_estimators=21,
    learning_rate=0.9,
    loss='square',
    random_state=811,
)
regressor_scores = cross_val_score(
    regressor,
    x_train,
    y_train,
    cv=folds,
    scoring='neg_mean_absolute_error',
    error_score='raise',
)
print("Mean:", regressor_scores.mean())
print("stdev:", regressor_scores.std())

Mean: -4.9099265252002455
stdev: 1.9805838029801786


In [5]:
# Decision Tree
# FULL OXFORD SET, ALL COUNTRIES
dtr_grid_search = DecisionTreeRegressor(criterion='absolute_error')
scores_dtr = cross_val_score(dtr_grid_search, x_train_oxford, y_train_oxford, cv=folds,scoring='neg_mean_absolute_error',error_score='raise')
# The scorer is 'neg_mean_absolute_error', so these are negated MAE values
# (not MSE). np.mean is used instead of the bare name `mean`, which another
# cell in this notebook rebinds to a float.
print("MAE dtr:", scores_dtr)
print("Mean:", np.mean(scores_dtr))

# Random Forest
# FULL OXFORD SET, ALL COUNTRIES
rfr_grid_search = RandomForestRegressor(n_jobs=-1,n_estimators=12,min_samples_split=5,max_depth=21,max_features='log2',criterion='absolute_error')
scores_rfr = cross_val_score(rfr_grid_search, x_train_oxford, y_train_oxford, cv=folds,scoring='neg_mean_absolute_error',error_score='raise')
print("MAE rfr:", scores_rfr)
print("Mean:", np.mean(scores_rfr))

Accuracy rfr: [0.42388131 0.42296422 0.42742043 0.41482642 0.43004378]
Mean: 0.42382723184585913
