In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('../.')
from lib import get_data

In [53]:
df = get_data.get_model_data(date_range=(0,7), pred_day=10)
df.head()

Unnamed: 0,state,county,fips,cases,deaths,cldCvrMin,cldCvrAvg,cldCvrMax,dewPtMin,dewPtAvg,...,retail_and_recreation_percent_change_from_baseline,residential_percent_change_from_baseline,workplaces_percent_change_from_baseline,transit_stations_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,income_2018,pop_2018,day_10_delta_cases,day_10_delta_deaths
0,alabama,autauga,1001,11.5,0.375,0.25,31.25,81.625,48.7625,52.875,...,-31.5,16.5,-38.0,,9.875,,41618.0,55601.0,7,0
1,alabama,baldwin,1003,20.875,0.5,0.0,31.25,86.625,47.6625,53.725,...,-42.125,12.25,-31.875,-24.5,-12.5,-33.5,45596.0,218022.0,13,0
2,alabama,blount,1009,10.625,0.0,0.075,30.95,89.75,41.68,47.14,...,-29.25,16.8,-34.75,,0.0,,34976.0,57840.0,4,0
3,alabama,calhoun,1015,28.25,0.0,0.166667,23.708333,81.416667,46.5,50.866667,...,-35.75,13.625,-37.5,,-2.5,,37120.0,114277.0,5,0
4,alabama,chambers,1017,24.875,1.625,0.0,29.958333,84.333333,46.320833,52.445833,...,-25.375,12.8,-32.625,-22.0,1.25,,33859.0,33615.0,42,2


In [54]:
df = df._get_numeric_data().drop(['fips'],axis=1).dropna()
df.head()

Unnamed: 0,cases,deaths,cldCvrMin,cldCvrAvg,cldCvrMax,dewPtMin,dewPtAvg,dewPtMax,feelsLikeMin,feelsLikeAvg,...,retail_and_recreation_percent_change_from_baseline,residential_percent_change_from_baseline,workplaces_percent_change_from_baseline,transit_stations_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,income_2018,pop_2018,day_10_delta_cases,day_10_delta_deaths
1,20.875,0.5,0.0,31.25,86.625,47.6625,53.725,59.6375,56.025,67.625,...,-42.125,12.25,-31.875,-24.5,-12.5,-33.5,45596.0,218022.0,13,0
11,21.0,1.0,0.0625,31.833333,89.0625,43.135417,48.777083,54.99375,51.904167,62.291667,...,-37.25,14.142857,-34.625,-30.25,4.5,41.714286,39607.0,83442.0,9,0
12,14.125,0.0,0.25,30.875,91.875,40.8,47.1125,53.85,48.525,60.2125,...,-37.625,14.0,-31.0,-37.0,-4.125,-4.0,31916.0,71385.0,10,0
14,29.875,2.625,0.0,24.125,71.75,46.35,51.0625,55.225,49.175,62.375,...,-38.0,13.0,-35.625,-31.625,-3.875,-23.0,36918.0,102501.0,27,3
16,23.5,1.0,0.0,33.0,96.0,50.9,56.825,62.6625,59.7125,70.425,...,-42.25,13.625,-34.5,-31.875,-9.125,3.5,42398.0,104722.0,14,0


In [55]:
X = df.drop(['day_10_delta_cases','day_10_delta_deaths'], axis = 1)
y = df.day_10_delta_cases

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
rf = RandomForestRegressor(max_features='auto', oob_score=True, random_state=1, n_jobs=-1)
param_grid = { "min_samples_leaf" : [1, 5, 10], "min_samples_split" : [2, 4, 8, 12], "n_estimators": [10, 50, 100, 200, 500]}
gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)
gs = gs.fit(X_train, y_train)

gs.best_estimator_

  warn("Some inputs do not have OOB scores. "


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=-1, oob_score=True,
                      random_state=1, verbose=0, warm_start=False)

In [37]:
rf = gs.best_estimator_

rf.fit(X_train, y_train)
print("%.4f" % rf.oob_score_)

0.6088


In [38]:
rf.score(X_test, y_test)

0.6011377750644261

In [39]:
pd.concat((pd.DataFrame(X_train.columns, columns = ['variables']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:10]


Unnamed: 0,variables,importance
0,cases,0.629243
16,mslPresMax,0.05252
15,mslPresAvg,0.033533
14,mslPresMin,0.024822
64,pop_2018,0.024356
4,cldCvrMax,0.021383
18,presTendMin,0.02005
45,windSpdMin,0.018451
63,income_2018,0.016753
47,windSpdMax,0.008852


In [41]:
# death
y = df.day_10_delta_deaths
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
rf = RandomForestRegressor(max_features='auto', oob_score=True, random_state=1, n_jobs=-1)
param_grid = { "min_samples_leaf" : [1, 5, 10], "min_samples_split" : [2, 4, 8, 12], "n_estimators": [10, 50, 100, 200, 500, 1000]}
gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)
gs = gs.fit(X_train, y_train)

gs.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=5,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=500, n_jobs=-1, oob_score=True,
                      random_state=1, verbose=0, warm_start=False)

In [43]:
rf = gs.best_estimator_
rf.fit(X_train, y_train)
print("%.4f" % rf.oob_score_)

0.4250


In [44]:
rf.score(X_test, y_test)

0.150478989948853

In [45]:
pd.concat((pd.DataFrame(X_train.columns, columns = ['variables']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:10]


Unnamed: 0,variables,importance
0,cases,0.501452
1,deaths,0.191011
44,windDir100mAvg,0.014838
49,windSpd80mAvg,0.013934
16,mslPresMax,0.013881
19,presTendAvg,0.012086
63,income_2018,0.012084
60,transit_stations_percent_change_from_baseline,0.010327
5,dewPtMin,0.009629
62,parks_percent_change_from_baseline,0.009229


In [46]:
# gradient boosting

# cases
X = df.drop(['day_10_delta_cases','day_10_delta_deaths'], axis = 1)
y = df.day_10_delta_cases

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gb_hyperparameters = {
    "n_estimators": [10, 50, 100, 200, 500],
    "min_samples_split" : [2, 4, 8, 12],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5],
    "min_samples_leaf": [1, 3, 5]
}

gbr = GradientBoostingRegressor()
gs = GridSearchCV(estimator=gbr, param_grid=gb_hyperparameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
gs = gs.fit(X_train, y_train)

gs.best_estimator_

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.2, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=3, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [47]:
gbr = gs.best_estimator_
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)

In [48]:
print("R-squared for Train: %.2f" % gbr.score(X_train, y_train))
print("R-squared for Test: %.2f" % gbr.score(X_test, y_test))

R-squared for Train: 0.98
R-squared for Test: 0.62


In [49]:
pd.concat((pd.DataFrame(X_train.columns, columns = ['variables']), 
           pd.DataFrame(gbr.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:10]


Unnamed: 0,variables,importance
0,cases,0.584037
14,mslPresMin,0.069735
16,mslPresMax,0.060523
1,deaths,0.043946
61,grocery_and_pharmacy_percent_change_from_baseline,0.041964
64,pop_2018,0.033892
15,mslPresAvg,0.03352
18,presTendMin,0.029926
63,income_2018,0.015254
3,cldCvrAvg,0.014869


In [50]:
y_pred 

array([ 23.99168236,  22.69439417,  54.60163697,  27.07693476,
         5.88006274,  19.1776787 ,   7.94459738,  51.75720638,
         5.41630832,  12.30190209,  65.44641683,  18.05639726,
        21.82875933,   9.76448836,  55.11700628,  39.37445361,
        26.06210849,   5.70596011,  35.95989148,  19.92024859,
         8.47514753,  23.17978942,   2.68884249,  37.05197078,
        21.68574919,   8.00396237,  38.15373296,   3.18024478,
        11.5183166 ,   9.82504635,  19.09070341,  18.10415172,
        69.60677325,  18.24333679,  38.07298896, 287.75166502,
        25.69873133,  14.90504856,  26.12285408,  53.24306573,
        82.17343512, 153.79055334,  12.33305892,  11.93214949,
        56.00082772,  20.63119574, 133.75391129,  29.89920909,
         8.54146365,  37.25975406,   7.91808078,  36.98072319,
        58.37090363, 326.56345424,  19.44305929,   9.43814126,
        11.25053085,   8.49287793,  77.59182271,  11.6204854 ,
        15.54096578,  39.23164339,  39.2261511 ,  35.68