In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Both city files share the same layout: the week start date is parsed as a
# datetime and used as the index.
_read_opts = dict(index_col = 'week_start_date', parse_dates = ['week_start_date'])
pre_sj = pd.read_csv('Data/sj_scaled_20200522.csv', **_read_opts)
pre_iq = pd.read_csv('Data/iq_scaled_20200522.csv', **_read_opts)

In [3]:
# Preview the first five rows of the sj frame.
pre_sj.head()

Unnamed: 0_level_0,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,...,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
week_start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-04-30,0.245004,0.159823,0.504881,0.465672,0.031797,0.984688,0.985367,0.981929,0.985212,0.986662,...,0.83774,0.031797,0.720826,0.593548,0.840491,0.695965,0.825843,0.749064,0.052305,1.609438
1990-05-07,0.339528,0.219068,0.412987,0.40765,0.058423,0.986802,0.987684,0.987091,0.988827,0.988329,...,0.883448,0.058423,0.790785,0.535484,0.882492,0.642651,0.890449,0.831461,0.028114,1.791759
1990-05-14,0.064448,0.266513,0.399869,0.447913,0.088428,0.988688,0.989126,0.99207,0.987512,0.99133,...,0.936936,0.088428,0.866696,0.519355,0.882492,0.654179,0.904494,0.853933,0.135338,1.609438
1990-05-21,0.257061,0.377607,0.578836,0.618441,0.039324,0.989368,0.990284,0.991653,0.99047,0.99033,...,0.917345,0.039324,0.857657,0.548387,0.907504,0.682997,0.935393,0.872659,0.013076,1.386294
1990-05-28,0.392086,0.404006,0.638977,0.648472,0.019252,0.991127,0.991726,0.99337,0.992113,0.991997,...,0.918748,0.019252,0.885288,0.680645,0.956111,0.945245,0.983146,0.895131,0.01896,1.94591


In [4]:
# (rows, columns) of the full sj frame, before rows with a missing total_cases are filtered out.
pre_sj.shape

(1196, 21)

In [5]:
# Training data = rows whose target (total_cases) is present.
# Rows with a null total_cases stay only in pre_sj / pre_iq (the data for prediction).
sj = pre_sj.loc[pre_sj['total_cases'].notna()]
iq = pre_iq.loc[pre_iq['total_cases'].notna()]

# Modeling

In [6]:
# Models
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge, BayesianRidge, ElasticNet, HuberRegressor, TheilSenRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Hyperparameters Tuning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Feature Selection and Data Manipulation Process
from backward_elim import backward_elim
from data_prep import slicing, splitting, gridsearch_loop, select_var

In [42]:
# Hyperparameters Grids
# One candidate grid per estimator; `model_list` and `param_list` must stay in
# the same order because they are paired position by position below.
model_list = [
    SVR(),
    Lasso(),
    Ridge(),
    BayesianRidge(),
    ElasticNet(),
    HuberRegressor(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
]

SV_params = {'kernel': ['linear', 'rbf'], 'C': [1.0, 5.0]}
LS_params = {'alpha': [0.1, 1.0], 'tol': [1e-4, 1e-3]}
RD_params = {'alpha': [0.5, 1.0], 'tol': [1e-4, 1e-3]}
BR_params = {
    'alpha_1': [1e-6, 0.001],
    'alpha_2': [1e-6, 0.001],
    'lambda_1': [1e-6, 0.001],
    'lambda_2': [1e-6, 0.001],
}
EN_params = {'l1_ratio': [0.2, 0.5, 0.8], 'alpha': [1, 2]}
HR_params = {'epsilon': [1.35, 1.75, 2]}
# TheilSenRegressor is intentionally left out (it doesn't work here), so its
# grid stays disabled:
#TS_params = {'max_iter': [150, 300]}
KN_params = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}
# NOTE(review): the 'mae' criterion used in the two tree-ensemble grids may be
# rejected by newer scikit-learn releases -- confirm against the installed version.
RF_params = {'n_estimators': [100, 500], 'criterion': ['mae'], 'min_samples_split': [2, 4]}
GB_params = {'learning_rate': [0.01, 0.1, 0.5], 'n_estimators': [100, 500], 'criterion': ['mae']}

param_list = [
    SV_params, LS_params, RD_params, BR_params, EN_params, HR_params,
    KN_params, RF_params, GB_params,
]

# Map each estimator instance to its grid.
params_dict = dict(zip(model_list, param_list))

In [52]:
# Create a new DataFrame to collect results.
# The modeling loop further down adds its rows to `all_result`, so the name must
# be defined before that loop runs (otherwise a fresh kernel raises NameError).
all_result = pd.DataFrame(columns=['city', 'lag', 'method', 'model_name', 'model_detail', 'training_score', 'testing_score'])

In [54]:
# Run configuration consumed by the modeling loop.
min_lag, max_lag = 1, 5          # inclusive range of lags to try
city_name, city = 'sj', sj       # label stored in the results + the frame to model
method = 'BackElim'              # feature-selection method passed to select_var
train_size = 0.8                 # passed to splitting() as the training share

In [56]:
# For every lag in [min_lag, max_lag]: select features, split the data, run the
# randomized search over all models, and record one result row per model.
result_columns = ['city', 'lag', 'method', 'model_name', 'model_detail',
                  'training_score', 'testing_score']

# Rows are gathered in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame per call.
records = []

for lag in range(min_lag, max_lag + 1):
    # Step 1, feature selection
    print('Step 1 of lag %.0f'%lag)
    useful_features = select_var(city, lag, corr_dict = None, method=method)
    n_features = len(useful_features)
    print('Select %.0f features.'%n_features)

    # Step 2, Split the data
    x_train, x_test, y_train, y_test = splitting(slicing(city[useful_features], 'total_cases', lag), 'total_cases', train_size)

    # Step 3 For-loop Randomized Search
    grid_result = gridsearch_loop(city, city_name, model_list, params_dict, x_train, x_test, y_train, y_test)
    # Note: each item of grid_result is indexed below as a 3-element sequence:
    # the sklearn object, the training score, and the testing score

    # Queue one record per model run
    for run in grid_result:
        records.append({
            'city': city_name,
            'lag': lag,
            'method': method,
            'model_name': run[0].__class__.__name__,
            'model_detail': run[0],
            'training_score': run[1],
            'testing_score': run[2]
        })

new_rows = pd.DataFrame(records, columns=result_columns)

# Keep rows collected by earlier executions (e.g. a previous city) when
# `all_result` already exists; otherwise start the results table from this run.
previous = globals().get('all_result')
if previous is None or previous.empty:
    all_result = new_rows
else:
    all_result = pd.concat([previous, new_rows], ignore_index=True)
print('Iteration is completed')

Step 1 of lag 1
Select 11 features.
Step 1 of lag 2
Select 13 features.
Step 1 of lag 3
Select 13 features.
Step 1 of lag 4
Select 10 features.
Step 1 of lag 5
Select 10 features.
Iteration is completed


In [55]:
# Show the last rows of the collected results.
# NOTE(review): the saved output below contains only the header row; this cell's
# execution count (55) precedes the modeling loop's (56), so re-run it after the loop.
all_result.tail()

Unnamed: 0,city,lag,method,model_name,model_detail,training_score,testing_score
