<a href="https://colab.research.google.com/github/StevenBryceLee/DengAI/blob/master/Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The goal of this notebook is to get much further in model searching without human input: find good models, then optimize their hyperparameters by running multiple hyperparameter searches automatically rather than searching by hand.<br>
The difficulty is that hyperparameters have different ranges and admissible values, so this only works as long as a hyperparameter's values can be found without reading the documentation. For example, some hyperparameters, such as the loss function, must be one of a fixed list like ['mse','mae'], which cannot be learned except by reading the documentation. I expect this approach to be more successful when searching over numeric ranges.<br>
Competition URL:<br>
https://www.drivendata.org/competitions/44/dengai-predicting-disease-spread/page/82/

In [None]:
# !wget -O features_train.csv 'https://drivendata-prod.s3.amazonaws.com/data/44/public/dengue_features_train.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200817%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200817T143418Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=2b3964d4e4cb929566aa344aaf2e07992c25b0daca4ce719cf6c49a3de2ac256'
# !wget -O labels_train.csv 'https://drivendata-prod.s3.amazonaws.com/data/44/public/dengue_labels_train.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200817%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200817T143418Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=998c2ef0db8e655154a200e8829ba20682071d35d222224aab6191eed6898366'
# !wget -O features_test.csv 'https://drivendata-prod.s3.amazonaws.com/data/44/public/dengue_features_test.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200817%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200817T143418Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=675b5b3940af4e0a439323481472746f28b34e6e901a4a124c9e53fb09e52c12'

In [None]:
try:
  import category_encoders
except:
  !pip install category_encoders

  import pandas.util.testing as tm


In [None]:
import os 

import pandas as pd
import numpy as np

In [None]:
#os.listdir()

In [None]:
#Load the training features; the file name matches the -O target of the commented-out wget cell
trainFeatures = pd.read_csv('features_train.csv')
#trainFeatures.head()

In [None]:
#Load the training labels; the file name matches the -O target of the commented-out wget cell
trainLabels = pd.read_csv('labels_train.csv')
#trainLabels.head()

In [None]:
#Join features and labels on every column of the label table except its last one
train = trainFeatures.merge(trainLabels,on=list(trainLabels.columns[:-1]))
#train.head()

In [None]:
#Load the test features; the file name matches the -O target of the commented-out wget cell
test = pd.read_csv('features_test.csv')
#test.head()

In [None]:
def wrangle(df):
  '''
  This function wrangles training and testing data from the DengAI datasets

  df is a pandas df with either train or test data. It must contain a
  week_start_date column that pd.to_datetime can parse.

  returns a cleaned df. This is a new DataFrame; the frame passed in is
  left unmodified. Changes applied:
    * years / months / days integer columns derived from week_start_date
    * week_start_date itself removed
    * when a total_cases column is present (labelled training data), rows
      with fewer than len(columns)-8 non-null values are dropped
  '''
  #Work on a copy so the caller's frame is never mutated behind its back
  df = df.copy()

  #Drop empty rows
  #df.drop(df[df.weekofyear==53].index,inplace=True)

  #convert datetime
  week_start = pd.to_datetime(df['week_start_date'])
  #Get days, months, years (vectorised .dt accessors instead of row-wise apply)
  df['years'] = week_start.dt.year
  df['months'] = week_start.dt.month
  df['days'] = week_start.dt.day

  #Drop datetime object type (weekofyear is deliberately kept)
  df = df.drop(columns=['week_start_date'])

  #Only applying to the training dataset, which contains labels
  if 'total_cases' in df.columns:
    #thresh is computed after the column changes above, as in the original
    df = df.dropna(axis=0,thresh=len(df.columns)-8)

  return df

In [None]:
def model_save(df, y_pred,name):
  '''
  Write a submission CSV and, when running inside Google Colab, download it.

  df is a pandas df holding the city, year and weekofyear columns
  y_pred holds the predictions, one per row of df, stored as total_cases
  name is the output file name; '.csv' is appended unless name already ends with it

  returns None
  '''
  submission = df[['city','year','weekofyear']].copy()
  submission['total_cases'] = y_pred
  print(submission.head())
  #Check the suffix, not a substring, so a name like 'run.csv.bak' still gets a .csv extension
  if not name.endswith('.csv'):
    name += '.csv'
  submission.to_csv(name,index=False)
  #google.colab is only importable on Colab; elsewhere the CSV has still been written
  try:
    from google.colab import files
  except ImportError:
    return
  files.download(name)

In [None]:
#wrangle() reads and then drops week_start_date, so this cell cannot be re-run on the
#already-wrangled frames; re-run the loading cells first
train = wrangle(train)
test = wrangle(test)

In [None]:
# !pip install pandas-profiling==2.*;
# from pandas_profiling import ProfileReport
# profile = ProfileReport(train, minimal=True).to_notebook_iframe()
# profile

In [None]:
from sklearn.metrics import mean_absolute_error as MAE

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from category_encoders import OneHotEncoder
from category_encoders import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor

#Non-performant models
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor 

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from xgboost import XGBRegressor

In [None]:
#Separate the target from the predictors, then hold out 25% of the rows for validation
y = train['total_cases']
X = train.drop(columns='total_cases')
X_train,X_val,y_train,y_val = train_test_split(X,y,test_size = 0.25,random_state=42)

In [None]:
#Get baseline accuracy
#Baseline error: the MAE obtained by always predicting the median of total_cases
median_cases = train['total_cases'].median()
base = MAE(train['total_cases'],np.full(len(train),median_cases))
base

19.863070539419088

In [None]:
# #Experiment params
# params = {'model__loss':['ls','lad','huber'],
#           'model__learning_rate':np.linspace(0.0001,0.2,3),
#           'model__n_estimators':np.linspace(50,150,3,dtype='int64'),
#           'model__criterion': ['friedman_mse','mae'],
#           'model__min_samples_split': np.linspace(2,10,3,dtype='int64'),
#           'model__min_samples_leaf':np.linspace(1,int(len(X_train) * (1/10)),3,dtype='int64').tolist(),
#           'model__max_depth':np.linspace(3,15,3,dtype='int64').tolist(),
#           'model__min_impurity_decrease':np.linspace(0,0.9,3),
#           'model__init':[None, RandomForestRegressor(random_state=41)] ,
#           'model__ccp_alpha':np.linspace(0,0.9,3).tolist(),
#           }

In [None]:
# prevscore = base
# encoders = [#OrdinalEncoder(),
#             OneHotEncoder()]
# base_estimator = RandomForestRegressor(random_state=41)
# est_list = [('rf1',RandomForestRegressor(random_state=1)),
#             ('rf2',RandomForestRegressor(random_state=42)),
#             ('rf3',RandomForestRegressor(random_state=93))]
# models = [
#           AdaBoostRegressor(base_estimator,random_state=41), 
#           GradientBoostingRegressor(random_state=41), 
#           VotingRegressor(est_list,n_jobs=-1),
#           XGBRegressor(random_state=41),
#           ]
# imputers = [SimpleImputer()
#             #,IterativeImputer()
#             ]
# scalers = [#StandardScaler(),
#            MinMaxScaler()]
# #Search for the baseline optimum model using 
# for model in models:
#   pipe = Pipeline([
#               ('encode',OneHotEncoder()),
#               ('impute', SimpleImputer()),
#               ('scale',MinMaxScaler()),
#               ('model',model)
#               ])
# #grid = GridSearchCV(pipe,param_grid =params,n_jobs=-1,cv = 3)
#   #For each model, fit and predict to get the MAE
#   pipe.fit(X_train,y_train)
#   y_pred = pipe.predict(X_val)
#   # grid.fit(X_train,y_train)
#   # y_pred = grid.predict(X_val)
#   score = MAE(y_val,y_pred)
#   print(score)
#   #If the score is a new high score
#   if score < prevscore:
#     prevscore = score
#     # print('encoder:\t{}\nimputer:\t{}\nscaler:\t\t{}\nmodel:\t{}\n'
#     #                                                       .format(str(encoder)[:10],
#     #                                                       str(imputer)[:10],
#     #                                                       str(scaler)[:10],
#     #                                                       str(model)[:10]))
#     #Store the best model
#     modelBest = pipe.named_steps['model']
# print(modelBest)

Best model:<br>
12.744609500492475
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=41,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [None]:
# #recreate the pipeline, predict and save
# y_pred = modelBest.predict(test)
# model_save(test,y_pred,str(score)[:6]+'.csv')

In [None]:
#Search space for the XGBRegressor that sits in the pipeline step named 'model'.
#Pipeline parameters are addressed as '<step>__<parameter>', so every key needs
#exactly one 'model__' prefix; the doubled 'model__model__' prefix used before
#does not name a parameter of the estimator.
default_float = np.linspace(0.1,1,3)
params = {'model__base_score': [base,0.5],
          'model__booster': ['gbtree','dart'],
          'model__colsample_bylevel': default_float,
          'model__colsample_bynode': default_float,
          'model__colsample_bytree': default_float,
          'model__gamma': np.linspace(0,1,3),
          'model__importance_type': ['gain','total_gain','cover'],
          'model__learning_rate': np.linspace(0.1,0.5,3),
          'model__max_delta_step': np.linspace(0,100,3),
          'model__max_depth': np.linspace(3,100,3,dtype='int64'),
          'model__min_child_weight': default_float,
          'model__n_estimators': np.linspace(75,500,3,dtype='int64'),
          'model__n_jobs': [-1],
          #NOTE(review): nthread looks like the older spelling of n_jobs - confirm against the installed xgboost
          'model__nthread': [-1],
          'model__objective': ['reg:squarederror'],
          'model__random_state': np.linspace(75,500,3,dtype='int64'),
          'model__reg_alpha': np.linspace(0,1,3),
          'model__reg_lambda': np.linspace(0,1,3),
          'model__scale_pos_weight': np.linspace(0,1,3),
          #NOTE(review): seed looks like the older spelling of random_state - confirm; searching both is likely redundant
          'model__seed': np.linspace(75,500,3,dtype='int64'),
          'model__subsample': default_float,
          }

In [None]:
#List the parameter names of the search object, if one has been created already
try:
  print(grid.get_params().keys())
except NameError:
  #grid does not exist until the search cell has been run; other errors should surface
  print('no grid')

no grid


In [None]:
#Remake pipeline with the best model
pipe = Pipeline([
              ('encode',OneHotEncoder()),
              ('impute', SimpleImputer()),
              ('scale',MinMaxScaler()),
              ('model',XGBRegressor())
              ])

#Initiate random search
#grid = RandomizedSearchCV(pipe,param_distributions =params,n_jobs=-1,cv = 5,random_state=91)
#Initiate grid search
grid = GridSearchCV(pipe,param_grid=params,n_jobs=-1,cv = 5)
grid.fit(X_train,y_train)
y_pred = grid.predict(X_val)
score = MAE(y_val,y_pred)
print(score)
if score > prevscore:
  print('you had a bad idea >:[')
else:
  y_pred = modelBest.predict(test)
  model_save(test,y_pred,str(score)[:6]+'.csv')

Raw data gives an MAE of 17.43

Baseline MAE with wrangled data is 0.99

Model MAE is 0.7. 
Pipeline:

('encode',OneHotEncoder(cols = ['city'],handle_unknown='indicator')),
('impute', SimpleImputer()),
('scale',StandardScaler()),
('model',DecisionTreeRegressor())

MAE of 0.48:
OneHotEncoder, SimpleImputer, RandomForestReg