In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, roc_auc_score, plot_roc_curve, accuracy_score

In [3]:
import xgboost as xgb
from xgboost import XGBClassifier

In [4]:
pd.set_option('display.max_columns', None)

In [5]:
df_train = pd.read_csv('../assets/processed_train.csv')
df_prediction = pd.read_csv('../assets/processed_test.csv')

In [6]:
df_train.drop(columns='Unnamed: 0', inplace = True, axis = 1)
df_prediction.drop(columns='Unnamed: 0', inplace = True, axis = 1)

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16950 entries, 0 to 16949
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             16950 non-null  object 
 1   Species          16950 non-null  object 
 2   Trap             16950 non-null  object 
 3   Latitude         16950 non-null  float64
 4   Longitude        16950 non-null  float64
 5   NumMosquitos     16950 non-null  int64  
 6   WnvPresent       16950 non-null  int64  
 7   Nearest_Station  16950 non-null  int64  
 8   Station          16950 non-null  int64  
 9   Tavg             16950 non-null  float64
 10  DewPoint         16950 non-null  int64  
 11  WetBulb          16950 non-null  float64
 12  PrecipTotal      16950 non-null  float64
 13  StnPressure      16950 non-null  float64
 14  AvgSpeed         16950 non-null  float64
 15  Year             16950 non-null  float64
 16  Month            16950 non-null  float64
 17  WeekofYear  

In [8]:
df_prediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232586 entries, 0 to 232585
Data columns (total 16 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Id           232586 non-null  int64  
 1   Date         232586 non-null  object 
 2   Species      232586 non-null  object 
 3   Trap         232586 non-null  object 
 4   Latitude     232586 non-null  float64
 5   Longitude    232586 non-null  float64
 6   Station      232586 non-null  int64  
 7   Tavg         232586 non-null  float64
 8   DewPoint     232586 non-null  int64  
 9   WetBulb      232586 non-null  float64
 10  PrecipTotal  232586 non-null  float64
 11  StnPressure  232586 non-null  float64
 12  AvgSpeed     232586 non-null  float64
 13  Year         232586 non-null  float64
 14  Month        232586 non-null  float64
 15  WeekofYear   232586 non-null  float64
dtypes: float64(10), int64(3), object(3)
memory usage: 28.4+ MB


In [9]:
# during the EDA portion the date has already been sprase out into 3 new columns 'Year', 'Month' & 'WeekofYear'
# Since the test set does not have NumMosquitos this is also drop on the train set
# Nearest_Station is also dropped as it was only use for EDA purpose
df_train.drop(columns=['Date', 'NumMosquitos', 'Nearest_Station'], inplace = True, axis = 1)

In [10]:
# date is also drop from the prediction set, there is and Id column which is drop since it is only used for kaggle score
df_prediction.drop(columns=['Date', 'Id'], inplace = True, axis = 1)

In [11]:
print(df_train.shape, df_prediction.shape)

(16950, 15) (232586, 14)


## One-Hot Encoding

In [12]:
df_train['Species'].nunique()

7

In [13]:
df_prediction['Species'].nunique()

8

In [14]:
df_train['Trap'].nunique()

136

In [15]:
df_prediction['Trap'].nunique()

149

Columns `Species` and `Trap` requires us to do do one-hot encoding however exploring the dataset, we can see that there is additional categorical information in `Species` and `Trap`, train set has `7 Species` category while prediction set has `8 Species` category. The same can be seen on `Trap` where train set has `136` vs `149` on the prediction set.  

Using a combined dataset I created the dummy variables and then split them back into the appropriate dataframes again `train` and `prediction`.

In [16]:
df_train_num = len(df_train)

# concat without the WnvPresent columns as that will be our target variable
df_combine = pd.concat(objs = [df_train.drop('WnvPresent', axis = 1), df_prediction], axis = 0)

# get dummy variables
df_combine = pd.get_dummies(df_combine)

# extract the train set out
df_train_final = df_combine[:df_train_num]

# extract the prediction set out
df_prediction_final = df_combine[df_train_num:]

Sanity check to ensure that the shape row and columns are correct.

In [17]:
df_train_final.shape

(16950, 169)

In [18]:
df_prediction_final.shape

(232586, 169)

In [19]:
# adding the WnvPresent column back into the train set so that we can do a split later
df_wnv = pd.DataFrame(df_train['WnvPresent'])
df_train_final = pd.concat(objs = [df_train_final, df_wnv], axis = 1)

In [20]:
df_train_final.shape

(16950, 170)

In [21]:
corrmat = df_train_final.corr()
columns = abs(corrmat['WnvPresent']).sort_values(ascending=False).head(41)
columns

WnvPresent                        1.000000
WeekofYear                        0.100561
Month                             0.097948
Species_CULEX PIPIENS             0.097868
DewPoint                          0.094676
Species_CULEX RESTUANS            0.094047
WetBulb                           0.086304
Trap_T900                         0.080944
Tavg                              0.078215
Longitude                         0.076732
Year                              0.043038
Species_CULEX TERRITANS           0.038609
Trap_T003                         0.036012
Trap_T086                         0.034420
Trap_T225                         0.034122
AvgSpeed                          0.033466
Trap_T143                         0.031773
Latitude                          0.030862
Trap_T115                         0.029565
Trap_T002                         0.028303
Trap_T223                         0.027643
Trap_T046                         0.026868
Trap_T006                         0.026601
Trap_T233  

In [22]:
X, y = df_train_final.drop(columns = 'WnvPresent'), df_train_final['WnvPresent']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = 0.2, random_state = 42)

In [24]:
X_train.head()

Unnamed: 0,Latitude,Longitude,Station,Tavg,DewPoint,WetBulb,PrecipTotal,StnPressure,AvgSpeed,Year,Month,WeekofYear,Species_CULEX ERRATICUS,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS,Species_UNSPECIFIED CULEX,Trap_T001,Trap_T002,Trap_T002A,Trap_T002B,Trap_T003,Trap_T004,Trap_T005,Trap_T006,Trap_T007,Trap_T008,Trap_T009,Trap_T011,Trap_T012,Trap_T013,Trap_T014,Trap_T015,Trap_T016,Trap_T017,Trap_T018,Trap_T019,Trap_T025,Trap_T027,Trap_T028,Trap_T030,Trap_T031,Trap_T033,Trap_T034,Trap_T035,Trap_T036,Trap_T037,Trap_T039,Trap_T040,Trap_T043,Trap_T044,Trap_T045,Trap_T046,Trap_T047,Trap_T048,Trap_T049,Trap_T050,Trap_T051,Trap_T054,Trap_T054C,Trap_T060,Trap_T061,Trap_T062,Trap_T063,Trap_T065,Trap_T065A,Trap_T066,Trap_T067,Trap_T069,Trap_T070,Trap_T071,Trap_T072,Trap_T073,Trap_T074,Trap_T075,Trap_T076,Trap_T077,Trap_T078,Trap_T079,Trap_T080,Trap_T081,Trap_T082,Trap_T083,Trap_T084,Trap_T085,Trap_T086,Trap_T088,Trap_T089,Trap_T090,Trap_T090A,Trap_T090B,Trap_T090C,Trap_T091,Trap_T092,Trap_T094,Trap_T094B,Trap_T095,Trap_T096,Trap_T097,Trap_T099,Trap_T100,Trap_T102,Trap_T103,Trap_T107,Trap_T114,Trap_T115,Trap_T128,Trap_T128A,Trap_T129,Trap_T135,Trap_T138,Trap_T141,Trap_T142,Trap_T143,Trap_T144,Trap_T145,Trap_T146,Trap_T147,Trap_T148,Trap_T149,Trap_T150,Trap_T151,Trap_T152,Trap_T153,Trap_T154,Trap_T155,Trap_T156,Trap_T157,Trap_T158,Trap_T159,Trap_T160,Trap_T161,Trap_T162,Trap_T200,Trap_T200A,Trap_T200B,Trap_T206,Trap_T209,Trap_T212,Trap_T215,Trap_T218,Trap_T218A,Trap_T218B,Trap_T218C,Trap_T219,Trap_T220,Trap_T221,Trap_T222,Trap_T223,Trap_T224,Trap_T225,Trap_T226,Trap_T227,Trap_T228,Trap_T229,Trap_T230,Trap_T231,Trap_T232,Trap_T233,Trap_T234,Trap_T235,Trap_T236,Trap_T237,Trap_T238,Trap_T900,Trap_T903
12459,41.738903,-87.695443,2,53.5,38,47.0,0.0,29.65,3.7,2011.0,9.0,37.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
835,41.867108,-87.654224,2,77.0,69,71.0,0.92,29.18,10.6,2007.0,7.0,29.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7445,41.986921,-87.689778,2,72.5,60,65.0,0.19,29.14,9.6,2009.0,7.0,30.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2504,41.911824,-87.726737,1,75.5,67,70.0,0.23,29.17,5.8,2007.0,8.0,33.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6156,41.754676,-87.612922,1,61.5,49,55.0,0.0,29.24,5.8,2009.0,6.0,24.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
# Initializing Classifiers
clf_lr  = LogisticRegression(solver='liblinear', random_state=0)
clf_knn = KNeighborsClassifier()
clf_rf  = RandomForestClassifier(random_state=0)
clf_svc = SVC(random_state=0)
clf_xgc = xgb.XGBClassifier(random_state=0)

# Building the model pipelines incl. preprocessing where needed 
# Note that the random forest and xgboost does not need feature scaling
pipe_lr  = Pipeline([('std', StandardScaler()),
                     ('clf_lr', clf_lr)])

pipe_svc = Pipeline([('std', StandardScaler()),
                     ('clf_svc', clf_svc)])

pipe_knn = Pipeline([('std', StandardScaler()),
                     ('clf_knn', clf_knn)])

# Setting up the parameter grids
param_grid_lr  = [{'clf_lr__penalty': ['l1', 'l2'],
                   'clf_lr__C': np.logspace(-4, 4, 9)}]

param_grid_knn = [{'clf_knn__n_neighbors': list(range(3, 7)),
                   'clf_knn__p': [1, 2],
                   'clf_knn__leaf_size': [5, 10, 15],
                   'clf_knn__weights': ['uniform', 'distance']}]

param_grid_rf  = [{'n_estimators': [10, 20, 50, 100, 150, 200],
                   'min_samples_leaf': [2, 5, 10],
                   'max_features': ['sqrt', 'log2']}]

param_grid_svc = [{'clf_svc__kernel': ['rbf'],
                   'clf_svc__C': np.logspace(-1, 3, 5),
                   'clf_svc__gamma': np.logspace(-4, 0, 4)[:3]}]

param_grid_xgc = [{'xgc__eval_metric' : ['auc'],
                   'xgc__subsample' : [0.5], 
                   'xgc__colsample_bytree' : [0.5], 
                   'xgc__learning_rate' : [0.1],
                   'xgc__max_depth' : [3], 
                   'xgc__reg_alpha' : [0, 1, 1.5],
                   'xgc__reg_lambda' : [1, 2, 5],
                   'xgc__gamma' : [0.01, 0.1, 3]}]

In [26]:
# Initialize functions for grid search, calling on above functions, and printing model metrics
def grid_search(classifier, params):
    gs = GridSearchCV(classifier, param_grid=params, cv=3, scoring='roc_auc', verbose=5)
    gs.fit(X_train, y_train)
    print(f'Train AUC Score: {round(gs.best_score_, 4)}')
    print(f'Using the following parameters: {gs.best_params_}')
    print('\n')
    return

In [28]:
def crossvalscore(classifer_best_params):
    model = classifer_best_params.fit(X_train, y_train)
    score = cross_val_score(classifer_best_params,
                            X=X_train,
                            y=y_train,
                            cv=5,
                            n_jobs=1,
                            scoring='roc_auc')
    print(score)
    print(f'AUC Mean Score {100*score.mean():.2f} +/- {100*score.std():.2f}')
    return

### Logistic Regression

In [319]:
grid_search(pipe_lr, param_grid_lr)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] clf_lr__C=0.0001, clf_lr__penalty=l1 ............................
[CV]  clf_lr__C=0.0001, clf_lr__penalty=l1, score=0.500, total=   0.2s
[CV] clf_lr__C=0.0001, clf_lr__penalty=l1 ............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  clf_lr__C=0.0001, clf_lr__penalty=l1, score=0.500, total=   0.1s
[CV] clf_lr__C=0.0001, clf_lr__penalty=l1 ............................
[CV]  clf_lr__C=0.0001, clf_lr__penalty=l1, score=0.500, total=   0.1s
[CV] clf_lr__C=0.0001, clf_lr__penalty=l2 ............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s


[CV]  clf_lr__C=0.0001, clf_lr__penalty=l2, score=0.798, total=   0.1s
[CV] clf_lr__C=0.0001, clf_lr__penalty=l2 ............................
[CV]  clf_lr__C=0.0001, clf_lr__penalty=l2, score=0.757, total=   0.1s
[CV] clf_lr__C=0.0001, clf_lr__penalty=l2 ............................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s


[CV]  clf_lr__C=0.0001, clf_lr__penalty=l2, score=0.773, total=   0.1s
[CV] clf_lr__C=0.001, clf_lr__penalty=l1 .............................
[CV] . clf_lr__C=0.001, clf_lr__penalty=l1, score=0.500, total=   0.1s
[CV] clf_lr__C=0.001, clf_lr__penalty=l1 .............................
[CV] . clf_lr__C=0.001, clf_lr__penalty=l1, score=0.500, total=   0.1s
[CV] clf_lr__C=0.001, clf_lr__penalty=l1 .............................
[CV] . clf_lr__C=0.001, clf_lr__penalty=l1, score=0.500, total=   0.1s
[CV] clf_lr__C=0.001, clf_lr__penalty=l2 .............................
[CV] . clf_lr__C=0.001, clf_lr__penalty=l2, score=0.800, total=   0.2s
[CV] clf_lr__C=0.001, clf_lr__penalty=l2 .............................
[CV] . clf_lr__C=0.001, clf_lr__penalty=l2, score=0.762, total=   0.1s
[CV] clf_lr__C=0.001, clf_lr__penalty=l2 .............................
[CV] . clf_lr__C=0.001, clf_lr__penalty=l2, score=0.773, total=   0.2s
[CV] clf_lr__C=0.01, clf_lr__penalty=l1 ..............................
[CV] .

  else:


[CV] .. clf_lr__C=10.0, clf_lr__penalty=l1, score=0.811, total=   6.3s
[CV] clf_lr__C=10.0, clf_lr__penalty=l1 ..............................
[CV] .. clf_lr__C=10.0, clf_lr__penalty=l1, score=0.777, total=   9.9s
[CV] clf_lr__C=10.0, clf_lr__penalty=l1 ..............................
[CV] .. clf_lr__C=10.0, clf_lr__penalty=l1, score=0.767, total=   7.0s
[CV] clf_lr__C=10.0, clf_lr__penalty=l2 ..............................
[CV] .. clf_lr__C=10.0, clf_lr__penalty=l2, score=0.810, total=   1.0s
[CV] clf_lr__C=10.0, clf_lr__penalty=l2 ..............................
[CV] .. clf_lr__C=10.0, clf_lr__penalty=l2, score=0.776, total=   0.8s
[CV] clf_lr__C=10.0, clf_lr__penalty=l2 ..............................
[CV] .. clf_lr__C=10.0, clf_lr__penalty=l2, score=0.768, total=   0.9s
[CV] clf_lr__C=100.0, clf_lr__penalty=l1 .............................


  else:


[CV] . clf_lr__C=100.0, clf_lr__penalty=l1, score=0.810, total=  16.4s
[CV] clf_lr__C=100.0, clf_lr__penalty=l1 .............................


  else:


[CV] . clf_lr__C=100.0, clf_lr__penalty=l1, score=0.776, total=  13.2s
[CV] clf_lr__C=100.0, clf_lr__penalty=l1 .............................


  else:


[CV] . clf_lr__C=100.0, clf_lr__penalty=l1, score=0.767, total=   9.6s
[CV] clf_lr__C=100.0, clf_lr__penalty=l2 .............................
[CV] . clf_lr__C=100.0, clf_lr__penalty=l2, score=0.809, total=   1.3s
[CV] clf_lr__C=100.0, clf_lr__penalty=l2 .............................
[CV] . clf_lr__C=100.0, clf_lr__penalty=l2, score=0.776, total=   1.2s
[CV] clf_lr__C=100.0, clf_lr__penalty=l2 .............................
[CV] . clf_lr__C=100.0, clf_lr__penalty=l2, score=0.767, total=   1.1s
[CV] clf_lr__C=1000.0, clf_lr__penalty=l1 ............................


  else:


[CV]  clf_lr__C=1000.0, clf_lr__penalty=l1, score=0.810, total=   4.2s
[CV] clf_lr__C=1000.0, clf_lr__penalty=l1 ............................


  else:


[CV]  clf_lr__C=1000.0, clf_lr__penalty=l1, score=0.776, total=  10.0s
[CV] clf_lr__C=1000.0, clf_lr__penalty=l1 ............................


  else:


[CV]  clf_lr__C=1000.0, clf_lr__penalty=l1, score=0.767, total=  11.3s
[CV] clf_lr__C=1000.0, clf_lr__penalty=l2 ............................
[CV]  clf_lr__C=1000.0, clf_lr__penalty=l2, score=0.809, total=   1.7s
[CV] clf_lr__C=1000.0, clf_lr__penalty=l2 ............................
[CV]  clf_lr__C=1000.0, clf_lr__penalty=l2, score=0.776, total=   1.3s
[CV] clf_lr__C=1000.0, clf_lr__penalty=l2 ............................
[CV]  clf_lr__C=1000.0, clf_lr__penalty=l2, score=0.767, total=   1.3s
[CV] clf_lr__C=10000.0, clf_lr__penalty=l1 ...........................


  else:


[CV]  clf_lr__C=10000.0, clf_lr__penalty=l1, score=0.809, total=  20.0s
[CV] clf_lr__C=10000.0, clf_lr__penalty=l1 ...........................


  else:


[CV]  clf_lr__C=10000.0, clf_lr__penalty=l1, score=0.776, total=  11.8s
[CV] clf_lr__C=10000.0, clf_lr__penalty=l1 ...........................


  else:


[CV]  clf_lr__C=10000.0, clf_lr__penalty=l1, score=0.767, total=  12.1s
[CV] clf_lr__C=10000.0, clf_lr__penalty=l2 ...........................
[CV]  clf_lr__C=10000.0, clf_lr__penalty=l2, score=0.809, total=   1.7s
[CV] clf_lr__C=10000.0, clf_lr__penalty=l2 ...........................
[CV]  clf_lr__C=10000.0, clf_lr__penalty=l2, score=0.776, total=   1.3s
[CV] clf_lr__C=10000.0, clf_lr__penalty=l2 ...........................
[CV]  clf_lr__C=10000.0, clf_lr__penalty=l2, score=0.767, total=   1.2s


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  2.6min finished


Results from: 
Train AUC Score: 0.7881
Using the following parameters: {'clf_lr__C': 0.1, 'clf_lr__penalty': 'l2'}




In [323]:
clf_lr_best = LogisticRegression(solver='liblinear', random_state=0, C=0.1, penalty='l2')

In [322]:
crossvalscore(clf_lr_best)

[0.80779744 0.73483595 0.74259281 0.73614388 0.77209691]
AUC Mean Score 75.87 +/- 2.80


### K-nearest Neighbors

In [327]:
grid_search(pipe_knn, param_grid_knn)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=1, clf_knn__weights=uniform 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=1, clf_knn__weights=uniform, score=0.797, total=   6.5s
[CV] clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=1, clf_knn__weights=uniform 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.4s remaining:    0.0s


[CV]  clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=1, clf_knn__weights=uniform, score=0.769, total=   6.3s
[CV] clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=1, clf_knn__weights=uniform 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   12.7s remaining:    0.0s


[CV]  clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=1, clf_knn__weights=uniform, score=0.801, total=   6.4s
[CV] clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=1, clf_knn__weights=distance 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   19.1s remaining:    0.0s


[CV]  clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=1, clf_knn__weights=distance, score=0.806, total=   6.6s
[CV] clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=1, clf_knn__weights=distance 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   25.7s remaining:    0.0s


[CV]  clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=1, clf_knn__weights=distance, score=0.775, total=   6.8s
[CV] clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=1, clf_knn__weights=distance 
[CV]  clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=1, clf_knn__weights=distance, score=0.809, total=   6.6s
[CV] clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=2, clf_knn__weights=uniform 
[CV]  clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=2, clf_knn__weights=uniform, score=0.739, total=   8.0s
[CV] clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=2, clf_knn__weights=uniform 
[CV]  clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=2, clf_knn__weights=uniform, score=0.703, total=   8.0s
[CV] clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=2, clf_knn__weights=uniform 
[CV]  clf_knn__leaf_size=5, clf_knn__n_neighbors=3, clf_knn__p=2, clf_knn__weights=uniform, score=0.750, total=   7.7s
[CV] clf_knn__leaf_size=5, clf_knn__n_

[CV]  clf_knn__leaf_size=5, clf_knn__n_neighbors=6, clf_knn__p=2, clf_knn__weights=uniform, score=0.754, total=   9.0s
[CV] clf_knn__leaf_size=5, clf_knn__n_neighbors=6, clf_knn__p=2, clf_knn__weights=uniform 
[CV]  clf_knn__leaf_size=5, clf_knn__n_neighbors=6, clf_knn__p=2, clf_knn__weights=uniform, score=0.782, total=   9.1s
[CV] clf_knn__leaf_size=5, clf_knn__n_neighbors=6, clf_knn__p=2, clf_knn__weights=distance 
[CV]  clf_knn__leaf_size=5, clf_knn__n_neighbors=6, clf_knn__p=2, clf_knn__weights=distance, score=0.796, total=   9.1s
[CV] clf_knn__leaf_size=5, clf_knn__n_neighbors=6, clf_knn__p=2, clf_knn__weights=distance 
[CV]  clf_knn__leaf_size=5, clf_knn__n_neighbors=6, clf_knn__p=2, clf_knn__weights=distance, score=0.757, total=   9.1s
[CV] clf_knn__leaf_size=5, clf_knn__n_neighbors=6, clf_knn__p=2, clf_knn__weights=distance 
[CV]  clf_knn__leaf_size=5, clf_knn__n_neighbors=6, clf_knn__p=2, clf_knn__weights=distance, score=0.792, total=   9.4s
[CV] clf_knn__leaf_size=10, clf_knn

[CV]  clf_knn__leaf_size=10, clf_knn__n_neighbors=5, clf_knn__p=2, clf_knn__weights=distance, score=0.745, total=   7.5s
[CV] clf_knn__leaf_size=10, clf_knn__n_neighbors=5, clf_knn__p=2, clf_knn__weights=distance 
[CV]  clf_knn__leaf_size=10, clf_knn__n_neighbors=5, clf_knn__p=2, clf_knn__weights=distance, score=0.786, total=   7.7s
[CV] clf_knn__leaf_size=10, clf_knn__n_neighbors=6, clf_knn__p=1, clf_knn__weights=uniform 
[CV]  clf_knn__leaf_size=10, clf_knn__n_neighbors=6, clf_knn__p=1, clf_knn__weights=uniform, score=0.826, total=   8.0s
[CV] clf_knn__leaf_size=10, clf_knn__n_neighbors=6, clf_knn__p=1, clf_knn__weights=uniform 
[CV]  clf_knn__leaf_size=10, clf_knn__n_neighbors=6, clf_knn__p=1, clf_knn__weights=uniform, score=0.791, total=   7.5s
[CV] clf_knn__leaf_size=10, clf_knn__n_neighbors=6, clf_knn__p=1, clf_knn__weights=uniform 
[CV]  clf_knn__leaf_size=10, clf_knn__n_neighbors=6, clf_knn__p=1, clf_knn__weights=uniform, score=0.799, total=   7.3s
[CV] clf_knn__leaf_size=10, c

[CV]  clf_knn__leaf_size=15, clf_knn__n_neighbors=5, clf_knn__p=1, clf_knn__weights=uniform, score=0.785, total=   6.9s
[CV] clf_knn__leaf_size=15, clf_knn__n_neighbors=5, clf_knn__p=1, clf_knn__weights=uniform 
[CV]  clf_knn__leaf_size=15, clf_knn__n_neighbors=5, clf_knn__p=1, clf_knn__weights=uniform, score=0.800, total=   6.9s
[CV] clf_knn__leaf_size=15, clf_knn__n_neighbors=5, clf_knn__p=1, clf_knn__weights=distance 
[CV]  clf_knn__leaf_size=15, clf_knn__n_neighbors=5, clf_knn__p=1, clf_knn__weights=distance, score=0.845, total=   6.8s
[CV] clf_knn__leaf_size=15, clf_knn__n_neighbors=5, clf_knn__p=1, clf_knn__weights=distance 
[CV]  clf_knn__leaf_size=15, clf_knn__n_neighbors=5, clf_knn__p=1, clf_knn__weights=distance, score=0.797, total=   7.1s
[CV] clf_knn__leaf_size=15, clf_knn__n_neighbors=5, clf_knn__p=1, clf_knn__weights=distance 
[CV]  clf_knn__leaf_size=15, clf_knn__n_neighbors=5, clf_knn__p=1, clf_knn__weights=distance, score=0.819, total=   7.3s
[CV] clf_knn__leaf_size=15

[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed: 17.9min finished


Results from: 
Train AUC Score: 0.823
Using the following parameters: {'clf_knn__leaf_size': 5, 'clf_knn__n_neighbors': 6, 'clf_knn__p': 1, 'clf_knn__weights': 'distance'}




In [332]:
clf_knn_best = KNeighborsClassifier(n_neighbors=6, leaf_size=5, p=1, weights='distance')

In [333]:
crossvalscore(clf_knn_best)

[0.7449818  0.74224581 0.74867872 0.74579859 0.72543528]
AUC Mean Score 74.14 +/- 0.83


### Random Forest

clf_lr  = LogisticRegression(solver='liblinear', random_state=0)
clf_knn = KNeighborsClassifier()
clf_rf  = RandomForestClassifier(random_state=0)
clf_svc = SVC(random_state=0)
clf_xgc = xgb.XGBClassifier(random_state=0)


In [334]:
grid_search(clf_rf, param_grid_rf)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] max_features=sqrt, min_samples_leaf=2, n_estimators=10 ..........
[CV]  max_features=sqrt, min_samples_leaf=2, n_estimators=10, score=0.891, total=   0.2s
[CV] max_features=sqrt, min_samples_leaf=2, n_estimators=10 ..........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  max_features=sqrt, min_samples_leaf=2, n_estimators=10, score=0.852, total=   0.2s
[CV] max_features=sqrt, min_samples_leaf=2, n_estimators=10 ..........
[CV]  max_features=sqrt, min_samples_leaf=2, n_estimators=10, score=0.849, total=   0.1s
[CV] max_features=sqrt, min_samples_leaf=2, n_estimators=20 ..........


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.4s remaining:    0.0s


[CV]  max_features=sqrt, min_samples_leaf=2, n_estimators=20, score=0.903, total=   0.3s
[CV] max_features=sqrt, min_samples_leaf=2, n_estimators=20 ..........


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.6s remaining:    0.0s


[CV]  max_features=sqrt, min_samples_leaf=2, n_estimators=20, score=0.867, total=   0.3s
[CV] max_features=sqrt, min_samples_leaf=2, n_estimators=20 ..........
[CV]  max_features=sqrt, min_samples_leaf=2, n_estimators=20, score=0.866, total=   0.3s
[CV] max_features=sqrt, min_samples_leaf=2, n_estimators=50 ..........
[CV]  max_features=sqrt, min_samples_leaf=2, n_estimators=50, score=0.907, total=   0.6s
[CV] max_features=sqrt, min_samples_leaf=2, n_estimators=50 ..........
[CV]  max_features=sqrt, min_samples_leaf=2, n_estimators=50, score=0.880, total=   0.6s
[CV] max_features=sqrt, min_samples_leaf=2, n_estimators=50 ..........
[CV]  max_features=sqrt, min_samples_leaf=2, n_estimators=50, score=0.881, total=   0.6s
[CV] max_features=sqrt, min_samples_leaf=2, n_estimators=100 .........
[CV]  max_features=sqrt, min_samples_leaf=2, n_estimators=100, score=0.909, total=   1.3s
[CV] max_features=sqrt, min_samples_leaf=2, n_estimators=100 .........
[CV]  max_features=sqrt, min_samples_le

[CV]  max_features=log2, min_samples_leaf=2, n_estimators=10, score=0.854, total=   0.1s
[CV] max_features=log2, min_samples_leaf=2, n_estimators=10 ..........
[CV]  max_features=log2, min_samples_leaf=2, n_estimators=10, score=0.858, total=   0.1s
[CV] max_features=log2, min_samples_leaf=2, n_estimators=20 ..........
[CV]  max_features=log2, min_samples_leaf=2, n_estimators=20, score=0.900, total=   0.2s
[CV] max_features=log2, min_samples_leaf=2, n_estimators=20 ..........
[CV]  max_features=log2, min_samples_leaf=2, n_estimators=20, score=0.868, total=   0.2s
[CV] max_features=log2, min_samples_leaf=2, n_estimators=20 ..........
[CV]  max_features=log2, min_samples_leaf=2, n_estimators=20, score=0.872, total=   0.2s
[CV] max_features=log2, min_samples_leaf=2, n_estimators=50 ..........
[CV]  max_features=log2, min_samples_leaf=2, n_estimators=50, score=0.904, total=   0.5s
[CV] max_features=log2, min_samples_leaf=2, n_estimators=50 ..........
[CV]  max_features=log2, min_samples_lea

[CV]  max_features=log2, min_samples_leaf=10, n_estimators=200, score=0.837, total=   1.4s
[CV] max_features=log2, min_samples_leaf=10, n_estimators=200 ........
[CV]  max_features=log2, min_samples_leaf=10, n_estimators=200, score=0.857, total=   1.4s


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  1.5min finished


Results from: 
Train AUC Score: 0.895
Using the following parameters: {'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 200}




In [60]:
clf_rf_best = RandomForestClassifier(max_features='sqrt', min_samples_leaf=2, n_estimators=200, random_state = 0)

In [336]:
crossvalscore(clf_rf_best)

[0.91821128 0.90743281 0.89757258 0.90559903 0.91742319]
AUC Mean Score 90.92 +/- 0.77


### Support Vector Classification

In [39]:
grid_search(pipe_svc, param_grid_svc)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] clf_svc__C=0.1, clf_svc__gamma=0.0001, clf_svc__kernel=rbf ......


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf_svc__C=0.1, clf_svc__gamma=0.0001, clf_svc__kernel=rbf, score=0.611, total=   3.0s
[CV] clf_svc__C=0.1, clf_svc__gamma=0.0001, clf_svc__kernel=rbf ......


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s


[CV]  clf_svc__C=0.1, clf_svc__gamma=0.0001, clf_svc__kernel=rbf, score=0.632, total=   2.9s
[CV] clf_svc__C=0.1, clf_svc__gamma=0.0001, clf_svc__kernel=rbf ......


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.9s remaining:    0.0s


KeyboardInterrupt: 

In [30]:
clf_svc_best = SVC(random_state=0, C=1000, gamma=0.002154, kernel='rbf')

In [32]:
crossvalscore(clf_svc_best)

[0.88832093 0.88864925 0.89896059 0.88713311 0.88666905]
AUC Mean Score 88.99 +/- 0.46


### XGBoost

In [45]:
grid_search(clf_xgc, param_grid_xgc)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=1, xgc__subsample=0.5 
Parameters: { xgc__colsample_bytree, xgc__eval_metric, xgc__gamma, xgc__learning_rate, xgc__max_depth, xgc__reg_alpha, xgc__reg_lambda, xgc__subsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=1, xgc__subsample=0.5, score=0.916, total=   2.3s
[CV] xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=1, xgc__subsample=0.5 
Parameters: { xgc__colsample_bytree, xgc__eval_metric, xgc__gamma, xgc__learning_rate, xgc__max_depth, xgc__reg_alpha, xgc__reg_lambda, xgc__subsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s


[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=1, xgc__subsample=0.5, score=0.890, total=   2.4s
[CV] xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=1, xgc__subsample=0.5 
Parameters: { xgc__colsample_bytree, xgc__eval_metric, xgc__gamma, xgc__learning_rate, xgc__max_depth, xgc__reg_alpha, xgc__reg_lambda, xgc__subsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.6s remaining:    0.0s


[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=1, xgc__subsample=0.5, score=0.892, total=   2.5s
[CV] xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=2, xgc__subsample=0.5 
Parameters: { xgc__colsample_bytree, xgc__eval_metric, xgc__gamma, xgc__learning_rate, xgc__max_depth, xgc__reg_alpha, xgc__reg_lambda, xgc__subsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    7.1s remaining:    0.0s


[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=2, xgc__subsample=0.5, score=0.916, total=   2.7s
[CV] xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=2, xgc__subsample=0.5 
Parameters: { xgc__colsample_bytree, xgc__eval_metric, xgc__gamma, xgc__learning_rate, xgc__max_depth, xgc__reg_alpha, xgc__reg_lambda, xgc__subsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    9.7s remaining:    0.0s


[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=2, xgc__subsample=0.5, score=0.890, total=   2.8s
[CV] xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=2, xgc__subsample=0.5 
Parameters: { xgc__colsample_bytree, xgc__eval_metric, xgc__gamma, xgc__learning_rate, xgc__max_depth, xgc__reg_alpha, xgc__reg_lambda, xgc__subsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=2, xgc__subsample=0.5, score=0.892, total=   2.9s
[CV] xgc__colsample_by

[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1, xgc__reg_lambda=2, xgc__subsample=0.5, score=0.892, total=   3.4s
[CV] xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1, xgc__reg_lambda=5, xgc__subsample=0.5 
Parameters: { xgc__colsample_bytree, xgc__eval_metric, xgc__gamma, xgc__learning_rate, xgc__max_depth, xgc__reg_alpha, xgc__reg_lambda, xgc__subsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1, xgc__reg_lambda=5, xgc__subsample=0.5, score=0.916, total=   3.7s
[CV] xgc__colsample_by

[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1.5, xgc__reg_lambda=5, xgc__subsample=0.5, score=0.916, total=   3.4s
[CV] xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1.5, xgc__reg_lambda=5, xgc__subsample=0.5 
Parameters: { xgc__colsample_bytree, xgc__eval_metric, xgc__gamma, xgc__learning_rate, xgc__max_depth, xgc__reg_alpha, xgc__reg_lambda, xgc__subsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.01, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1.5, xgc__reg_lambda=5, xgc__subsample=0.5, score=0.890, total=   3.5s
[CV] xgc__colsam

[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.1, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=5, xgc__subsample=0.5, score=0.890, total=   3.2s
[CV] xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.1, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=5, xgc__subsample=0.5 
Parameters: { xgc__colsample_bytree, xgc__eval_metric, xgc__gamma, xgc__learning_rate, xgc__max_depth, xgc__reg_alpha, xgc__reg_lambda, xgc__subsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.1, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=5, xgc__subsample=0.5, score=0.892, total=   3.4s
[CV] xgc__colsample_bytre

[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.1, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1, xgc__reg_lambda=5, xgc__subsample=0.5, score=0.892, total=   3.5s
[CV] xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.1, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1.5, xgc__reg_lambda=1, xgc__subsample=0.5 
Parameters: { xgc__colsample_bytree, xgc__eval_metric, xgc__gamma, xgc__learning_rate, xgc__max_depth, xgc__reg_alpha, xgc__reg_lambda, xgc__subsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=0.1, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1.5, xgc__reg_lambda=1, xgc__subsample=0.5, score=0.916, total=   3.9s
[CV] xgc__colsample_b

[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=3, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=1, xgc__subsample=0.5, score=0.916, total=   3.5s
[CV] xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=3, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=1, xgc__subsample=0.5 
Parameters: { xgc__colsample_bytree, xgc__eval_metric, xgc__gamma, xgc__learning_rate, xgc__max_depth, xgc__reg_alpha, xgc__reg_lambda, xgc__subsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=3, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=0, xgc__reg_lambda=1, xgc__subsample=0.5, score=0.890, total=   4.2s
[CV] xgc__colsample_bytree=0.5,

[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=3, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1, xgc__reg_lambda=1, xgc__subsample=0.5, score=0.890, total=   3.8s
[CV] xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=3, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1, xgc__reg_lambda=1, xgc__subsample=0.5 
Parameters: { xgc__colsample_bytree, xgc__eval_metric, xgc__gamma, xgc__learning_rate, xgc__max_depth, xgc__reg_alpha, xgc__reg_lambda, xgc__subsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=3, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1, xgc__reg_lambda=1, xgc__subsample=0.5, score=0.892, total=   3.6s
[CV] xgc__colsample_bytree=0.5,

[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=3, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1.5, xgc__reg_lambda=1, xgc__subsample=0.5, score=0.892, total=   3.5s
[CV] xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=3, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1.5, xgc__reg_lambda=2, xgc__subsample=0.5 
Parameters: { xgc__colsample_bytree, xgc__eval_metric, xgc__gamma, xgc__learning_rate, xgc__max_depth, xgc__reg_alpha, xgc__reg_lambda, xgc__subsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  xgc__colsample_bytree=0.5, xgc__eval_metric=auc, xgc__gamma=3, xgc__learning_rate=0.1, xgc__max_depth=3, xgc__reg_alpha=1.5, xgc__reg_lambda=2, xgc__subsample=0.5, score=0.916, total=   3.6s
[CV] xgc__colsample_bytre

[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:  4.5min finished


Train AUC Score: 0.8996
Using the following parameters: {'xgc__colsample_bytree': 0.5, 'xgc__eval_metric': 'auc', 'xgc__gamma': 0.01, 'xgc__learning_rate': 0.1, 'xgc__max_depth': 3, 'xgc__reg_alpha': 0, 'xgc__reg_lambda': 1, 'xgc__subsample': 0.5}




In [42]:
clf_xgc_best = xgb.XGBClassifier(random_state = 0,
                                 colsample_bytree=0.5,
                                 eval_metric='auc',
                                 gamma=0.01,
                                 learning_rate=0.1,
                                 max_depth=3,
                                 reg_alpha=0,
                                 reg_lambda=1,
                                 subsample=0.5)

In [43]:
crossvalscore(clf_xgc_best)

[0.88447186 0.85751236 0.85267166 0.84720635 0.88202517]
AUC Mean Score 86.48 +/- 1.54


In [None]:
LR
[0.80779744 0.73483595 0.74259281 0.73614388 0.77209691]
AUC Mean Score 75.87 +/- 2.80

K Nearest Neighbor
[0.7449818  0.74224581 0.74867872 0.74579859 0.72543528]
AUC Mean Score 74.14 +/- 0.83

Random Forest
[0.91821128 0.90743281 0.89757258 0.90559903 0.91742319]
AUC Mean Score 90.92 +/- 0.77

SVM
[0.88832093 0.88864925 0.89896059 0.88713311 0.88666905]
AUC Mean Score 88.99 +/- 0.46

XGBoost
[0.88447186 0.85751236 0.85267166 0.84720635 0.88202517]
AUC Mean Score 86.48 +/- 1.54

### Learning Algorithm Selection

- Logistic Regression ROC_AUC Mean Score(std): `75.87 +/- 2.80`
- K Nearest Neighbor ROC_AUC Mean Score(std): `74.14 +/- 0.83`
- Random Forest ROC_AUC Mean Score(std): `90.92 +/- 0.77`
- Support Vector Classification ROC_AUC Mean Score(std): `88.99 +/- 0.46`
- XGBoost ROC_AUC Mean Score(std): `86.48 +/- 1.54`

Using `GridSearchCV` followed by `cross_val_score` we were able to tune the respective classifier with the optimised hyperparameters and then check if the learning algorithm is capable of yielding high mean score and low variances as we would want the selected learning algorithm to be capable of producing similar performance on unseen data. In this case it will be the test set provided by kaggle. 

By comparing the above scores, the final learning algorithm that is choosen is Support Vector Classification. Using the best parameters selected `C = 1000`, `gamma = 0.002154`(rounded to 6 decimal places) and `kernel = 'rbf`.

In [30]:
best_algo = SVC(random_state=0, C=1000, gamma=0.002154, kernel='rbf', probability=True)

In [31]:
best_algo.fit(X_train, y_train)
train_auc = roc_auc_score(y_true=y_train, y_score=best_algo.predict_proba(X_train)[:,1])
test_auc = roc_auc_score(y_true=y_test, y_score=best_algo.predict_proba(X_test)[:,1])

In [39]:
print(f'Training ROC AUC: {100*train_auc:.2f}')
print(f'Test ROC AUC: {100*test_auc:.2f}')

Training ROC AUC: 96.52
Test ROC AUC: 92.33


### Final Model

In [35]:
# combine both train test split to do a final train and then evaluate against kaggle and check score!
# final model for kaggle prediction
X_final, y_final = df_train_final.drop(columns='WnvPresent', axis=1), df_train_final['WnvPresent']

In [36]:
best_algo.fit(X_final, y_final)

SVC(C=1000, gamma=0.002154, probability=True, random_state=0)

In [37]:
final_auc = roc_auc_score(y_true=y_final, y_score=best_algo.predict_proba(X_final)[:,1])

In [38]:
final_auc

0.9620726860188209

### Kaggle Submissions

In [126]:
df_prediction_final = df_prediction_final[df_prediction_final['Station'] != 2]

In [128]:
kaggle_preds = best_algo.predict_proba(df_prediction_final)[:,1]

In [115]:
processed_test = pd.read_csv('../assets/processed_test.csv')

In [131]:
processed_test = processed_test[processed_test['Station'] != 2]

In [132]:
output_file = pd.DataFrame({'Id': processed_test.Id, 'WnvPresent': kaggle_preds})

In [133]:
output_file.to_csv('../assets/kaggle_submissions.csv', index=False)

In [134]:
output_file.shape

(116293, 2)

## Cost-Benefit Analysis of Spray operations

### Cost of spraying operations

The city of Chicago started <a href = "https://chicago.cbslocal.com/2017/08/30/spray-mosquitoes-far-south-side-west-nile-prevention/"> spraying operations </a> for the first time in the first week of September 2017 in areas of the Pullman and South Deering neighborhoods. The operation starts at dusk (around 6.49 pm according to the weather dataset) and lasts till 1 a.m, making it an average of about 5 hours of spraying. The spraying is done by licensed mosquito abatement technicians in trucks dispensing an ultra-low-volume spray. The main chemical used is <a href = "https://www.cmmcp.org/pesticide-information/pages/zenivex-e4-etofenprox">Zenivex™ E4 </a>, which is  4% solution of a chemical known as Etofenprox, a reduced risk synthetic pyrethroid with an extremely low toxicity to mammals. It is sprayed from a truck at 4.5-9.0 ounces per minute, at a vehicle speed of 10-15 mph (16 to 24 kmh). The cost of Zenivex™ E4 is about USD \$80 per gallon according to <a href = "http://www.gfmosquito.com/wp-content/uploads/2013/06/2013-North-Dakota-Bid-Tabulation.pdf"> tender information in North Dakota </a>.

<img src="http://www.meepi.org/wnv/graphics/trucksprayjpg.jpg" alt="ULV Sprayer in the City of Chicago" width="400">

Assuming each truck travels at the lowest speed of 16 kmh, a single truck can spray approximately (16 km $\times$ 0.003 km $\times$ 5 hours) in a single night, covering 0.24 km<sup>2</sup>. We assume there is no overlap in the spray area. If each truck were to spray an average of 6.75 ounces (0.05 gallon) per minute. Each truck would spray 3 gallons of Zenivex™ E4 per hour and hence 15 gallons a night. The total cost of Zenivex™ E4 for a truck for a night would hence be USD 80 per gallon $\times$ 15 gallons = **USD 1,200** covering **0.24 km<sup>2</sup>**. The cost to spray **1 km<sup>2</sup>** will be approximately **USD 5,000**.

If each truck were to travel at the highest speed of 24 kmh, a single truck can spray approximately (24 km $\times$ 0.003 km $\times$ 5 hours) in a single night, covering 0.36 km<sup>2</sup>. Assuming the cost for Zenivex™ E4 per truck for the entire night is the same as above, we would spend **USD 1,200** covering 0.36 km<sup>2</sup> per truck. The cost to spray **1 km<sup>2</sup>** will be approximately **USD 3,333**.

Considering the city of Chicago has a total areas of 606.1 km<sup>2</sup>, it would cost between **USD 2.02 - USD 3.03 million** to cover the entire city assuming there are enough trucks and fuel costs and worker salaries are negligible with almost no wastage of chemicals. 





### Benefits of spraying operations

There are multiple benefits from a reduced mosquito population as a result of spraying. These include an increased quality of life from fewer people falling sick and dying, increased workplace productivity from fewer people falling ill and going on medical leave, as well as savings in hospital expenses from treating WNV patients. Of these, only the latter two are measurable.

About 1 in 5 people infected with WNV develop the highly incapacitative <a href = "https://www.uptodate.com/contents/west-nile-virus-infection-beyond-the-basics">West Nile fever </a> with other symptoms such as headache, body aches, joint pains, vomiting, etc. Recovery from West Nile fever takes from a few days to several weeks, and prolonged fatigue is common.

About 1 in 150 people infected develop severe neuroinvasive diseases such as encephalitis or meningitis, in which the virus travels through the blood and infects the brain and spinal cord. Recovery is prolonged and less than 40% of patients with the severe diseases recover after one year.

Given that the <a href="https://datausa.io/profile/geo/chicago-il/">median household income</a> in Chicago was \\$55,295 (as of 2017), one can estimate the amount of losses the city will face from a workforce affected by WNV. 

In 2017, there were <a href="https://chicago.cbslocal.com/2018/08/29/west-nile-virus-death-reported-in-illinois/"> 90 WNV cases, including 8 deaths </a>. This means that approximately 18 people developed West Nile fever. Assuming all were working adults and each took two weeks off work to recover, this would have resulted in a total income loss of \\$38,281 in total. On average, each WNV patient spends approximately \\$25,000 in the hospital. Therefore the total monetary loss caused by WNV in 2017 is approximately **\\$488,281**.

## Summary and recomendations of Cost-Benefit Analysis

---

Examination of the the total costs of spraying the whole of Chicago compared against the benefits show that the costs far outweigh the benefits in monetary terms. At best, accounting for inflation or even a pessimistic outcome of having 50% more WNV infections, the total monetary benefit to Chicago as a society may only be 16%-25% of the total cost of spraying.

However, our model does not take into account any non-monetary benefits to reducing the mosquito population. These include the emotional costs from loss of life, the reduction in the need for enhanced testing for suspected WNV cases and public confidence in the government.

From previous geospatial analysis of spray data, there is a distinct lack of evidence to support the claim that mosquito spraying had any effect on the reduction of WNV-infected mosquitos. Furthermore, the spray data pointed towards highly fragmented and haphazardous spraying operations that did not seem to be driven by the evidence if the presence and severity of WNV mosquito infestations. Traps such as the T900 trap at O'Hare International airport which proved to capture the most WNV-infected mosquitos by far were not sprayed. 

Given the high costs required to conduct spraying operations, we hence recommend the following action points:

1. Re-examine the effectiveness of spraying Zenivex™ E4 as a means to control the mosquito population. Evidence points towards the ineffectiveness of the chemical and it is likely that other kinds of non-toxic mosquito sprays should be explored.

2. Re-direct mosquito spraying operations in a more organised and evidence-driven manner whereby severe hotspots such as O'Hare International Airport are sprayed first at the beginning of summer in order to prevent large populations of mosquitos forming. In addition, spraying operations should be accurately logged and routes planned to make sure mosquito breeding sites are properly covered.

3. Examine new ways of controlling the mosquito population that may arguably cost less than spraying the whole of Chicago. Innovative ways of doing so may include 'anti-mosquit' campaigns done in places such as Singapore or 