In [1]:
#standard ds imports
import pandas as pd
import numpy as np
#viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# .py imports
#import wranglerer as wr
#import modeling as md
import os
#sklearn imports
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix,balanced_accuracy_score,roc_auc_score,make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#CATboost imports
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import explore_r as ex
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import xgboost as xgb

### This notebook builds upon the previous notebook (model_r_round3) and will use an ultra slim version of the original df...cols= wind, ou, is_outdoor, is_turf

# OK, enough of that...
## Let's work with a hi-speed,low-drag version of the df
### Cols: wind, ou, is_outdoor, is_turf, abnormal_start, playoff

In [2]:
df = pd.read_csv('prepped_data_stad.csv')

In [3]:
df.columns

Index(['date', 'day_of_week', 'start_time', 'week_num', 'home_score',
       'home_wins', 'away_score', 'away_wins', 'stadium', 'temp', 'humidity',
       'wind', 'spread', 'ou', 'is_under', 'abnormal_start', 'total_scores',
       'is_playoff', 'playoff_implications', 'is_turf', 'is_outdoor'],
      dtype='object')

In [4]:
df = df.drop(columns=['date','home_score',
       'home_wins', 'away_score', 'away_wins','total_scores',
                     'day_of_week','start_time', 'week_num','stadium', 'temp', 'humidity','spread',
                      'playoff_implications','abnormal_start','is_playoff'])
#df['spread'] = abs(df['spread'])

In [5]:
df.head()

Unnamed: 0,wind,ou,is_under,is_turf,is_outdoor
0,0,51.0,0,0,0
1,14,45.5,1,0,1
2,13,48.0,1,0,1
3,0,49.0,1,1,1
4,19,47.0,1,0,1


In [6]:
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df,'is_under')

In [7]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((6471, 4), (6471,), (2394, 4), (2394,), (1946, 4), (1946,))

In [8]:
print(y_train.value_counts(normalize=True))
print(y_validate.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

1    0.50734
0    0.49266
Name: is_under, dtype: float64
1    0.507519
0    0.492481
Name: is_under, dtype: float64
1    0.507194
0    0.492806
Name: is_under, dtype: float64


In [9]:
X_train.head()

Unnamed: 0,wind,ou,is_turf,is_outdoor
894,9,46.5,0,1
6913,11,41.5,1,1
886,0,48.0,1,0
7630,20,37.0,1,1
5466,5,36.5,0,1


In [10]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1946 entries, 6593 to 94
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   wind        1946 non-null   int64  
 1   ou          1946 non-null   float64
 2   is_turf     1946 non-null   int64  
 3   is_outdoor  1946 non-null   int64  
dtypes: float64(1), int64(3)
memory usage: 76.0 KB


# MODELING with CV on slim df

## DTC with CV

In [11]:
param_grid = {
    'max_depth': [None,18, 12, 10, 5],
    'min_samples_leaf': [1, 5, 10, 20],
    'criterion': ['gini', 'entropy'],
}

gr_search = GridSearchCV(DecisionTreeClassifier(),param_grid)

In [12]:
gr_search

In [13]:
gr_search.fit(X_train, y_train)

In [14]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(40, 16)

In [15]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
28,0.00342,0.000106,0.000496,4e-06,entropy,12.0,1,"{'criterion': 'entropy', 'max_depth': 12, 'min...",0.507336,0.522411,0.527048,0.522411,0.517774,0.519396,0.006705,1
8,0.003362,8.3e-05,0.000506,7e-06,gini,12.0,1,"{'criterion': 'gini', 'max_depth': 12, 'min_sa...",0.508108,0.522411,0.527048,0.526275,0.509274,0.518623,0.008269,2
0,0.008577,0.001397,0.00133,0.00054,gini,,1,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.515058,0.527048,0.517774,0.512365,0.518547,0.518158,0.00495,3
20,0.003998,0.000176,0.000529,3e-06,entropy,,1,"{'criterion': 'entropy', 'max_depth': None, 'm...",0.516602,0.527048,0.516229,0.510819,0.518547,0.517849,0.005267,4
30,0.003209,9.8e-05,0.000488,1.3e-05,entropy,12.0,10,"{'criterion': 'entropy', 'max_depth': 12, 'min...",0.501931,0.515456,0.514683,0.528594,0.517002,0.515533,0.008467,5


## DTC with CV PLAYTIME

In [16]:
param_grid = {
    'max_depth': [5],
    'min_samples_leaf': [10, 20,30,50,60],
    'criterion': ['gini', 'entropy'],
}

gr_search = GridSearchCV(DecisionTreeClassifier(),param_grid)

In [17]:
gr_search

In [18]:
gr_search.fit(X_train, y_train)

In [19]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(10, 16)

In [20]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.001969,3.8e-05,0.000429,9e-06,gini,5,60,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.501931,0.497682,0.507728,0.518547,0.510046,0.507187,0.007148,1
9,0.002005,2.2e-05,0.000438,6e-06,entropy,5,60,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.501931,0.497682,0.507728,0.518547,0.510046,0.507187,0.007148,1
3,0.002246,0.000128,0.00047,2.5e-05,gini,5,50,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.501158,0.490726,0.50541,0.509274,0.510046,0.503323,0.007047,3
8,0.002016,2.8e-05,0.000428,5e-06,entropy,5,50,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.501158,0.490726,0.50541,0.509274,0.510046,0.503323,0.007047,3
1,0.003649,0.000432,0.000734,7.5e-05,gini,5,20,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.49112,0.487635,0.504637,0.515456,0.506182,0.501006,0.01025,5


## RFC with CV

In [21]:
param_grid = {
    'max_depth': [None,18, 12, 10, 5],
    'n_estimators': [55, 101, 201],
    'criterion': ['gini', 'entropy'],
}
gr_search = GridSearchCV(RandomForestClassifier(),
                      param_grid)

In [22]:
gr_search

In [23]:
gr_search.fit(X_train, y_train)

In [24]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(30, 16)

In [25]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.195432,0.001165,0.015883,0.000206,gini,,101,"{'criterion': 'gini', 'max_depth': None, 'n_es...",0.514286,0.540185,0.517002,0.516229,0.517002,0.520941,0.009674,1
27,0.067445,0.000674,0.004015,9.9e-05,entropy,5.0,55,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.505019,0.520866,0.520093,0.517002,0.53864,0.520324,0.01078,2
17,0.420699,0.00304,0.031436,0.000149,entropy,,201,"{'criterion': 'entropy', 'max_depth': None, 'n...",0.513514,0.527821,0.513138,0.529366,0.513138,0.519395,0.007528,3
4,0.195133,0.001957,0.015512,0.000127,gini,18.0,101,"{'criterion': 'gini', 'max_depth': 18, 'n_esti...",0.518147,0.535549,0.507728,0.520093,0.513138,0.518931,0.009351,4
5,0.38478,0.002933,0.030177,7.7e-05,gini,18.0,201,"{'criterion': 'gini', 'max_depth': 18, 'n_esti...",0.505019,0.526275,0.516229,0.528594,0.517002,0.518624,0.008383,5


## RFC with CV PLAYTIME

In [26]:
param_grid = {
    'max_depth': [2,3,5,10],
    'n_estimators': [31, 101, 201,501],
    'criterion': ['gini', 'entropy'],
}
gr_search = GridSearchCV(RandomForestClassifier(),
                      param_grid)

In [27]:
gr_search

In [28]:
gr_search.fit(X_train, y_train)

In [29]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(32, 16)

In [30]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,0.468698,0.001085,0.025938,0.000355,gini,3,501,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.520463,0.511592,0.517002,0.523957,0.539413,0.522485,0.009397,1
22,0.187798,0.002501,0.010623,0.000196,entropy,3,201,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.528185,0.513138,0.507728,0.521638,0.540185,0.522175,0.011412,2
18,0.17013,0.001248,0.00958,0.000111,entropy,2,201,"{'criterion': 'entropy', 'max_depth': 2, 'n_es...",0.528185,0.51391,0.506182,0.520866,0.53323,0.520475,0.009692,3
6,0.189442,0.000803,0.010757,0.000143,gini,3,201,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.515058,0.512365,0.518547,0.517774,0.537867,0.520322,0.009039,4
19,0.40899,0.003967,0.023293,0.000147,entropy,2,501,"{'criterion': 'entropy', 'max_depth': 2, 'n_es...",0.528185,0.515456,0.506182,0.52473,0.527048,0.52032,0.008368,5


## NBC with CV

In [31]:
param_grid = {
    'var_smoothing': [.000000001,.000000002,.000000003]
}
gr_search = GridSearchCV(GaussianNB(),
                      param_grid)

In [32]:
gr_search

In [33]:
gr_search.fit(X_train, y_train)

In [34]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(3, 14)

In [35]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003223,0.000881,0.00135,0.000264,0.0,{'var_smoothing': 1e-09},0.520463,0.510819,0.515456,0.496909,0.516229,0.511975,0.008131,1
1,0.00191,0.00013,0.000927,5.2e-05,0.0,{'var_smoothing': 2e-09},0.520463,0.510819,0.515456,0.496909,0.516229,0.511975,0.008131,1
2,0.001546,0.000131,0.000738,3.8e-05,0.0,{'var_smoothing': 3e-09},0.520463,0.510819,0.515456,0.496909,0.516229,0.511975,0.008131,1


## GBC with CV

In [36]:
param_grid = {
    'learning_rate': [0.1,0.2,0.5,1.0, 5.0],
    'n_estimators': [55, 101, 201],
    'min_samples_leaf': [1,5,10,20],
}
gr_search = GridSearchCV(GradientBoostingClassifier(),
                      param_grid)

In [37]:
gr_search

In [38]:
gr_search.fit(X_train, y_train)

In [39]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(60, 16)

In [40]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
47,0.254916,0.000732,0.002945,0.000119,1.0,20,201,"{'learning_rate': 1.0, 'min_samples_leaf': 20,...",0.511197,0.531685,0.508501,0.500773,0.527821,0.515995,0.011806,1
46,0.12667,0.000379,0.00168,4.3e-05,1.0,20,101,"{'learning_rate': 1.0, 'min_samples_leaf': 20,...",0.500386,0.526275,0.50541,0.512365,0.530139,0.514915,0.011565,2
4,0.126151,0.000572,0.001446,2.1e-05,0.1,5,101,"{'learning_rate': 0.1, 'min_samples_leaf': 5, ...",0.501158,0.51391,0.50541,0.531685,0.522411,0.514915,0.011118,3
3,0.06903,0.000321,0.001041,3e-05,0.1,5,55,"{'learning_rate': 0.1, 'min_samples_leaf': 5, ...",0.522008,0.506955,0.497682,0.522411,0.523957,0.514602,0.01047,4
15,0.069112,0.000482,0.001014,1.7e-05,0.2,5,55,"{'learning_rate': 0.2, 'min_samples_leaf': 5, ...",0.501931,0.510046,0.504637,0.528594,0.525502,0.514142,0.010901,5


## CATb with CV

In [41]:
# param_grid = {
#     'verbose': [False],
#     'depth': [5, 10, 12,15],
#     'learning_rate': [None,0.1, 0.3,0.7,1.0],
# }
# gr_search = GridSearchCV(CatBoostClassifier(),
#                       param_grid)

In [42]:
param_grid = {
    'verbose': [False],
    'depth': [5, 10, 12],
    'learning_rate': [None,0.1,0.2, 0.5],
}
gr_search = GridSearchCV(CatBoostClassifier(),
                      param_grid)

In [43]:
gr_search

In [44]:
gr_search.fit(X_train, y_train)

In [45]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(12, 16)

In [46]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_learning_rate,param_verbose,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,2.222406,0.003744,0.001532,2e-05,10,0.5,False,"{'depth': 10, 'learning_rate': 0.5, 'verbose':...",0.505792,0.537094,0.518547,0.520866,0.507728,0.518005,0.011204,1
10,4.831274,0.028053,0.001834,1.9e-05,12,0.2,False,"{'depth': 12, 'learning_rate': 0.2, 'verbose':...",0.509653,0.531685,0.514683,0.51932,0.509274,0.516923,0.008247,2
9,4.741152,0.014854,0.001802,3.8e-05,12,0.1,False,"{'depth': 12, 'learning_rate': 0.1, 'verbose':...",0.508108,0.53323,0.518547,0.514683,0.506182,0.51615,0.00963,3
11,4.857277,0.027296,0.001805,5.3e-05,12,0.5,False,"{'depth': 12, 'learning_rate': 0.5, 'verbose':...",0.509653,0.527821,0.515456,0.51932,0.506182,0.515686,0.007579,4
6,2.269652,0.052457,0.001703,0.000283,10,0.2,False,"{'depth': 10, 'learning_rate': 0.2, 'verbose':...",0.505792,0.529366,0.513138,0.513138,0.506182,0.513523,0.008544,5


## SCALE for Logistic regression, MLP, etc.

In [47]:
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_validate_scaled = sc_X.transform(X_validate)
X_test_scaled = sc_X.transform(X_test)

## MLP

In [48]:
mlp = MLPClassifier(hidden_layer_sizes=(1024,512,128),activation="relu",random_state=2013,
                    batch_size=500,solver='adam')
mlp.fit(X_train_scaled, y_train)
mlp_preds = mlp.predict(X_train_scaled)
print(mlp.score(X_train_scaled, y_train))
pd.crosstab(mlp_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows



0.5747179724926595


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1321,885
1,1867,2398


In [49]:
print(f'Accuracy-Train {round(mlp.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(mlp.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,mlp_preds))
print(classification_report(y_validate,mlp.predict(X_validate_scaled)))

Accuracy-Train 0.5747
Accuracy-Validate 0.4962
              precision    recall  f1-score   support

           0       0.60      0.41      0.49      3188
           1       0.56      0.73      0.64      3283

    accuracy                           0.57      6471
   macro avg       0.58      0.57      0.56      6471
weighted avg       0.58      0.57      0.56      6471

              precision    recall  f1-score   support

           0       0.48      0.35      0.41      1179
           1       0.50      0.64      0.56      1215

    accuracy                           0.50      2394
   macro avg       0.49      0.49      0.48      2394
weighted avg       0.49      0.50      0.49      2394



## KNN with CV

In [50]:
param_grid = {
    'n_neighbors': [70],
    'weights': ['uniform','distance'],
    'algorithm':['ball_tree','brute'],
    'leaf_size': [3,4,5,6,7,8,9],
}
gr_search = GridSearchCV(KNeighborsClassifier(),
                      param_grid)

In [51]:
gr_search

In [52]:
gr_search.fit(X_train, y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [53]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(28, 17)

In [54]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
13,0.001665,4.7e-05,0.021871,0.000369,ball_tree,9,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 9, 'n_...",0.517375,0.531685,0.515456,0.518547,0.517002,0.520013,0.005919,1
7,0.001817,0.000236,0.022483,0.000641,ball_tree,6,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 6, 'n_...",0.517375,0.531685,0.515456,0.518547,0.517002,0.520013,0.005919,1
11,0.001695,4.2e-05,0.021997,0.000625,ball_tree,8,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 8, 'n_...",0.517375,0.531685,0.515456,0.518547,0.517002,0.520013,0.005919,1
9,0.001687,1.5e-05,0.022051,0.000603,ball_tree,7,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 7, 'n_...",0.517375,0.531685,0.515456,0.518547,0.517002,0.520013,0.005919,1
25,0.000749,3e-05,0.02399,0.000447,brute,8,70,distance,"{'algorithm': 'brute', 'leaf_size': 8, 'n_neig...",0.519691,0.530139,0.514683,0.518547,0.516229,0.519858,0.00543,5


## LOG with CV

In [55]:
param_grid = {
    'penalty': ['none','l1','l2','elasticnet'],
    'class_weight': ['balanced',None],
    'max_iter':[100,50,200,500]
}
gr_search = GridSearchCV(LogisticRegression(),
                      param_grid)

In [56]:
gr_search

In [57]:
gr_search.fit(X_train, y_train)

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

-

In [58]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(32, 16)

In [59]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_iter,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,0.009927,0.003952,0.000447,1.4e-05,balanced,200,l2,"{'class_weight': 'balanced', 'max_iter': 200, ...",0.528958,0.516229,0.504637,0.50541,0.504637,0.511974,0.009564,1
14,0.009978,0.002363,0.00092,0.000566,balanced,500,l2,"{'class_weight': 'balanced', 'max_iter': 500, ...",0.528958,0.516229,0.504637,0.50541,0.504637,0.511974,0.009564,1
2,0.009793,0.003874,0.000477,3.3e-05,balanced,100,l2,"{'class_weight': 'balanced', 'max_iter': 100, ...",0.528958,0.516229,0.504637,0.50541,0.504637,0.511974,0.009564,1
6,0.008418,0.001836,0.000804,0.000437,balanced,50,l2,"{'class_weight': 'balanced', 'max_iter': 50, '...",0.528958,0.516229,0.504637,0.50541,0.504637,0.511974,0.009564,1
12,0.00896,0.00395,0.000596,0.000309,balanced,500,none,"{'class_weight': 'balanced', 'max_iter': 500, ...",0.528958,0.516229,0.503864,0.50541,0.504637,0.511819,0.009686,5


## SVM with CV

In [60]:
param_grid = {
    'shrinking': [True,False],
    'C':[1,5,10]
}
gr_search = GridSearchCV(SVC(),
                      param_grid)

In [61]:
gr_search

In [62]:
gr_search.fit(X_train, y_train)

In [63]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(6, 15)

In [64]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_shrinking,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.337687,0.002275,0.205112,0.000515,5,True,"{'C': 5, 'shrinking': True}",0.532819,0.516229,0.506955,0.530139,0.522411,0.521711,0.009409,1
3,0.337103,0.001705,0.20513,0.000486,5,False,"{'C': 5, 'shrinking': False}",0.532819,0.516229,0.506955,0.530139,0.522411,0.521711,0.009409,1
0,0.33906,0.014242,0.206186,0.000934,1,True,"{'C': 1, 'shrinking': True}",0.530502,0.51932,0.509274,0.523957,0.516229,0.519856,0.00715,3
1,0.332116,0.001286,0.20574,0.000466,1,False,"{'C': 1, 'shrinking': False}",0.530502,0.51932,0.509274,0.523957,0.516229,0.519856,0.00715,3
5,0.340354,0.003128,0.204669,0.000564,10,False,"{'C': 10, 'shrinking': False}",0.533591,0.508501,0.504637,0.52473,0.520093,0.51831,0.010593,5


# NOW return to single iterations of each model with optimized hyperparameters.

### DTC: gini, max_d=5,min_samples_leaf=50

In [65]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier(max_depth=5,min_samples_leaf=50)
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,693,564
1,2495,2719


In [66]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.5273
Accuracy-Validate 0.5184
              precision    recall  f1-score   support

           0       0.55      0.22      0.31      3188
           1       0.52      0.83      0.64      3283

    accuracy                           0.53      6471
   macro avg       0.54      0.52      0.48      6471
weighted avg       0.54      0.53      0.48      6471

              precision    recall  f1-score   support

           0       0.53      0.21      0.30      1179
           1       0.52      0.82      0.63      1215

    accuracy                           0.52      2394
   macro avg       0.52      0.51      0.47      2394
weighted avg       0.52      0.52      0.47      2394



### RFC: entropy, max_d=3,n_est=201

In [67]:
# create the Random Forest model 
rf1 = RandomForestClassifier(criterion='entropy',n_estimators=201,max_depth=3)
# fit the model to the TRAIN dataset1
rf1.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
rf1_preds = rf1.predict(X_train)
pd.crosstab(rf1_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1173,1000
1,2015,2283


In [68]:
print(f'Accuracy-Train {round(rf1.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(rf1.score(X_validate,y_validate),4)}')
print(classification_report(y_train,rf1_preds))
print(classification_report(y_validate,rf1.predict(X_validate)))

Accuracy-Train 0.5341
Accuracy-Validate 0.5217
              precision    recall  f1-score   support

           0       0.54      0.37      0.44      3188
           1       0.53      0.70      0.60      3283

    accuracy                           0.53      6471
   macro avg       0.54      0.53      0.52      6471
weighted avg       0.54      0.53      0.52      6471

              precision    recall  f1-score   support

           0       0.52      0.37      0.43      1179
           1       0.52      0.67      0.59      1215

    accuracy                           0.52      2394
   macro avg       0.52      0.52      0.51      2394
weighted avg       0.52      0.52      0.51      2394



### SVM: vanilla

In [70]:
svm = SVC()
svm.fit(X_train_scaled, y_train)
svm_preds = svm.predict(X_train_scaled)
print(svm.score(X_train_scaled, y_train))
pd.crosstab(svm_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.526348323288518


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1101,978
1,2087,2305


In [71]:
print(f'Accuracy-Train {round(svm.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(svm.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,svm_preds))
print(classification_report(y_validate,svm.predict(X_validate_scaled)))

Accuracy-Train 0.5263
Accuracy-Validate 0.5192
              precision    recall  f1-score   support

           0       0.53      0.35      0.42      3188
           1       0.52      0.70      0.60      3283

    accuracy                           0.53      6471
   macro avg       0.53      0.52      0.51      6471
weighted avg       0.53      0.53      0.51      6471

              precision    recall  f1-score   support

           0       0.52      0.35      0.42      1179
           1       0.52      0.68      0.59      1215

    accuracy                           0.52      2394
   macro avg       0.52      0.52      0.50      2394
weighted avg       0.52      0.52      0.51      2394

