In [1]:
#standard ds imports
import pandas as pd
import numpy as np
#viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# .py imports
#import wranglerer as wr
#import modeling as md
import os
#sklearn imports
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix,balanced_accuracy_score,roc_auc_score,make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#CATboost imports
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import explore_r as ex
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import xgboost as xgb

### This notebook builds upon the previous notebook (model_r_round2) and now will incorporate feature selection in addition to GridSearchCV in an attempt to find optimal features/hyperparams for each algorithm.  However, let's first add XGBoost to the algorithm lineup and see if it can help push the needle.

In [2]:
df = pd.read_csv('prepped_data_stad.csv')

In [3]:
df = df.drop(columns=['date','home_score',
       'home_wins', 'away_score', 'away_wins','total_scores'])
df['spread'] = abs(df['spread'])

In [None]:
# cols = ['day_of_week','start_time','stadium']
# # recast STADIUM,Day,start_time as 'categories' dtype:
# for col in cols:
#     X_train[col] = X_train[col].astype('category')

In [None]:
#df = pd.get_dummies(df,columns=['day_of_week','start_time','stadium'])

In [4]:
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df,'is_under')

In [5]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((6471, 14), (6471,), (2394, 14), (2394,), (1946, 14), (1946,))

In [6]:
print(y_train.value_counts(normalize=True))
print(y_validate.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

1    0.50734
0    0.49266
Name: is_under, dtype: float64
1    0.507519
0    0.492481
Name: is_under, dtype: float64
1    0.507194
0    0.492806
Name: is_under, dtype: float64


In [7]:
X_train.head()

Unnamed: 0,day_of_week,start_time,week_num,stadium,temp,humidity,wind,spread,ou,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
894,Sunday,1,14,Raymond James Stadium,78,58,9,3.0,46.5,0,0,1,0,1
6913,Sunday,1,11,Cinergy Field,32,73,11,4.0,41.5,0,0,1,1,1
886,Sunday,1,14,Mercedes-Benz Stadium,72,0,0,3.5,48.0,0,0,0,1,0
7630,Sunday,1,12,Rich Stadium,37,49,20,13.0,37.0,0,0,1,1,1
5466,Sunday,4,7,Sun Devil Stadium,81,24,5,3.5,36.5,0,0,0,0,1


In [8]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1946 entries, 6593 to 94
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   day_of_week           1946 non-null   object 
 1   start_time            1946 non-null   int64  
 2   week_num              1946 non-null   int64  
 3   stadium               1946 non-null   object 
 4   temp                  1946 non-null   int64  
 5   humidity              1946 non-null   int64  
 6   wind                  1946 non-null   int64  
 7   spread                1946 non-null   float64
 8   ou                    1946 non-null   float64
 9   abnormal_start        1946 non-null   int64  
 10  is_playoff            1946 non-null   int64  
 11  playoff_implications  1946 non-null   int64  
 12  is_turf               1946 non-null   int64  
 13  is_outdoor            1946 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 228.0+ KB


In [9]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((6471, 14), (6471,), (2394, 14), (2394,), (1946, 14), (1946,))

# MODELING
## CATBoost

In [10]:
# Create and fit the thing
CATb = CatBoostClassifier(verbose=False,depth=5)
CATb.fit(X_train,y_train,cat_features=['day_of_week','start_time','stadium'])
CATb_preds = CATb.predict(X_train)
pd.crosstab(CATb_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1889,1140
1,1299,2143


In [11]:
print(f'Accuracy-Train {round(CATb.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(CATb.score(X_validate,y_validate),4)}')
print(classification_report(y_train,CATb_preds))
print(classification_report(y_validate,CATb.predict(X_validate)))

Accuracy-Train 0.6231
Accuracy-Validate 0.4896
              precision    recall  f1-score   support

           0       0.62      0.59      0.61      3188
           1       0.62      0.65      0.64      3283

    accuracy                           0.62      6471
   macro avg       0.62      0.62      0.62      6471
weighted avg       0.62      0.62      0.62      6471

              precision    recall  f1-score   support

           0       0.48      0.47      0.48      1179
           1       0.50      0.51      0.50      1215

    accuracy                           0.49      2394
   macro avg       0.49      0.49      0.49      2394
weighted avg       0.49      0.49      0.49      2394



## CATboost grid_search CROSS_VALIDATION k=5

In [12]:
param_grid = {
    'verbose': [False],
    'depth': [5, 10,15]
}
gr_search = GridSearchCV(CatBoostClassifier(),
                      param_grid)

In [13]:
gr_search

In [14]:
gr_search.fit(X_train, y_train,cat_features=['day_of_week','start_time','stadium'])

In [15]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(3, 15)

In [16]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_verbose,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.810644,0.029327,0.002423,8e-05,5,False,"{'depth': 5, 'verbose': False}",0.497297,0.525502,0.495363,0.544822,0.534776,0.519552,0.019931,1
1,5.586676,0.040965,0.003888,0.000294,10,False,"{'depth': 10, 'verbose': False}",0.509653,0.517774,0.508501,0.521638,0.530139,0.517541,0.007994,2
2,75.577717,0.369622,0.0099,0.00202,15,False,"{'depth': 15, 'verbose': False}",0.505019,0.520093,0.501546,0.500773,0.534776,0.512441,0.013171,3


## XGBoost

In [None]:
#pd.get_dummies(X_train,columns=['day_of_week','start_time','stadium']).columns

In [None]:
y_train.unique(),y_validate.unique(),y_test.unique()

In [None]:
xgbc = xgb.XGBClassifier(n_estimators=2000,objective='binary:logistic',eval_metric='aucpr',missing=None,
                         random_state=2013,learning_rate=0.005, early_stopping_rounds=300,max_depth=12,gamma=0.1,
                         reg_lambda=1.0)

In [None]:
# xgbc.fit(X_train,y_train,
#        verbose=True,
#        eval_set=[(X_validate,y_validate)])

In [None]:
# plot_confusion_matrix(xgbc,X_validate,y_validate)

In [None]:
param_grid = {
    'max_depth': [5, 10, 12, 16],
    'n_estimators': [1000, 2000, 500],
    'eta': [0.01, 0.05,0.1],
    'gamma':[0,0.25,1.0],
    'reg_lambda':[0,1.0,10.0]
}
gr_search = GridSearchCV(xgb.XGBClassifier(),
                      param_grid)

In [None]:
gr_search

In [None]:
# too much compute req'd no run
#gr_search.fit(X_train, y_train)

In [None]:
# results = gr_search.cv_results_
# results_df_init = pd.DataFrame(results)
# results_df_init.shape

In [None]:
# params = pd.DataFrame(results['params'])
# results_df_init.sort_values(by='rank_test_score').head()

# OK, enough of that...
## Let's work with a hi-speed,low-drag version of the df
### Cols: wind, ou, is_outdoor, is_turf, abnormal_start, playoff

In [17]:
df = pd.read_csv('prepped_data_stad.csv')

In [18]:
df = df.drop(columns=['date','home_score',
       'home_wins', 'away_score', 'away_wins','total_scores',
                     'day_of_week','start_time', 'week_num','stadium', 'temp', 'humidity','spread',
                      'playoff_implications'])
#df['spread'] = abs(df['spread'])

In [19]:
df.head()

Unnamed: 0,wind,ou,is_under,abnormal_start,is_playoff,is_turf,is_outdoor
0,0,51.0,0,0,1,0,0
1,14,45.5,1,0,1,0,1
2,13,48.0,1,0,1,0,1
3,0,49.0,1,0,1,1,1
4,19,47.0,1,0,1,0,1


In [20]:
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df,'is_under')

In [21]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((6471, 6), (6471,), (2394, 6), (2394,), (1946, 6), (1946,))

In [22]:
X_train.head()

Unnamed: 0,wind,ou,abnormal_start,is_playoff,is_turf,is_outdoor
894,9,46.5,0,0,0,1
6913,11,41.5,0,0,1,1
886,0,48.0,0,0,1,0
7630,20,37.0,0,0,1,1
5466,5,36.5,0,0,0,1


In [24]:
print(y_train.value_counts(normalize=True))
print(y_validate.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

1    0.50734
0    0.49266
Name: is_under, dtype: float64
1    0.507519
0    0.492481
Name: is_under, dtype: float64
1    0.507194
0    0.492806
Name: is_under, dtype: float64


In [25]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1946 entries, 6593 to 94
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   wind            1946 non-null   int64  
 1   ou              1946 non-null   float64
 2   abnormal_start  1946 non-null   int64  
 3   is_playoff      1946 non-null   int64  
 4   is_turf         1946 non-null   int64  
 5   is_outdoor      1946 non-null   int64  
dtypes: float64(1), int64(5)
memory usage: 106.4 KB


# MODELING with CV on slim df

## DTC with CV

In [26]:
param_grid = {
    'max_depth': [None,18, 12, 10, 5],
    'min_samples_leaf': [1, 5, 10, 20],
    'criterion': ['gini', 'entropy'],
}

gr_search = GridSearchCV(DecisionTreeClassifier(),param_grid)

In [27]:
gr_search

In [28]:
gr_search.fit(X_train, y_train)

In [29]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(40, 16)

In [30]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,0.0039,8.5e-05,0.00051,8e-06,gini,12,1,"{'criterion': 'gini', 'max_depth': 12, 'min_sa...",0.491892,0.523184,0.511592,0.517774,0.514683,0.511825,0.010677,1
28,0.004003,5.5e-05,0.000502,3e-06,entropy,12,1,"{'criterion': 'entropy', 'max_depth': 12, 'min...",0.481081,0.526275,0.517002,0.513138,0.517774,0.511054,0.015588,2
32,0.003543,6.5e-05,0.000481,3e-06,entropy,10,1,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.491892,0.523957,0.506955,0.52473,0.502318,0.50997,0.012712,3
29,0.003803,9.9e-05,0.000497,1e-05,entropy,12,5,"{'criterion': 'entropy', 'max_depth': 12, 'min...",0.484942,0.513138,0.527048,0.507728,0.506955,0.507962,0.01358,4
12,0.003489,5.7e-05,0.000489,1e-05,gini,10,1,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",0.499614,0.518547,0.503864,0.517774,0.499227,0.507805,0.008614,5


## DTC with CV PLAYTIME

In [31]:
param_grid = {
    'max_depth': [5],
    'min_samples_leaf': [10, 20,30,50,60],
    'criterion': ['gini', 'entropy'],
}

gr_search = GridSearchCV(DecisionTreeClassifier(),param_grid)

In [32]:
gr_search

In [33]:
gr_search.fit(X_train, y_train)

In [34]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(10, 16)

In [35]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.002185,5.5e-05,0.000445,6e-06,gini,5,60,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.503475,0.497682,0.507728,0.518547,0.510046,0.507496,0.006945,1
9,0.002207,6.5e-05,0.000451,9e-06,entropy,5,60,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.503475,0.497682,0.507728,0.518547,0.510046,0.507496,0.006945,1
3,0.002296,8e-05,0.000453,2.2e-05,gini,5,50,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.504247,0.490726,0.50541,0.509274,0.510046,0.503941,0.006965,3
8,0.00222,2.3e-05,0.000442,1.1e-05,entropy,5,50,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.504247,0.490726,0.50541,0.509274,0.510046,0.503941,0.006965,3
5,0.002263,3.7e-05,0.000438,7e-06,entropy,5,10,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.498069,0.484544,0.501546,0.51932,0.510819,0.50286,0.011785,5


## RFC with CV

In [36]:
param_grid = {
    'max_depth': [None,18, 12, 10, 5],
    'n_estimators': [55, 101, 201],
    'criterion': ['gini', 'entropy'],
}
gr_search = GridSearchCV(RandomForestClassifier(),
                      param_grid)

In [37]:
gr_search

In [38]:
gr_search.fit(X_train, y_train)

In [39]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(30, 16)

In [40]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
13,0.113664,0.000979,0.006742,0.000108,gini,5,101,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.518919,0.520093,0.512365,0.532457,0.525502,0.521867,0.006745,1
28,0.112395,0.004004,0.006645,5.7e-05,entropy,5,101,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.511197,0.513138,0.51932,0.527821,0.530139,0.520323,0.007596,2
29,0.225346,0.000363,0.012819,0.000182,entropy,5,201,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.517375,0.513138,0.506955,0.531685,0.531685,0.520167,0.009971,3
27,0.063305,0.000243,0.003904,3e-05,entropy,5,55,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.511969,0.526275,0.506182,0.52473,0.527048,0.519241,0.008532,4
14,0.214313,0.006638,0.015338,0.004564,gini,5,201,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.505019,0.530139,0.506182,0.521638,0.531685,0.518933,0.011416,5


## RFC with CV PLAYTIME

In [41]:
param_grid = {
    'max_depth': [2,3,5,10],
    'n_estimators': [31, 101, 201,501],
    'criterion': ['gini', 'entropy'],
}
gr_search = GridSearchCV(RandomForestClassifier(),
                      param_grid)

In [42]:
gr_search

In [43]:
gr_search.fit(X_train, y_train)

In [44]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(32, 16)

In [45]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,0.457355,0.001686,0.025822,0.000281,gini,3,501,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.525097,0.520093,0.503091,0.530139,0.536321,0.522948,0.011293,1
26,0.221341,0.016418,0.012718,0.000223,entropy,5,201,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.521236,0.517774,0.511592,0.532457,0.530912,0.522794,0.007904,2
10,0.216704,0.001943,0.013082,0.000167,gini,5,201,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.522008,0.521638,0.510046,0.530912,0.529366,0.522794,0.007397,3
11,0.535921,0.0011,0.031626,0.000264,gini,5,501,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.513514,0.520093,0.51391,0.52473,0.540185,0.522486,0.009778,4
9,0.108814,0.000358,0.006749,7.5e-05,gini,5,101,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.522008,0.517002,0.512365,0.529366,0.527821,0.521712,0.006412,5


## NBC with CV

In [46]:
param_grid = {
    'var_smoothing': [.000000001,.000000002,.000000003]
}
gr_search = GridSearchCV(GaussianNB(),
                      param_grid)

In [47]:
gr_search

In [48]:
gr_search.fit(X_train, y_train)

In [49]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(3, 14)

In [50]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002941,0.001192,0.001329,0.000251,0.0,{'var_smoothing': 1e-09},0.506564,0.527821,0.513138,0.527821,0.520093,0.519087,0.008316,1
1,0.001883,0.000129,0.000981,0.000118,0.0,{'var_smoothing': 2e-09},0.506564,0.527821,0.513138,0.527821,0.520093,0.519087,0.008316,1
2,0.001605,7.4e-05,0.00081,4.1e-05,0.0,{'var_smoothing': 3e-09},0.506564,0.527821,0.513138,0.527821,0.520093,0.519087,0.008316,1


## GBC with CV

In [51]:
param_grid = {
    'learning_rate': [0.1,0.2,0.5,1.0, 5.0],
    'n_estimators': [55, 101, 201],
    'min_samples_leaf': [1,5,10,20],
}
gr_search = GridSearchCV(GradientBoostingClassifier(),
                      param_grid)

In [52]:
gr_search

In [53]:
gr_search.fit(X_train, y_train)

In [54]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(60, 16)

In [55]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.082284,0.013887,0.00107,7.8e-05,0.1,1,55,"{'learning_rate': 0.1, 'min_samples_leaf': 1, ...",0.511197,0.522411,0.489181,0.523957,0.522411,0.513831,0.013148,1
12,0.075414,0.000564,0.001015,3.7e-05,0.2,1,55,"{'learning_rate': 0.2, 'min_samples_leaf': 1, ...",0.506564,0.520866,0.502318,0.520093,0.518547,0.513678,0.007696,2
6,0.075267,0.000337,0.001052,3.9e-05,0.1,10,55,"{'learning_rate': 0.1, 'min_samples_leaf': 10,...",0.508108,0.528594,0.491499,0.523184,0.517002,0.513677,0.013014,3
4,0.137843,0.000824,0.001464,2.6e-05,0.1,5,101,"{'learning_rate': 0.1, 'min_samples_leaf': 5, ...",0.502703,0.521638,0.498454,0.531685,0.510819,0.51306,0.012215,4
38,0.282402,0.000649,0.003021,7.7e-05,1.0,1,201,"{'learning_rate': 1.0, 'min_samples_leaf': 1, ...",0.505792,0.530912,0.511592,0.511592,0.502318,0.512441,0.009892,5


## CATb with CV

In [None]:
# param_grid = {
#     'verbose': [False],
#     'depth': [5, 10, 12,15],
#     'learning_rate': [None,0.1, 0.3,0.7,1.0],
# }
# gr_search = GridSearchCV(CatBoostClassifier(),
#                       param_grid)

In [56]:
param_grid = {
    'verbose': [False],
    'depth': [5, 10, 12],
    'learning_rate': [None,0.1,0.2, 0.5],
}
gr_search = GridSearchCV(CatBoostClassifier(),
                      param_grid)

In [57]:
gr_search

In [58]:
gr_search.fit(X_train, y_train)

In [59]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(12, 16)

In [60]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_learning_rate,param_verbose,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,4.897113,0.030274,0.001982,9.4e-05,12,0.1,False,"{'depth': 12, 'learning_rate': 0.1, 'verbose':...",0.524324,0.504637,0.506182,0.522411,0.51391,0.514293,0.008071,1
10,4.970007,0.024698,0.003404,0.002895,12,0.2,False,"{'depth': 12, 'learning_rate': 0.2, 'verbose':...",0.518919,0.501546,0.501546,0.527048,0.516229,0.513057,0.010052,2
0,1.220558,0.016833,0.001148,0.000145,5,,False,"{'depth': 5, 'learning_rate': None, 'verbose':...",0.493436,0.516229,0.503864,0.513138,0.535549,0.512443,0.014025,3
4,2.162148,0.03241,0.001547,6e-05,10,,False,"{'depth': 10, 'learning_rate': None, 'verbose'...",0.513514,0.512365,0.510046,0.510046,0.516229,0.51244,0.002323,4
5,2.237902,0.010958,0.001728,0.000163,10,0.1,False,"{'depth': 10, 'learning_rate': 0.1, 'verbose':...",0.511969,0.507728,0.503864,0.520093,0.517774,0.512286,0.006048,5


## SCALE for Logistic regression, MLP, etc.

In [61]:
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_validate_scaled = sc_X.transform(X_validate)
X_test_scaled = sc_X.transform(X_test)

## MLP

In [62]:
mlp = MLPClassifier(hidden_layer_sizes=(1024,512,128),activation="relu",random_state=2013,
                    batch_size=500,solver='adam')
mlp.fit(X_train_scaled, y_train)
mlp_preds = mlp.predict(X_train_scaled)
print(mlp.score(X_train_scaled, y_train))
pd.crosstab(mlp_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows



0.6138154844691701


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1648,959
1,1540,2324


In [63]:
print(f'Accuracy-Train {round(mlp.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(mlp.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,mlp_preds))
print(classification_report(y_validate,mlp.predict(X_validate_scaled)))

Accuracy-Train 0.6138
Accuracy-Validate 0.5113
              precision    recall  f1-score   support

           0       0.63      0.52      0.57      3188
           1       0.60      0.71      0.65      3283

    accuracy                           0.61      6471
   macro avg       0.62      0.61      0.61      6471
weighted avg       0.62      0.61      0.61      6471

              precision    recall  f1-score   support

           0       0.50      0.41      0.45      1179
           1       0.52      0.61      0.56      1215

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.51      0.51      2394



## KNN with CV

In [64]:
param_grid = {
    'n_neighbors': [70],
    'weights': ['uniform','distance'],
    'algorithm':['ball_tree','brute'],
    'leaf_size': [3,4,5,6,7,8,9],
}
gr_search = GridSearchCV(KNeighborsClassifier(),
                      param_grid)

In [65]:
gr_search

In [66]:
gr_search.fit(X_train, y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [67]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(28, 17)

In [68]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003061,0.001877,0.039697,0.012519,ball_tree,3,70,uniform,"{'algorithm': 'ball_tree', 'leaf_size': 3, 'n_...",0.503475,0.531685,0.510819,0.496136,0.52473,0.513369,0.01316,1
2,0.001861,1e-05,0.033054,0.000767,ball_tree,4,70,uniform,"{'algorithm': 'ball_tree', 'leaf_size': 4, 'n_...",0.503475,0.531685,0.510819,0.496136,0.52473,0.513369,0.01316,1
4,0.001864,9e-06,0.033066,0.000758,ball_tree,5,70,uniform,"{'algorithm': 'ball_tree', 'leaf_size': 5, 'n_...",0.503475,0.531685,0.510819,0.496136,0.52473,0.513369,0.01316,1
12,0.001738,8e-06,0.030743,0.000729,ball_tree,9,70,uniform,"{'algorithm': 'ball_tree', 'leaf_size': 9, 'n_...",0.505019,0.53323,0.509274,0.504637,0.514683,0.513369,0.010573,4
10,0.001783,4.2e-05,0.030703,0.000686,ball_tree,8,70,uniform,"{'algorithm': 'ball_tree', 'leaf_size': 8, 'n_...",0.505019,0.53323,0.509274,0.504637,0.514683,0.513369,0.010573,4


## LOG with CV

In [69]:
param_grid = {
    'penalty': ['none','l1','l2','elasticnet'],
    'class_weight': ['balanced',None],
    'max_iter':[100,50,200,500]
}
gr_search = GridSearchCV(LogisticRegression(),
                      param_grid)

In [70]:
gr_search

In [71]:
gr_search.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

-

In [72]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(32, 16)

In [73]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_iter,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,0.018211,0.000824,0.00077,0.00037,balanced,50,l2,"{'class_weight': 'balanced', 'max_iter': 50, '...",0.520463,0.520866,0.506955,0.516229,0.503864,0.513675,0.00701,1
20,0.019725,0.006313,0.000897,0.00034,,50,none,"{'class_weight': None, 'max_iter': 50, 'penalt...",0.526641,0.517002,0.496909,0.508501,0.513138,0.512438,0.009795,2
14,0.020878,0.002195,0.000463,2e-05,balanced,500,l2,"{'class_weight': 'balanced', 'max_iter': 500, ...",0.522008,0.521638,0.500773,0.513138,0.503864,0.512284,0.008788,3
12,0.018456,0.003961,0.000803,0.000378,balanced,500,none,"{'class_weight': 'balanced', 'max_iter': 500, ...",0.522008,0.521638,0.500773,0.51391,0.503091,0.512284,0.008963,3
8,0.023524,0.004916,0.00084,0.000417,balanced,200,none,"{'class_weight': 'balanced', 'max_iter': 200, ...",0.522008,0.521638,0.500773,0.51391,0.503091,0.512284,0.008963,3


## SVM with CV

In [74]:
param_grid = {
    'shrinking': [True,False],
    'C':[1,5,10]
}
gr_search = GridSearchCV(SVC(),
                      param_grid)

In [75]:
gr_search

In [76]:
gr_search.fit(X_train, y_train)

In [77]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(6, 15)

In [78]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_shrinking,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.363051,0.016833,0.212023,0.00043,1,True,"{'C': 1, 'shrinking': True}",0.533591,0.518547,0.510819,0.523184,0.515456,0.520319,0.007764,1
2,0.356876,0.003607,0.211385,0.000967,5,True,"{'C': 5, 'shrinking': True}",0.532046,0.517002,0.503864,0.525502,0.521638,0.520011,0.00946,2
3,0.353146,0.001716,0.211464,0.001008,5,False,"{'C': 5, 'shrinking': False}",0.532046,0.517002,0.503864,0.525502,0.521638,0.520011,0.00946,2
1,0.350691,0.001297,0.212081,0.000813,1,False,"{'C': 1, 'shrinking': False}",0.533591,0.518547,0.510819,0.521638,0.515456,0.52001,0.007674,4
4,0.358811,0.002887,0.210858,0.000549,10,True,"{'C': 10, 'shrinking': True}",0.527413,0.513138,0.508501,0.52473,0.522411,0.519238,0.007205,5


# NOW return to single iterations of each model with optimized hyperparameters.

### DTC: gini, max_d=5,min_samples_leaf=50

In [79]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier(max_depth=5,min_samples_leaf=50)
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,693,564
1,2495,2719


In [80]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.5273
Accuracy-Validate 0.5184
              precision    recall  f1-score   support

           0       0.55      0.22      0.31      3188
           1       0.52      0.83      0.64      3283

    accuracy                           0.53      6471
   macro avg       0.54      0.52      0.48      6471
weighted avg       0.54      0.53      0.48      6471

              precision    recall  f1-score   support

           0       0.53      0.21      0.30      1179
           1       0.52      0.82      0.63      1215

    accuracy                           0.52      2394
   macro avg       0.52      0.51      0.47      2394
weighted avg       0.52      0.52      0.47      2394



### RFC: entropy, max_d=3,n_est=201

In [81]:
# create the Random Forest model 
rf1 = RandomForestClassifier(criterion='entropy',n_estimators=201,max_depth=3)
# fit the model to the TRAIN dataset1
rf1.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
rf1_preds = rf1.predict(X_train)
pd.crosstab(rf1_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1133,945
1,2055,2338


In [82]:
print(f'Accuracy-Train {round(rf1.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(rf1.score(X_validate,y_validate),4)}')
print(classification_report(y_train,rf1_preds))
print(classification_report(y_validate,rf1.predict(X_validate)))

Accuracy-Train 0.5364
Accuracy-Validate 0.5217
              precision    recall  f1-score   support

           0       0.55      0.36      0.43      3188
           1       0.53      0.71      0.61      3283

    accuracy                           0.54      6471
   macro avg       0.54      0.53      0.52      6471
weighted avg       0.54      0.54      0.52      6471

              precision    recall  f1-score   support

           0       0.52      0.35      0.42      1179
           1       0.52      0.69      0.59      1215

    accuracy                           0.52      2394
   macro avg       0.52      0.52      0.51      2394
weighted avg       0.52      0.52      0.51      2394



### SVM: vanilla

In [83]:
svm = SVC()
svm.fit(X_train_scaled, y_train)
svm_preds = svm.predict(X_train_scaled)
print(svm.score(X_train_scaled, y_train))
pd.crosstab(svm_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.5387111729253593


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1361,1158
1,1827,2125


In [84]:
print(f'Accuracy-Train {round(svm.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(svm.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,svm_preds))
print(classification_report(y_validate,svm.predict(X_validate_scaled)))

Accuracy-Train 0.5387
Accuracy-Validate 0.5167
              precision    recall  f1-score   support

           0       0.54      0.43      0.48      3188
           1       0.54      0.65      0.59      3283

    accuracy                           0.54      6471
   macro avg       0.54      0.54      0.53      6471
weighted avg       0.54      0.54      0.53      6471

              precision    recall  f1-score   support

           0       0.51      0.41      0.45      1179
           1       0.52      0.62      0.57      1215

    accuracy                           0.52      2394
   macro avg       0.52      0.52      0.51      2394
weighted avg       0.52      0.52      0.51      2394

