In [1]:
#standard ds imports
import pandas as pd
import numpy as np
#viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# .py imports
#import wranglerer as wr
#import modeling as md
import os
#sklearn imports
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix,balanced_accuracy_score,roc_auc_score,make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#CATboost imports
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import explore_r as ex
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import xgboost as xgb

### This notebook builds upon the previous notebook (model_r_round2) and now will incorporate feature selection in addition to GridSearchCV in an attempt to find optimal features/hyperparams for each algorithm.  However, let's first add XGBoost to the algorithm lineup and see if it can help push the needle.

In [2]:
df = pd.read_csv('prepped_data_stad.csv')

In [3]:
df = df.drop(columns=['date','home_score',
       'home_wins', 'away_score', 'away_wins','total_scores'])
df['spread'] = abs(df['spread'])

In [None]:
# cols = ['day_of_week','start_time','stadium']
# # recast STADIUM,Day,start_time as 'categories' dtype:
# for col in cols:
#     X_train[col] = X_train[col].astype('category')

In [None]:
#df = pd.get_dummies(df,columns=['day_of_week','start_time','stadium'])

In [4]:
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df,'is_under')

In [5]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((6471, 14), (6471,), (2394, 14), (2394,), (1946, 14), (1946,))

In [108]:
print(y_train.value_counts(normalize=True))
print(y_validate.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

1    0.508577
0    0.491423
Name: is_under, dtype: float64
1    0.507937
0    0.492063
Name: is_under, dtype: float64
1    0.502569
0    0.497431
Name: is_under, dtype: float64


In [6]:
X_train.head()

Unnamed: 0,day_of_week,start_time,week_num,stadium,temp,humidity,wind,spread,ou,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
1713,Sunday,1,12,Georgia Dome,72,0,0,4.0,49.0,0,0,1,1,0
2164,Sunday,1,17,NRG Stadium,72,0,0,7.5,38.0,0,0,1,0,0
2554,Sunday,8,9,Reliant Stadium,72,0,0,1.0,42.5,1,0,0,0,0
3117,Sunday,1,6,Heinz Field,58,52,10,12.5,40.0,0,0,0,0,1
9819,Sunday,1,13,Shea Stadium,51,49,14,5.0,39.5,0,0,1,0,1


In [None]:
X_test.info()

In [7]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((6471, 14), (6471,), (2394, 14), (2394,), (1946, 14), (1946,))

# MODELING
## CATBoost

In [8]:
# Create and fit the thing
CATb = CatBoostClassifier(verbose=False,depth=5)
CATb.fit(X_train,y_train,cat_features=['day_of_week','start_time','stadium'])
CATb_preds = CATb.predict(X_train)
pd.crosstab(CATb_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2111,1137
1,1069,2154


In [9]:
print(f'Accuracy-Train {round(CATb.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(CATb.score(X_validate,y_validate),4)}')
print(classification_report(y_train,CATb_preds))
print(classification_report(y_validate,CATb.predict(X_validate)))

Accuracy-Train 0.6591
Accuracy-Validate 0.5063
              precision    recall  f1-score   support

           0       0.65      0.66      0.66      3180
           1       0.67      0.65      0.66      3291

    accuracy                           0.66      6471
   macro avg       0.66      0.66      0.66      6471
weighted avg       0.66      0.66      0.66      6471

              precision    recall  f1-score   support

           0       0.50      0.51      0.50      1178
           1       0.51      0.51      0.51      1216

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.51      0.51      2394



## CATboost grid_search CROSS_VALIDATION k=5

In [10]:
param_grid = {
    'verbose': [False],
    'depth': [5, 10,15]
}
gr_search = GridSearchCV(CatBoostClassifier(),
                      param_grid)

In [11]:
gr_search

In [12]:
gr_search.fit(X_train, y_train,cat_features=['day_of_week','start_time','stadium'])

In [13]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(3, 15)

In [14]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_verbose,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.778466,0.024948,0.002386,5.6e-05,5,False,"{'depth': 5, 'verbose': False}",0.528185,0.512365,0.51391,0.515456,0.525502,0.519084,0.006467,1
1,5.505333,0.028947,0.003718,7.7e-05,10,False,"{'depth': 10, 'verbose': False}",0.498842,0.503864,0.518547,0.507728,0.513138,0.508424,0.006896,2
2,74.198083,0.825437,0.006564,0.000591,15,False,"{'depth': 15, 'verbose': False}",0.495753,0.513138,0.50541,0.512365,0.510046,0.507342,0.00639,3


## XGBoost

In [None]:
#pd.get_dummies(X_train,columns=['day_of_week','start_time','stadium']).columns

In [15]:
y_train.unique(),y_validate.unique(),y_test.unique()

(array([0, 1]), array([0, 1]), array([0, 1]))

In [16]:
xgbc = xgb.XGBClassifier(n_estimators=2000,objective='binary:logistic',eval_metric='aucpr',missing=None,
                         random_state=2013,learning_rate=0.005, early_stopping_rounds=300,max_depth=12,gamma=0.1,
                         reg_lambda=1.0)

In [None]:
# xgbc.fit(X_train,y_train,
#        verbose=True,
#        eval_set=[(X_validate,y_validate)])

In [None]:
# plot_confusion_matrix(xgbc,X_validate,y_validate)

In [None]:
param_grid = {
    'max_depth': [5, 10, 12, 16],
    'n_estimators': [1000, 2000, 500],
    'eta': [0.01, 0.05,0.1],
    'gamma':[0,0.25,1.0],
    'reg_lambda':[0,1.0,10.0]
}
gr_search = GridSearchCV(xgb.XGBClassifier(),
                      param_grid)

In [None]:
gr_search

In [None]:
# too much compute req'd no run
#gr_search.fit(X_train, y_train)

In [None]:
# results = gr_search.cv_results_
# results_df_init = pd.DataFrame(results)
# results_df_init.shape

In [None]:
# params = pd.DataFrame(results['params'])
# results_df_init.sort_values(by='rank_test_score').head()

# OK, enough of that...
## Let's work with a hi-speed,low-drag version of the df
### Cols: wind, ou, is_outdoor, is_turf, abnormal_start, playoff

In [18]:
df = pd.read_csv('prepped_data_stad.csv')

In [19]:
df = df.drop(columns=['date','home_score',
       'home_wins', 'away_score', 'away_wins','total_scores',
                     'day_of_week','start_time', 'week_num','stadium', 'temp', 'humidity','spread',
                      'playoff_implications'])
#df['spread'] = abs(df['spread'])

In [20]:
df.head()

Unnamed: 0,wind,ou,is_under,abnormal_start,is_playoff,is_turf,is_outdoor
0,0,51.0,0,0,1,0,0
1,14,45.5,1,0,1,0,1
2,13,48.0,1,0,1,0,1
3,0,49.0,1,0,1,1,1
4,19,47.0,1,0,1,0,1


In [21]:
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df,'is_under')

In [22]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((6471, 6), (6471,), (2394, 6), (2394,), (1946, 6), (1946,))

In [23]:
X_train.head()

Unnamed: 0,wind,ou,abnormal_start,is_playoff,is_turf,is_outdoor
1713,0,49.0,0,0,1,0
2164,0,38.0,0,0,0,0
2554,0,42.5,1,0,0,0
3117,10,40.0,0,0,0,1
9819,14,39.5,0,0,0,1


In [24]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1946 entries, 4842 to 3441
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   wind            1946 non-null   int64  
 1   ou              1946 non-null   float64
 2   abnormal_start  1946 non-null   int64  
 3   is_playoff      1946 non-null   int64  
 4   is_turf         1946 non-null   int64  
 5   is_outdoor      1946 non-null   int64  
dtypes: float64(1), int64(5)
memory usage: 106.4 KB


# MODELING with CV on slim df

## DTC with CV

In [25]:
param_grid = {
    'max_depth': [None,18, 12, 10, 5],
    'min_samples_leaf': [1, 5, 10, 20],
    'criterion': ['gini', 'entropy'],
}

gr_search = GridSearchCV(DecisionTreeClassifier(),param_grid)

In [26]:
gr_search

In [27]:
gr_search.fit(X_train, y_train)

In [28]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(40, 16)

In [29]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
19,0.002213,3.2e-05,0.000447,3e-06,gini,5,20,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.522008,0.528594,0.51391,0.520093,0.530139,0.522949,0.005904,1
39,0.002209,4.9e-05,0.000446,1e-05,entropy,5,20,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.522008,0.528594,0.512365,0.520093,0.530139,0.52264,0.00639,2
38,0.002233,3.2e-05,0.000441,1.3e-05,entropy,5,10,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.511197,0.51932,0.517002,0.520093,0.523184,0.518159,0.004004,3
18,0.002223,5e-05,0.000442,7e-06,gini,5,10,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.513514,0.51932,0.51391,0.520093,0.523184,0.518004,0.003738,4
16,0.002221,4e-05,0.000441,5e-06,gini,5,1,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.50888,0.520093,0.517002,0.515456,0.523957,0.517077,0.005025,5


## DTC with CV PLAYTIME

In [40]:
param_grid = {
    'max_depth': [5],
    'min_samples_leaf': [10, 20,30,50,60],
    'criterion': ['gini', 'entropy'],
}

gr_search = GridSearchCV(DecisionTreeClassifier(),param_grid)

In [41]:
gr_search

In [42]:
gr_search.fit(X_train, y_train)

In [43]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(10, 16)

In [44]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.002293,9.9e-05,0.000455,2.5e-05,gini,5,50,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.51583,0.528594,0.514683,0.523957,0.537867,0.524186,0.008565,1
8,0.002102,4.7e-05,0.000414,3e-06,entropy,5,50,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.51583,0.528594,0.514683,0.523957,0.537867,0.524186,0.008565,1
2,0.00279,0.000146,0.000536,1.8e-05,gini,5,30,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.515058,0.529366,0.513138,0.522411,0.53864,0.523723,0.009415,3
7,0.002106,3.5e-05,0.000419,5e-06,entropy,5,30,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.515058,0.529366,0.513138,0.522411,0.53864,0.523723,0.009415,3
4,0.002081,3.7e-05,0.000412,2e-06,gini,5,60,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.51583,0.527821,0.514683,0.517774,0.540958,0.523413,0.009927,5


## RFC with CV

In [45]:
param_grid = {
    'max_depth': [None,18, 12, 10, 5],
    'n_estimators': [55, 101, 201],
    'criterion': ['gini', 'entropy'],
}
gr_search = GridSearchCV(RandomForestClassifier(),
                      param_grid)

In [46]:
gr_search

In [47]:
gr_search.fit(X_train, y_train)

In [48]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(30, 16)

In [49]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
28,0.112739,0.000767,0.006761,6e-05,entropy,5,101,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.528185,0.51391,0.531685,0.521638,0.514683,0.52202,0.007087,1
27,0.057468,0.003189,0.003919,3.7e-05,entropy,5,55,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.524324,0.51932,0.513138,0.517774,0.534003,0.521712,0.007108,2
14,0.214013,0.001137,0.013017,0.000107,gini,5,201,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.526641,0.503864,0.523957,0.523957,0.527048,0.521093,0.008712,3
13,0.108429,0.000803,0.006832,8.4e-05,gini,5,101,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.531274,0.507728,0.525502,0.515456,0.52473,0.520938,0.008326,4
29,0.212339,0.001293,0.012908,0.000104,entropy,5,201,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.517375,0.50541,0.51932,0.52473,0.518547,0.517076,0.006356,5


## RFC with CV PLAYTIME

In [50]:
param_grid = {
    'max_depth': [2,3,5,10],
    'n_estimators': [31, 101, 201,501],
    'criterion': ['gini', 'entropy'],
}
gr_search = GridSearchCV(RandomForestClassifier(),
                      param_grid)

In [51]:
gr_search

In [52]:
gr_search.fit(X_train, y_train)

In [53]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(32, 16)

In [54]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
22,0.18901,0.000187,0.010648,6.6e-05,entropy,3,201,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.53668,0.520093,0.525502,0.520866,0.543277,0.529283,0.009167,1
23,0.469242,0.001174,0.025757,0.000207,entropy,3,501,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.534363,0.515456,0.526275,0.517002,0.543277,0.527274,0.010512,2
5,0.084525,0.000579,0.00593,0.000465,gini,3,101,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.531274,0.51932,0.527821,0.517774,0.539413,0.52712,0.007963,3
21,0.096343,0.000727,0.005634,3.6e-05,entropy,3,101,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.534363,0.520866,0.522411,0.520866,0.534776,0.526656,0.006487,4
7,0.46467,0.009422,0.026258,0.000255,gini,3,501,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.53668,0.512365,0.527048,0.517002,0.540185,0.526656,0.010781,5


## NBC with CV

In [55]:
param_grid = {
    'var_smoothing': [.000000001,.000000002,.000000003]
}
gr_search = GridSearchCV(GaussianNB(),
                      param_grid)

In [56]:
gr_search

In [57]:
gr_search.fit(X_train, y_train)

In [58]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(3, 14)

In [59]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00336,0.001272,0.00127,0.000137,0.0,{'var_smoothing': 1e-09},0.51583,0.513138,0.517002,0.516229,0.527048,0.517849,0.00478,1
1,0.002007,0.000185,0.000989,0.00011,0.0,{'var_smoothing': 2e-09},0.51583,0.513138,0.517002,0.516229,0.527048,0.517849,0.00478,1
2,0.001559,8.8e-05,0.000761,5.3e-05,0.0,{'var_smoothing': 3e-09},0.51583,0.513138,0.517002,0.516229,0.527048,0.517849,0.00478,1


## GBC with CV

In [60]:
param_grid = {
    'learning_rate': [0.1,0.2,0.5,1.0, 5.0],
    'n_estimators': [55, 101, 201],
    'min_samples_leaf': [1,5,10,20],
}
gr_search = GridSearchCV(GradientBoostingClassifier(),
                      param_grid)

In [61]:
gr_search

In [62]:
gr_search.fit(X_train, y_train)

In [63]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(60, 16)

In [64]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
19,0.133835,0.000968,0.001495,2.8e-05,0.2,10,101,"{'learning_rate': 0.2, 'min_samples_leaf': 10,...",0.528958,0.52473,0.528594,0.508501,0.517774,0.521711,0.007733,1
46,0.139817,0.000597,0.001698,4.9e-05,1.0,20,101,"{'learning_rate': 1.0, 'min_samples_leaf': 20,...",0.504247,0.51932,0.522411,0.516229,0.536321,0.519706,0.01034,2
35,0.274043,0.000586,0.002586,7.8e-05,0.5,20,201,"{'learning_rate': 0.5, 'min_samples_leaf': 20,...",0.511969,0.542504,0.522411,0.5,0.520866,0.51955,0.013975,3
8,0.266541,0.001011,0.002442,6.1e-05,0.1,10,201,"{'learning_rate': 0.1, 'min_samples_leaf': 10,...",0.525097,0.516229,0.520866,0.506182,0.527821,0.519239,0.007618,4
21,0.074591,0.000252,0.001091,1.6e-05,0.2,20,55,"{'learning_rate': 0.2, 'min_samples_leaf': 20,...",0.528958,0.515456,0.51932,0.496909,0.535549,0.519238,0.013217,5


## CATb with CV

In [65]:
# param_grid = {
#     'verbose': [False],
#     'depth': [5, 10, 12,15],
#     'learning_rate': [None,0.1, 0.3,0.7,1.0],
# }
# gr_search = GridSearchCV(CatBoostClassifier(),
#                       param_grid)

In [66]:
param_grid = {
    'verbose': [False],
    'depth': [5, 10, 12],
    'learning_rate': [None,0.1,0.2, 0.5],
}
gr_search = GridSearchCV(CatBoostClassifier(),
                      param_grid)

In [67]:
gr_search

In [68]:
gr_search.fit(X_train, y_train)

In [69]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(12, 16)

In [70]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_learning_rate,param_verbose,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,1.204041,0.007003,0.000938,5.1e-05,5,0.1,False,"{'depth': 5, 'learning_rate': 0.1, 'verbose': ...",0.503475,0.527821,0.520866,0.526275,0.520866,0.51986,0.00866,1
0,1.199648,0.015689,0.001029,0.000179,5,,False,"{'depth': 5, 'learning_rate': None, 'verbose':...",0.510425,0.522411,0.51391,0.520866,0.523184,0.518159,0.00507,2
6,2.232696,0.009576,0.001589,8e-05,10,0.2,False,"{'depth': 10, 'learning_rate': 0.2, 'verbose':...",0.515058,0.516229,0.520093,0.516229,0.520866,0.517695,0.002326,3
7,2.237557,0.009579,0.00173,0.000362,10,0.5,False,"{'depth': 10, 'learning_rate': 0.5, 'verbose':...",0.507336,0.512365,0.526275,0.512365,0.525502,0.516769,0.007674,4
11,4.954589,0.03289,0.001922,2.3e-05,12,0.5,False,"{'depth': 12, 'learning_rate': 0.5, 'verbose':...",0.50888,0.51932,0.522411,0.510046,0.523184,0.516768,0.006114,5


## SCALE for Logistic regression, MLP, etc.

In [71]:
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_validate_scaled = sc_X.transform(X_validate)
X_test_scaled = sc_X.transform(X_test)

## MLP

In [72]:
mlp = MLPClassifier(hidden_layer_sizes=(1024,512,128),activation="relu",random_state=2013,
                    batch_size=500,solver='adam')
mlp.fit(X_train_scaled, y_train)
mlp_preds = mlp.predict(X_train_scaled)
print(mlp.score(X_train_scaled, y_train))
pd.crosstab(mlp_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows



0.6158244475351569


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1817,1123
1,1363,2168


In [73]:
print(f'Accuracy-Train {round(mlp.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(mlp.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,mlp_preds))
print(classification_report(y_validate,mlp.predict(X_validate_scaled)))

Accuracy-Train 0.6158
Accuracy-Validate 0.4879
              precision    recall  f1-score   support

           0       0.62      0.57      0.59      3180
           1       0.61      0.66      0.64      3291

    accuracy                           0.62      6471
   macro avg       0.62      0.62      0.61      6471
weighted avg       0.62      0.62      0.62      6471

              precision    recall  f1-score   support

           0       0.48      0.46      0.47      1178
           1       0.50      0.52      0.51      1216

    accuracy                           0.49      2394
   macro avg       0.49      0.49      0.49      2394
weighted avg       0.49      0.49      0.49      2394



## KNN with CV

In [74]:
param_grid = {
    'n_neighbors': [70],
    'weights': ['uniform','distance'],
    'algorithm':['ball_tree','brute'],
    'leaf_size': [3,4,5,6,7,8,9],
}
gr_search = GridSearchCV(KNeighborsClassifier(),
                      param_grid)

In [75]:
gr_search

In [92]:
gr_search.fit(X_train, y_train)

In [93]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(6, 15)

In [94]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_shrinking,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.359269,0.017115,0.210747,0.000587,1,True,"{'C': 1, 'shrinking': True}",0.528958,0.510046,0.526275,0.522411,0.526275,0.522793,0.006706,1
1,0.349427,0.001965,0.210275,0.000789,1,False,"{'C': 1, 'shrinking': False}",0.528958,0.510046,0.526275,0.522411,0.526275,0.522793,0.006706,1
4,0.361996,0.006691,0.2098,0.000846,10,True,"{'C': 10, 'shrinking': True}",0.523552,0.503091,0.526275,0.529366,0.523957,0.521248,0.009311,3
5,0.356277,0.002108,0.209656,0.000787,10,False,"{'C': 10, 'shrinking': False}",0.523552,0.503091,0.526275,0.529366,0.523957,0.521248,0.009311,3
2,0.354368,0.004005,0.209781,0.000795,5,True,"{'C': 5, 'shrinking': True}",0.521236,0.503091,0.523184,0.52473,0.517774,0.518003,0.00781,5


## LOG with CV

In [79]:
param_grid = {
    'penalty': ['none','l1','l2','elasticnet'],
    'class_weight': ['balanced',None],
    'max_iter':[100,50,200,500]
}
gr_search = GridSearchCV(LogisticRegression(),
                      param_grid)

In [80]:
gr_search

In [81]:
gr_search.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

-

In [82]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(32, 16)

In [83]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_iter,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
18,0.018848,0.002993,0.000851,0.000384,,100,l2,"{'class_weight': None, 'max_iter': 100, 'penal...",0.516602,0.510819,0.528594,0.515456,0.529366,0.520167,0.007455,1
28,0.016857,0.002935,0.00084,0.000327,,500,none,"{'class_weight': None, 'max_iter': 500, 'penal...",0.516602,0.509274,0.527821,0.517002,0.530139,0.520167,0.007738,1
16,0.017573,0.005112,0.000757,0.000289,,100,none,"{'class_weight': None, 'max_iter': 100, 'penal...",0.516602,0.509274,0.527821,0.517002,0.530139,0.520167,0.007738,1
30,0.016478,0.002017,0.000722,0.000349,,500,l2,"{'class_weight': None, 'max_iter': 500, 'penal...",0.516602,0.510819,0.528594,0.515456,0.529366,0.520167,0.007455,1
26,0.017782,0.002724,0.000607,0.000297,,200,l2,"{'class_weight': None, 'max_iter': 200, 'penal...",0.516602,0.510819,0.528594,0.515456,0.529366,0.520167,0.007455,1


## SVM with CV

In [84]:
param_grid = {
    'shrinking': [True,False],
    'C':[1,5,10]
}
gr_search = GridSearchCV(SVC(),
                      param_grid)

In [85]:
gr_search

In [86]:
gr_search.fit(X_train, y_train)

In [87]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(6, 15)

In [88]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_shrinking,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.364644,0.021464,0.210945,0.000577,1,True,"{'C': 1, 'shrinking': True}",0.528958,0.510046,0.526275,0.522411,0.526275,0.522793,0.006706,1
1,0.35163,0.001042,0.210619,0.000875,1,False,"{'C': 1, 'shrinking': False}",0.528958,0.510046,0.526275,0.522411,0.526275,0.522793,0.006706,1
4,0.362856,0.006651,0.209783,0.000651,10,True,"{'C': 10, 'shrinking': True}",0.523552,0.503091,0.526275,0.529366,0.523957,0.521248,0.009311,3
5,0.355914,0.002129,0.209389,0.000695,10,False,"{'C': 10, 'shrinking': False}",0.523552,0.503091,0.526275,0.529366,0.523957,0.521248,0.009311,3
2,0.356126,0.004366,0.209846,0.000901,5,True,"{'C': 5, 'shrinking': True}",0.521236,0.503091,0.523184,0.52473,0.517774,0.518003,0.00781,5


# NOW return to single iterations of each model with optimized hyperparameters.

### DTC: gini, max_d=5,min_samples_leaf=50

In [95]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier(max_depth=5,min_samples_leaf=50)
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1683,1487
1,1497,1804


In [96]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.5389
Accuracy-Validate 0.4875
              precision    recall  f1-score   support

           0       0.53      0.53      0.53      3180
           1       0.55      0.55      0.55      3291

    accuracy                           0.54      6471
   macro avg       0.54      0.54      0.54      6471
weighted avg       0.54      0.54      0.54      6471

              precision    recall  f1-score   support

           0       0.48      0.49      0.49      1178
           1       0.50      0.48      0.49      1216

    accuracy                           0.49      2394
   macro avg       0.49      0.49      0.49      2394
weighted avg       0.49      0.49      0.49      2394



### RFC: entropy, max_d=3,n_est=201

In [97]:
# create the Random Forest model 
rf1 = RandomForestClassifier(criterion='entropy',n_estimators=201,max_depth=3)
# fit the model to the TRAIN dataset1
rf1.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
rf1_preds = rf1.predict(X_train)
pd.crosstab(rf1_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1319,1137
1,1861,2154


In [98]:
print(f'Accuracy-Train {round(rf1.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(rf1.score(X_validate,y_validate),4)}')
print(classification_report(y_train,rf1_preds))
print(classification_report(y_validate,rf1.predict(X_validate)))

Accuracy-Train 0.5367
Accuracy-Validate 0.5092
              precision    recall  f1-score   support

           0       0.54      0.41      0.47      3180
           1       0.54      0.65      0.59      3291

    accuracy                           0.54      6471
   macro avg       0.54      0.53      0.53      6471
weighted avg       0.54      0.54      0.53      6471

              precision    recall  f1-score   support

           0       0.50      0.41      0.45      1178
           1       0.51      0.61      0.56      1216

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.50      2394
weighted avg       0.51      0.51      0.50      2394



### SVM: vanilla

In [99]:
svm = SVC()
svm.fit(X_train_scaled, y_train)
svm_preds = svm.predict(X_train_scaled)
print(svm.score(X_train_scaled, y_train))
pd.crosstab(svm_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.5325297481069386


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1091,936
1,2089,2355


In [100]:
print(f'Accuracy-Train {round(svm.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(svm.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,svm_preds))
print(classification_report(y_validate,svm.predict(X_validate_scaled)))

Accuracy-Train 0.5325
Accuracy-Validate 0.5038
              precision    recall  f1-score   support

           0       0.54      0.34      0.42      3180
           1       0.53      0.72      0.61      3291

    accuracy                           0.53      6471
   macro avg       0.53      0.53      0.51      6471
weighted avg       0.53      0.53      0.52      6471

              precision    recall  f1-score   support

           0       0.49      0.33      0.40      1178
           1       0.51      0.67      0.58      1216

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.49      2394
weighted avg       0.50      0.50      0.49      2394

