In [1]:
#standard ds imports
import pandas as pd
import numpy as np
#viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# .py imports
#import wranglerer as wr
#import modeling as md
import os
#sklearn imports
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#CATboost imports
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import explore_r as ex
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

### This notebook builds upon the previous notebook (model_r) and incorporates GridSearch CrossValidation to explore each algorithm with a small subset of possible hyperparameter combinations.

In [2]:
df = pd.read_csv('prepped_data.csv')

In [3]:
df = df.drop(columns=['date', 'day_of_week', 'start_time','home_score',
       'home_wins', 'away_score', 'away_wins','total_scores'])
df['spread'] = abs(df['spread'])

In [4]:
df.head()

Unnamed: 0,week_num,stadium,temp,humidity,wind,spread,ou,is_under,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
0,19,State Farm Stadium,72,0,0,1.0,51.0,0,0,1,1,0,0
1,19,Lincoln Financial Field,52,48,14,2.5,45.5,1,0,1,1,0,1
2,19,GEHA Field at Arrowhead Stadium,22,55,13,1.5,48.0,1,0,1,1,0,1
3,19,Highmark Stadium,32,10,0,5.5,49.0,1,0,1,1,1,1
4,19,Levi's Stadium,55,47,19,4.0,47.0,1,0,1,1,0,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10811 entries, 0 to 10810
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   week_num              10811 non-null  int64  
 1   stadium               10811 non-null  object 
 2   temp                  10811 non-null  int64  
 3   humidity              10811 non-null  int64  
 4   wind                  10811 non-null  int64  
 5   spread                10811 non-null  float64
 6   ou                    10811 non-null  float64
 7   is_under              10811 non-null  int64  
 8   abnormal_start        10811 non-null  int64  
 9   is_playoff            10811 non-null  int64  
 10  playoff_implications  10811 non-null  int64  
 11  is_turf               10811 non-null  int64  
 12  is_outdoor            10811 non-null  int64  
dtypes: float64(2), int64(10), object(1)
memory usage: 1.1+ MB


## ESTABLISH BASELINE 50.7%

In [6]:
df.is_under.value_counts(normalize=True)

1    0.507354
0    0.492646
Name: is_under, dtype: float64

In [7]:
df['baseline'] = 1

In [8]:
baseline_accuracy = (df.baseline == df.is_under).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')

baseline accuracy: 50.74%


In [9]:
subset = df[df.is_under == 1]
baseline_recall = (subset.baseline == subset.is_under).mean()
print(f'baseline recall: {baseline_recall:.2%}')

baseline recall: 100.00%


In [10]:
subset = df[df.baseline == 1]
baseline_precision = (subset.baseline == subset.is_under).mean()
print(f'baseline precision: {baseline_precision:.2%}')

baseline precision: 50.74%


In [11]:
df.drop(columns='baseline',inplace=True)

In [12]:
df_no_stadium = df.drop(columns='stadium')

In [13]:
df_no_stadium.head()

Unnamed: 0,week_num,temp,humidity,wind,spread,ou,is_under,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
0,19,72,0,0,1.0,51.0,0,0,1,1,0,0
1,19,52,48,14,2.5,45.5,1,0,1,1,0,1
2,19,22,55,13,1.5,48.0,1,0,1,1,0,1
3,19,32,10,0,5.5,49.0,1,0,1,1,1,1
4,19,55,47,19,4.0,47.0,1,0,1,1,0,1


In [14]:
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df_no_stadium,'is_under')

In [15]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((6471, 11), (6471,), (2394, 11), (2394,), (1946, 11), (1946,))

In [16]:
X_train.head()

Unnamed: 0,week_num,temp,humidity,wind,spread,ou,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
1713,12,72,0,0,4.0,49.0,0,0,1,1,0
2164,17,72,0,0,7.5,38.0,0,0,1,0,0
2554,9,72,0,0,1.0,42.5,1,0,0,0,0
3117,6,58,52,10,12.5,40.0,0,0,0,0,1
9819,13,51,49,14,5.0,39.5,0,0,1,0,1


# MODELING

## DTC VANILLA

In [17]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier()

In [18]:
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)

In [19]:
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3180,20
1,0,3271


In [20]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.9969
Accuracy-Validate 0.5146
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3180
           1       1.00      0.99      1.00      3291

    accuracy                           1.00      6471
   macro avg       1.00      1.00      1.00      6471
weighted avg       1.00      1.00      1.00      6471

              precision    recall  f1-score   support

           0       0.51      0.50      0.50      1178
           1       0.52      0.53      0.53      1216

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.51      0.51      2394



## DTC maxDepth = X

In [21]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier(max_depth=5,min_samples_leaf=5)

In [22]:
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)

In [23]:
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1604,1365
1,1576,1926


In [24]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.5455
Accuracy-Validate 0.5029
              precision    recall  f1-score   support

           0       0.54      0.50      0.52      3180
           1       0.55      0.59      0.57      3291

    accuracy                           0.55      6471
   macro avg       0.55      0.54      0.54      6471
weighted avg       0.55      0.55      0.54      6471

              precision    recall  f1-score   support

           0       0.49      0.46      0.48      1178
           1       0.51      0.55      0.53      1216

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



## DTC grid_search CROSS_VALIDATION k=5

In [25]:
cross_val_score(dtc, X_train, y_train, cv=10)

array([0.50308642, 0.52550232, 0.46367852, 0.47449768, 0.50695518,
       0.51931994, 0.51931994, 0.51468315, 0.51468315, 0.5007728 ])

In [26]:
param_grid = {
    'max_depth': [None,18, 12, 10, 5],
    'min_samples_leaf': [1, 5, 10, 20],
    'criterion': ['gini', 'entropy'],
}

In [27]:
gr_search = GridSearchCV(DecisionTreeClassifier(),
                      param_grid)

In [28]:
gr_search

In [29]:
gr_search.fit(X_train, y_train)

In [30]:
results = gr_search.cv_results_

In [31]:
results_df_init = pd.DataFrame(results)

In [32]:
results_df_init.shape

(40, 16)

In [33]:
params = pd.DataFrame(results['params'])

In [34]:
params

Unnamed: 0,criterion,max_depth,min_samples_leaf
0,gini,,1
1,gini,,5
2,gini,,10
3,gini,,20
4,gini,18.0,1
5,gini,18.0,5
6,gini,18.0,10
7,gini,18.0,20
8,gini,12.0,1
9,gini,12.0,5


In [35]:
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
37,0.004864,2.1e-05,0.000457,5e-06,entropy,5.0,5,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.491892,0.510046,0.522411,0.521638,0.508501,0.510898,0.011096,1
0,0.021075,0.007536,0.000869,0.000383,gini,,1,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.503475,0.493818,0.495363,0.526275,0.53323,0.510432,0.016262,2
2,0.010786,0.000418,0.00054,2e-06,gini,,10,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.501158,0.501546,0.498454,0.515456,0.534776,0.510278,0.01361,3
24,0.01425,0.000195,0.000534,3e-06,entropy,18.0,1,"{'criterion': 'entropy', 'max_depth': 18, 'min...",0.504247,0.501546,0.512365,0.507728,0.520093,0.509196,0.006541,4
38,0.004841,6.7e-05,0.000453,7e-06,entropy,5.0,10,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.492664,0.504637,0.520866,0.51932,0.507728,0.509043,0.010342,5


## RFC grid_search CROSS_VALIDATION k=5

In [36]:
param_grid = {
    'max_depth': [None,18, 12, 10, 5],
    'n_estimators': [55, 101, 201],
    'criterion': ['gini', 'entropy'],
}
gr_search = GridSearchCV(RandomForestClassifier(),
                      param_grid)

In [37]:
gr_search

In [38]:
gr_search.fit(X_train, y_train)

In [39]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(30, 16)

In [40]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head(11)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
12,0.086265,0.003047,0.003998,7e-05,gini,5,55,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.535907,0.514683,0.528594,0.515456,0.554869,0.529902,0.014842,1
14,0.307034,0.001641,0.013054,0.000217,gini,5,201,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.544402,0.513138,0.52473,0.502318,0.536321,0.524182,0.015209,2
10,0.236203,0.001197,0.010536,0.000232,gini,10,101,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.511969,0.512365,0.520093,0.527821,0.546368,0.523723,0.012727,3
29,0.309148,0.000893,0.012816,0.000242,entropy,5,201,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.52973,0.516229,0.514683,0.51391,0.535549,0.52202,0.008895,4
7,0.261788,0.00141,0.011839,0.000192,gini,12,101,"{'criterion': 'gini', 'max_depth': 12, 'n_esti...",0.528185,0.507728,0.523957,0.523957,0.523957,0.521557,0.007106,5
24,0.134917,0.002064,0.005784,9.5e-05,entropy,10,55,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.50888,0.51391,0.534776,0.514683,0.534776,0.521405,0.011098,6
26,0.486141,0.001357,0.019801,0.000366,entropy,10,201,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.511969,0.511592,0.520866,0.513138,0.543277,0.520168,0.012039,7
8,0.523489,0.005,0.023605,0.000456,gini,12,201,"{'criterion': 'gini', 'max_depth': 12, 'n_esti...",0.51583,0.504637,0.527821,0.522411,0.530139,0.520168,0.009198,8
25,0.245188,0.002473,0.010086,0.000151,entropy,10,101,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.524324,0.513138,0.534776,0.503091,0.525502,0.520166,0.010956,9
9,0.133574,0.008191,0.005942,0.000138,gini,10,55,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.513514,0.513138,0.527048,0.522411,0.520866,0.519395,0.005359,10


## RFC grid_search CROSS_VALIDATION PLAYTIME

In [41]:
param_grid = {
    'max_depth': [5,8,10, 12,15],
    'n_estimators': [31, 55, 101,201,501],
    'criterion': ['gini', 'entropy'],
    'oob_score':['False','True']
}
gr_search = GridSearchCV(RandomForestClassifier(),
                      param_grid)

In [42]:
gr_search

In [43]:
gr_search.fit(X_train, y_train)

In [44]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(100, 17)

In [45]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head(11)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,param_oob_score,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
51,0.058927,0.00309,0.002468,0.000107,entropy,5,31,True,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.542857,0.532457,0.518547,0.517002,0.531685,0.52851,0.009625,1
7,0.346849,0.012091,0.013209,0.000259,gini,5,201,True,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.528185,0.500773,0.539413,0.521638,0.541731,0.526348,0.014744,2
10,0.073657,0.002771,0.003125,7.6e-05,gini,8,31,False,"{'criterion': 'gini', 'max_depth': 8, 'n_estim...",0.526641,0.510046,0.528594,0.517774,0.544822,0.525575,0.011685,3
9,0.906204,0.041024,0.032039,0.000684,gini,5,501,True,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.530502,0.511592,0.53323,0.517002,0.535549,0.525575,0.009501,4
67,0.483511,0.01546,0.016554,0.000327,entropy,8,201,True,"{'criterion': 'entropy', 'max_depth': 8, 'n_es...",0.53668,0.51391,0.520093,0.523184,0.53323,0.525419,0.00841,5
59,0.92543,0.045532,0.031308,0.000606,entropy,5,501,True,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.538996,0.516229,0.530912,0.506955,0.53323,0.525264,0.011839,6
85,0.322484,0.001129,0.011593,0.00022,entropy,12,101,True,"{'criterion': 'entropy', 'max_depth': 12, 'n_e...",0.520463,0.517774,0.523957,0.513138,0.548686,0.524804,0.012453,7
55,0.17818,0.000463,0.00674,9.3e-05,entropy,5,101,True,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.528958,0.517774,0.527048,0.515456,0.534003,0.524648,0.00698,8
58,0.876541,0.003787,0.031233,0.00043,entropy,5,501,False,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.532046,0.517774,0.521638,0.515456,0.534003,0.524184,0.00751,9
4,0.173993,0.000774,0.006868,0.000149,gini,5,101,False,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.535135,0.510046,0.525502,0.513138,0.537094,0.524183,0.011048,10


## NBC grid_search CROSS_VALIDATION k=5

In [46]:
param_grid = {
    'var_smoothing': [.000000001,.000000002,.000000003]
}
gr_search = GridSearchCV(GaussianNB(),
                      param_grid)

In [47]:
gr_search

In [48]:
gr_search.fit(X_train, y_train)

In [49]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(3, 14)

In [50]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003777,0.000639,0.001732,0.000203,0.0,{'var_smoothing': 1e-09},0.51583,0.503091,0.522411,0.512365,0.532457,0.517231,0.009841,1
1,0.002515,0.000264,0.001106,0.000112,0.0,{'var_smoothing': 2e-09},0.51583,0.503091,0.522411,0.512365,0.532457,0.517231,0.009841,1
2,0.001809,0.00026,0.000959,0.000163,0.0,{'var_smoothing': 3e-09},0.51583,0.503091,0.522411,0.512365,0.532457,0.517231,0.009841,1


## GBC grid_search CROSS_VALIDATION k=5

In [51]:
param_grid = {
    'learning_rate': [0.1,0.2,0.5,1.0, 5.0],
    'n_estimators': [55, 101, 201],
    'min_samples_leaf': [1,5,10,20],
}
gr_search = GridSearchCV(GradientBoostingClassifier(),
                      param_grid)

In [52]:
gr_search

In [53]:
gr_search.fit(X_train, y_train)

In [54]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(60, 16)

In [55]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.157495,0.000275,0.001107,3e-05,0.1,5,55,"{'learning_rate': 0.1, 'min_samples_leaf': 5, ...",0.528958,0.527821,0.512365,0.509274,0.532457,0.522175,0.009448,1
0,0.167212,0.012252,0.00112,7.6e-05,0.1,1,55,"{'learning_rate': 0.1, 'min_samples_leaf': 1, ...",0.520463,0.522411,0.510819,0.523184,0.529366,0.521249,0.006006,2
15,0.163812,0.00029,0.001144,1.5e-05,0.2,5,55,"{'learning_rate': 0.2, 'min_samples_leaf': 5, ...",0.500386,0.531685,0.517774,0.518547,0.52473,0.518624,0.010404,3
4,0.292702,0.003608,0.001588,2.3e-05,0.1,5,101,"{'learning_rate': 0.1, 'min_samples_leaf': 5, ...",0.52278,0.522411,0.516229,0.517774,0.51391,0.518621,0.003473,4
7,0.299272,0.001006,0.001682,7e-05,0.1,10,101,"{'learning_rate': 0.1, 'min_samples_leaf': 10,...",0.52973,0.51932,0.51391,0.502318,0.521638,0.517383,0.009092,5


## GBC grid_search CROSS_VALIDATION PLAYTIME

In [56]:
param_grid = {
    'learning_rate': [0.02,0.05,0.1],
    'n_estimators': [21,31,55, 101],
    'min_samples_leaf': [2,5,10],
}
gr_search = GridSearchCV(GradientBoostingClassifier(),
                      param_grid)

In [57]:
gr_search

In [58]:
gr_search.fit(X_train, y_train)

In [59]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(36, 16)

In [60]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
29,0.092557,0.000217,0.000886,2.1e-05,0.1,5,31,"{'learning_rate': 0.1, 'min_samples_leaf': 5, ...",0.531274,0.515456,0.523957,0.513138,0.537867,0.524338,0.009334,1
13,0.09202,0.000475,0.000906,1.5e-05,0.05,2,31,"{'learning_rate': 0.05, 'min_samples_leaf': 2,...",0.538996,0.51932,0.51391,0.509274,0.53864,0.524028,0.012488,2
12,0.062574,0.000109,0.000786,1.5e-05,0.05,2,21,"{'learning_rate': 0.05, 'min_samples_leaf': 2,...",0.537452,0.508501,0.522411,0.508501,0.541731,0.523719,0.013985,3
24,0.062509,0.000109,0.000771,9e-06,0.1,2,21,"{'learning_rate': 0.1, 'min_samples_leaf': 2, ...",0.532819,0.517002,0.523184,0.506955,0.537094,0.523411,0.010841,4
18,0.162073,0.000207,0.001152,3.7e-05,0.05,5,55,"{'learning_rate': 0.05, 'min_samples_leaf': 5,...",0.532819,0.517002,0.51391,0.517002,0.534003,0.522947,0.008626,5


## CATboost grid_search CROSS_VALIDATION k=5

In [61]:
# Create and fit the thing
CATb = CatBoostClassifier(verbose=False,depth=10,learning_rate=1.0)
CATb.fit(X_train,y_train)
CATb_preds = CATb.predict(X_train)
pd.crosstab(CATb_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3167,7
1,13,3284


In [62]:
print(f'Accuracy-Train {round(CATb.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(CATb.score(X_validate,y_validate),4)}')
print(classification_report(y_train,CATb_preds))
print(classification_report(y_validate,CATb.predict(X_validate)))

Accuracy-Train 0.9969
Accuracy-Validate 0.4992
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3180
           1       1.00      1.00      1.00      3291

    accuracy                           1.00      6471
   macro avg       1.00      1.00      1.00      6471
weighted avg       1.00      1.00      1.00      6471

              precision    recall  f1-score   support

           0       0.49      0.51      0.50      1178
           1       0.51      0.49      0.50      1216

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



In [66]:
param_grid = {
    'verbose': [False],
    'depth': [5, 10, 12],
    'learning_rate': [0.1,0.2, 0.5],
}
gr_search = GridSearchCV(CatBoostClassifier(),
                      param_grid)

In [67]:
gr_search

In [69]:
gr_search.fit(X_train, y_train)

In [70]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(9, 16)

In [71]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_learning_rate,param_verbose,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,7.103155,0.022825,0.00231,2e-05,12,0.1,False,"{'depth': 12, 'learning_rate': 0.1, 'verbose':...",0.507336,0.52473,0.520866,0.514683,0.52473,0.518469,0.006669,1
3,2.824779,0.002587,0.001629,1.4e-05,10,0.1,False,"{'depth': 10, 'learning_rate': 0.1, 'verbose':...",0.510425,0.506182,0.51932,0.51391,0.527048,0.515377,0.007252,2
1,1.246808,0.007507,0.001098,0.000214,5,0.2,False,"{'depth': 5, 'learning_rate': 0.2, 'verbose': ...",0.522008,0.510819,0.506182,0.508501,0.518547,0.513211,0.006053,3
0,1.257302,0.013522,0.001209,0.000214,5,0.1,False,"{'depth': 5, 'learning_rate': 0.1, 'verbose': ...",0.516602,0.506182,0.504637,0.500773,0.528594,0.511358,0.010085,4
7,7.093312,0.03617,0.002278,2.3e-05,12,0.2,False,"{'depth': 12, 'learning_rate': 0.2, 'verbose':...",0.494208,0.517774,0.512365,0.516229,0.514683,0.511052,0.008609,5


## SCALE for Logistic regression, MLP, etc.

In [72]:
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_validate_scaled = sc_X.transform(X_validate)
X_test_scaled = sc_X.transform(X_test)

In [73]:
mlp = MLPClassifier(hidden_layer_sizes=(1024,512,128),activation="relu",random_state=2013,
                    batch_size=200,solver='adam')
mlp.fit(X_train_scaled, y_train)
mlp_preds = mlp.predict(X_train_scaled)
print(mlp.score(X_train_scaled, y_train))
pd.crosstab(mlp_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.9313861845155308


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3025,289
1,155,3002


In [74]:
mlp.n_layers_

5

In [75]:
print(f'Accuracy-Train {round(mlp.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(mlp.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,mlp_preds))
print(classification_report(y_validate,mlp.predict(X_validate_scaled)))

Accuracy-Train 0.9314
Accuracy-Validate 0.5004
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      3180
           1       0.95      0.91      0.93      3291

    accuracy                           0.93      6471
   macro avg       0.93      0.93      0.93      6471
weighted avg       0.93      0.93      0.93      6471

              precision    recall  f1-score   support

           0       0.49      0.53      0.51      1178
           1       0.51      0.47      0.49      1216

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



## KNN grid_search CROSS_VALIDATION k=5

In [76]:
param_grid = {
    'n_neighbors': [70],
    'weights': ['uniform','distance'],
    'algorithm':['ball_tree','brute'],
    'leaf_size': [3,4,5,6,7,8,9],
}
gr_search = GridSearchCV(KNeighborsClassifier(),
                      param_grid)

In [77]:
gr_search

In [78]:
gr_search.fit(X_train, y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [79]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(28, 17)

In [80]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head(31)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
27,0.000815,2.7e-05,0.024602,0.000997,brute,9,70,distance,"{'algorithm': 'brute', 'leaf_size': 9, 'n_neig...",0.535135,0.510819,0.509274,0.545595,0.53323,0.526811,0.014329,1
25,0.001038,0.000434,0.025845,0.001295,brute,8,70,distance,"{'algorithm': 'brute', 'leaf_size': 8, 'n_neig...",0.535135,0.510819,0.509274,0.545595,0.53323,0.526811,0.014329,1
23,0.000836,4.2e-05,0.024377,0.001117,brute,7,70,distance,"{'algorithm': 'brute', 'leaf_size': 7, 'n_neig...",0.535135,0.510819,0.509274,0.545595,0.53323,0.526811,0.014329,1
21,0.000841,4.4e-05,0.024959,0.00119,brute,6,70,distance,"{'algorithm': 'brute', 'leaf_size': 6, 'n_neig...",0.535135,0.510819,0.509274,0.545595,0.53323,0.526811,0.014329,1
19,0.000802,2.5e-05,0.025585,0.001902,brute,5,70,distance,"{'algorithm': 'brute', 'leaf_size': 5, 'n_neig...",0.535135,0.510819,0.509274,0.545595,0.53323,0.526811,0.014329,1
17,0.000812,2.1e-05,0.025371,0.001381,brute,4,70,distance,"{'algorithm': 'brute', 'leaf_size': 4, 'n_neig...",0.535135,0.510819,0.509274,0.545595,0.53323,0.526811,0.014329,1
15,0.000819,2.1e-05,0.025109,0.001028,brute,3,70,distance,"{'algorithm': 'brute', 'leaf_size': 3, 'n_neig...",0.535135,0.510819,0.509274,0.545595,0.53323,0.526811,0.014329,1
1,0.00229,4e-05,0.046682,0.000665,ball_tree,3,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 3, 'n_...",0.534363,0.510819,0.510046,0.544822,0.531685,0.526347,0.013718,8
3,0.002274,3.4e-05,0.046734,0.00064,ball_tree,4,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 4, 'n_...",0.534363,0.510819,0.510046,0.544822,0.531685,0.526347,0.013718,8
5,0.002275,3.1e-05,0.046698,0.000615,ball_tree,5,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 5, 'n_...",0.534363,0.510819,0.510046,0.544822,0.531685,0.526347,0.013718,8


## LOG grid_search CROSS_VALIDATION k=5

In [81]:
param_grid = {
    'penalty': ['none','l1','l2','elasticnet'],
    'class_weight': ['balanced',None],
    'max_iter':[100,50,200]
}
gr_search = GridSearchCV(LogisticRegression(),
                      param_grid)

In [82]:
gr_search

In [83]:
gr_search.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [84]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(24, 16)

In [85]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head(11)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_iter,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
22,0.057396,0.003589,0.001049,0.00045,,200,l2,"{'class_weight': None, 'max_iter': 200, 'penal...",0.511969,0.517774,0.528594,0.510046,0.527821,0.519241,0.007754,1
20,0.080129,0.028183,0.000992,0.000355,,200,none,"{'class_weight': None, 'max_iter': 200, 'penal...",0.517375,0.51932,0.521638,0.513138,0.520866,0.518467,0.003038,2
16,0.016354,0.001976,0.000778,0.00036,,50,none,"{'class_weight': None, 'max_iter': 50, 'penalt...",0.515058,0.509274,0.517002,0.512365,0.530912,0.516922,0.007463,3
10,0.072022,0.011972,0.000867,0.000333,balanced,200,l2,"{'class_weight': 'balanced', 'max_iter': 200, ...",0.518919,0.502318,0.523957,0.518547,0.51932,0.516612,0.007412,4
4,0.016908,0.00216,0.000904,0.000322,balanced,50,none,"{'class_weight': 'balanced', 'max_iter': 50, '...",0.515058,0.514683,0.517774,0.516229,0.517774,0.516304,0.001305,5
18,0.017405,0.004808,0.001761,0.001781,,50,l2,"{'class_weight': None, 'max_iter': 50, 'penalt...",0.511969,0.506955,0.517774,0.513138,0.530912,0.51615,0.008144,6
0,0.041363,0.017243,0.001038,0.000734,balanced,100,none,"{'class_weight': 'balanced', 'max_iter': 100, ...",0.522008,0.503091,0.513138,0.513138,0.528594,0.515994,0.008692,7
8,0.06358,0.005083,0.00076,0.000488,balanced,200,none,"{'class_weight': 'balanced', 'max_iter': 200, ...",0.514286,0.504637,0.520093,0.522411,0.513138,0.514913,0.0062,8
14,0.037548,0.010172,0.001131,0.000543,,100,l2,"{'class_weight': None, 'max_iter': 100, 'penal...",0.51583,0.504637,0.516229,0.514683,0.51932,0.51414,0.004993,9
12,0.031367,0.006145,0.000902,0.00035,,100,none,"{'class_weight': None, 'max_iter': 100, 'penal...",0.515058,0.503864,0.51932,0.510046,0.520866,0.513831,0.006242,10


## SVM grid_search CROSS_VALIDATION k=5

In [86]:
param_grid = {
    'shrinking': [True,False],
    'C':[1,5,10]
}
gr_search = GridSearchCV(SVC(),
                      param_grid)

In [87]:
gr_search

In [88]:
gr_search.fit(X_train, y_train)

In [89]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(6, 15)

In [90]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head(11)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_shrinking,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.41409,0.007729,0.218587,0.001134,10,True,"{'C': 10, 'shrinking': True}",0.528185,0.508501,0.525502,0.502318,0.544822,0.521866,0.015106,1
5,0.416331,0.004727,0.218645,0.001001,10,False,"{'C': 10, 'shrinking': False}",0.528185,0.508501,0.525502,0.502318,0.544049,0.521711,0.014872,2
2,0.403883,0.002031,0.219164,0.000833,5,True,"{'C': 5, 'shrinking': True}",0.532046,0.500773,0.523184,0.5,0.539413,0.519083,0.01611,3
3,0.403963,0.001606,0.219194,0.000811,5,False,"{'C': 5, 'shrinking': False}",0.532046,0.500773,0.523184,0.5,0.539413,0.519083,0.01611,3
0,0.408337,0.017755,0.220756,0.000832,1,True,"{'C': 1, 'shrinking': True}",0.528958,0.495363,0.504637,0.506955,0.539413,0.515065,0.016422,5
1,0.396672,0.000548,0.22013,0.001242,1,False,"{'C': 1, 'shrinking': False}",0.528958,0.495363,0.504637,0.506955,0.539413,0.515065,0.016422,5


# NOW return to single iterations of each model with optimized hyperparameters.

### RFC: entropy, max_d=5,n_est=55

In [112]:
# create the Random Forest model 
rf1 = RandomForestClassifier(criterion='entropy',n_estimators=55,max_depth=5)
# fit the model to the TRAIN dataset1
rf1.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
rf1_preds = rf1.predict(X_train)
pd.crosstab(rf1_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1838,1293
1,1342,1998


In [113]:
print(f'Accuracy-Train {round(rf1.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(rf1.score(X_validate,y_validate),4)}')
print(classification_report(y_train,rf1_preds))
print(classification_report(y_validate,rf1.predict(X_validate)))

Accuracy-Train 0.5928
Accuracy-Validate 0.5104
              precision    recall  f1-score   support

           0       0.59      0.58      0.58      3180
           1       0.60      0.61      0.60      3291

    accuracy                           0.59      6471
   macro avg       0.59      0.59      0.59      6471
weighted avg       0.59      0.59      0.59      6471

              precision    recall  f1-score   support

           0       0.50      0.48      0.49      1178
           1       0.52      0.54      0.53      1216

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.51      0.51      2394



### KNN: n_neighbors=70, "distance" based weighting ,"brute" computation

In [116]:
knn = KNeighborsClassifier(n_neighbors=70,weights='distance',algorithm='brute')
knn.fit(X_train_scaled, y_train)
knn_preds = knn.predict(X_train_scaled)
print(knn.score(X_train_scaled, y_train))
pd.crosstab(knn_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.9969092875907897


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3180,20
1,0,3271


In [117]:
print(f'Accuracy-Train {round(knn.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(knn.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,knn_preds))
print(classification_report(y_validate,knn.predict(X_validate_scaled)))

Accuracy-Train 0.9969
Accuracy-Validate 0.5
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3180
           1       1.00      0.99      1.00      3291

    accuracy                           1.00      6471
   macro avg       1.00      1.00      1.00      6471
weighted avg       1.00      1.00      1.00      6471

              precision    recall  f1-score   support

           0       0.49      0.46      0.48      1178
           1       0.51      0.53      0.52      1216

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



### GBC: learning_rate=0.05, n_est=31,min_samples_leaf=2

In [118]:
# create the Random Forest model 
gbc = GradientBoostingClassifier(learning_rate=0.05,n_estimators=31,min_samples_leaf=2)
# fit the model to the TRAIN dataset:
gbc.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
gbc_preds = gbc.predict(X_train)
pd.crosstab(gbc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1646,1346
1,1534,1945


In [119]:
print(f'Accuracy-Train {round(gbc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(gbc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,gbc_preds))
print(classification_report(y_validate,gbc.predict(X_validate)))

Accuracy-Train 0.5549
Accuracy-Validate 0.5038
              precision    recall  f1-score   support

           0       0.55      0.52      0.53      3180
           1       0.56      0.59      0.57      3291

    accuracy                           0.55      6471
   macro avg       0.55      0.55      0.55      6471
weighted avg       0.55      0.55      0.55      6471

              precision    recall  f1-score   support

           0       0.50      0.46      0.48      1178
           1       0.51      0.55      0.53      1216

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



In [127]:
cols = X_train.columns

In [130]:
cols

Index(['week_num', 'temp', 'humidity', 'wind', 'spread', 'ou',
       'abnormal_start', 'is_playoff', 'playoff_implications', 'is_turf',
       'is_outdoor'],
      dtype='object')

In [128]:
gini = gbc.feature_importances_

In [131]:
gini

array([0.07565601, 0.24076834, 0.11100929, 0.28817275, 0.06482177,
       0.12492188, 0.0238728 , 0.        , 0.02971773, 0.04105943,
       0.        ])

In [132]:
feat_df = pd.DataFrame({'Feature':cols, 'Coef':gini})

In [135]:
feat_df.sort_values('Coef',ascending=False)

Unnamed: 0,Feature,Coef
3,wind,0.288173
1,temp,0.240768
5,ou,0.124922
2,humidity,0.111009
0,week_num,0.075656
4,spread,0.064822
9,is_turf,0.041059
8,playoff_implications,0.029718
6,abnormal_start,0.023873
7,is_playoff,0.0
