In [1]:
#standard ds imports
import pandas as pd
import numpy as np
#viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# .py imports
#import wranglerer as wr
#import modeling as md
import os
#sklearn imports
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#CATboost imports
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import explore_r as ex
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

### This notebook builds upon the previous notebook (model_r) and incorporates GridSearch CrossValidation to explore each algorithm with a small subset of possible hyperparameter combinations.

In [2]:
df = pd.read_csv('prepped_data.csv')

In [3]:
df = df.drop(columns=['date', 'day_of_week', 'start_time','home_score',
       'home_wins', 'away_score', 'away_wins','total_scores'])
df['spread'] = abs(df['spread'])

In [4]:
df.head()

Unnamed: 0,week_num,stadium,temp,humidity,wind,spread,ou,is_under,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
0,19,State Farm Stadium,72,0,0,1.0,51.0,0,0,1,1,0,0
1,19,Lincoln Financial Field,52,48,14,2.5,45.5,1,0,1,1,0,1
2,19,GEHA Field at Arrowhead Stadium,22,55,13,1.5,48.0,1,0,1,1,0,1
3,19,Highmark Stadium,32,10,0,5.5,49.0,1,0,1,1,1,1
4,19,Levi's Stadium,55,47,19,4.0,47.0,1,0,1,1,0,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10811 entries, 0 to 10810
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   week_num              10811 non-null  int64  
 1   stadium               10811 non-null  object 
 2   temp                  10811 non-null  int64  
 3   humidity              10811 non-null  int64  
 4   wind                  10811 non-null  int64  
 5   spread                10811 non-null  float64
 6   ou                    10811 non-null  float64
 7   is_under              10811 non-null  int64  
 8   abnormal_start        10811 non-null  int64  
 9   is_playoff            10811 non-null  int64  
 10  playoff_implications  10811 non-null  int64  
 11  is_turf               10811 non-null  int64  
 12  is_outdoor            10811 non-null  int64  
dtypes: float64(2), int64(10), object(1)
memory usage: 1.1+ MB


## ESTABLISH BASELINE 50.7%

In [6]:
df.is_under.value_counts(normalize=True)

1    0.507354
0    0.492646
Name: is_under, dtype: float64

In [7]:
df['baseline'] = 1

In [8]:
baseline_accuracy = (df.baseline == df.is_under).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')

baseline accuracy: 50.74%


In [9]:
subset = df[df.is_under == 1]
baseline_recall = (subset.baseline == subset.is_under).mean()
print(f'baseline recall: {baseline_recall:.2%}')

baseline recall: 100.00%


In [10]:
subset = df[df.baseline == 1]
baseline_precision = (subset.baseline == subset.is_under).mean()
print(f'baseline precision: {baseline_precision:.2%}')

baseline precision: 50.74%


In [11]:
df.drop(columns='baseline',inplace=True)

In [12]:
df_no_stadium = df.drop(columns='stadium')

In [13]:
df_no_stadium.head()

Unnamed: 0,week_num,temp,humidity,wind,spread,ou,is_under,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
0,19,72,0,0,1.0,51.0,0,0,1,1,0,0
1,19,52,48,14,2.5,45.5,1,0,1,1,0,1
2,19,22,55,13,1.5,48.0,1,0,1,1,0,1
3,19,32,10,0,5.5,49.0,1,0,1,1,1,1
4,19,55,47,19,4.0,47.0,1,0,1,1,0,1


In [14]:
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df_no_stadium,'is_under')

In [15]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((6471, 11), (6471,), (2394, 11), (2394,), (1946, 11), (1946,))

In [16]:
print(y_train.value_counts(normalize=True))
print(y_validate.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

1    0.50734
0    0.49266
Name: is_under, dtype: float64
1    0.507519
0    0.492481
Name: is_under, dtype: float64
1    0.507194
0    0.492806
Name: is_under, dtype: float64


In [17]:
X_train.head()

Unnamed: 0,week_num,temp,humidity,wind,spread,ou,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
894,14,78,58,9,3.0,46.5,0,0,1,0,1
6913,11,32,73,11,4.0,41.5,0,0,1,1,1
886,14,72,0,0,3.5,48.0,0,0,0,1,0
7630,12,37,49,20,13.0,37.0,0,0,1,1,1
5466,7,81,24,5,3.5,36.5,0,0,0,0,1


# MODELING

## DTC VANILLA

In [18]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier()

In [19]:
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)

In [20]:
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3188,27
1,0,3256


In [21]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.9958
Accuracy-Validate 0.5084
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3188
           1       1.00      0.99      1.00      3283

    accuracy                           1.00      6471
   macro avg       1.00      1.00      1.00      6471
weighted avg       1.00      1.00      1.00      6471

              precision    recall  f1-score   support

           0       0.50      0.51      0.51      1179
           1       0.52      0.51      0.51      1215

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.51      0.51      2394



## DTC maxDepth = X

In [22]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier(max_depth=5,min_samples_leaf=5)

In [23]:
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)

In [24]:
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1428,1221
1,1760,2062


In [25]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.5393
Accuracy-Validate 0.5046
              precision    recall  f1-score   support

           0       0.54      0.45      0.49      3188
           1       0.54      0.63      0.58      3283

    accuracy                           0.54      6471
   macro avg       0.54      0.54      0.53      6471
weighted avg       0.54      0.54      0.54      6471

              precision    recall  f1-score   support

           0       0.50      0.42      0.46      1179
           1       0.51      0.58      0.54      1215

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



## DTC grid_search CROSS_VALIDATION k=5

In [26]:
cross_val_score(dtc, X_train, y_train, cv=10)

array([0.52006173, 0.52859351, 0.50850077, 0.51313756, 0.48686244,
       0.48222566, 0.51622875, 0.50695518, 0.52395672, 0.48995363])

In [27]:
param_grid = {
    'max_depth': [None,18, 12, 10, 5],
    'min_samples_leaf': [1, 5, 10, 20],
    'criterion': ['gini', 'entropy'],
}

In [28]:
gr_search = GridSearchCV(DecisionTreeClassifier(),
                      param_grid)

In [29]:
gr_search

In [30]:
gr_search.fit(X_train, y_train)

In [31]:
results = gr_search.cv_results_

In [32]:
results_df_init = pd.DataFrame(results)

In [33]:
results_df_init.shape

(40, 16)

In [34]:
params = pd.DataFrame(results['params'])

In [35]:
params

Unnamed: 0,criterion,max_depth,min_samples_leaf
0,gini,,1
1,gini,,5
2,gini,,10
3,gini,,20
4,gini,18.0,1
5,gini,18.0,5
6,gini,18.0,10
7,gini,18.0,20
8,gini,12.0,1
9,gini,12.0,5


In [36]:
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,0.008705,0.000277,0.0005,6e-06,gini,12.0,20,"{'criterion': 'gini', 'max_depth': 12, 'min_sa...",0.511197,0.507728,0.508501,0.500773,0.530139,0.511668,0.009856,1
21,0.013475,0.000663,0.000559,8e-06,entropy,,5,"{'criterion': 'entropy', 'max_depth': None, 'm...",0.498069,0.50541,0.506955,0.506182,0.53323,0.509969,0.012057,2
6,0.010764,0.000501,0.000533,7e-06,gini,18.0,10,"{'criterion': 'gini', 'max_depth': 18, 'min_sa...",0.505019,0.496909,0.516229,0.518547,0.512365,0.509814,0.007918,3
25,0.012814,0.000385,0.000533,8e-06,entropy,18.0,5,"{'criterion': 'entropy', 'max_depth': 18, 'min...",0.514286,0.507728,0.506182,0.497682,0.523184,0.509812,0.008526,4
10,0.009317,0.000111,0.000509,9e-06,gini,12.0,10,"{'criterion': 'gini', 'max_depth': 12, 'min_sa...",0.513514,0.508501,0.497682,0.515456,0.513138,0.509658,0.006409,5


## RFC grid_search CROSS_VALIDATION k=5

In [37]:
param_grid = {
    'max_depth': [None,18, 12, 10, 5],
    'n_estimators': [55, 101, 201],
    'criterion': ['gini', 'entropy'],
}
gr_search = GridSearchCV(RandomForestClassifier(),
                      param_grid)

In [38]:
gr_search

In [39]:
gr_search.fit(X_train, y_train)

In [40]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(30, 16)

In [41]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head(11)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
24,0.133017,0.00076,0.005662,7.1e-05,entropy,10,55,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.535907,0.530912,0.477589,0.55255,0.534003,0.526192,0.025435,1
4,0.326414,0.002122,0.015652,0.000119,gini,18,101,"{'criterion': 'gini', 'max_depth': 18, 'n_esti...",0.514286,0.539413,0.479907,0.551005,0.522411,0.521404,0.024389,2
27,0.085537,0.000592,0.003791,2.7e-05,entropy,5,55,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.518919,0.512365,0.50541,0.545595,0.51932,0.520322,0.013619,3
5,0.650123,0.006646,0.030571,0.000241,gini,18,201,"{'criterion': 'gini', 'max_depth': 18, 'n_esti...",0.510425,0.530912,0.484544,0.551005,0.520866,0.51955,0.02204,4
25,0.244543,0.00194,0.009956,4.8e-05,entropy,10,101,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.511197,0.509274,0.498454,0.547913,0.527048,0.518777,0.017194,5
21,0.152102,0.00085,0.006511,5.2e-05,entropy,12,55,"{'criterion': 'entropy', 'max_depth': 12, 'n_e...",0.512741,0.521638,0.482998,0.544049,0.530912,0.518468,0.020544,6
8,0.526252,0.006801,0.023345,0.000225,gini,12,201,"{'criterion': 'gini', 'max_depth': 12, 'n_esti...",0.513514,0.527048,0.476043,0.541731,0.534003,0.518468,0.023154,7
22,0.27865,0.001058,0.011557,9.8e-05,entropy,12,101,"{'criterion': 'entropy', 'max_depth': 12, 'n_e...",0.510425,0.523184,0.493045,0.536321,0.528594,0.518314,0.015195,8
7,0.266115,0.001449,0.011969,4e-05,gini,12,101,"{'criterion': 'gini', 'max_depth': 12, 'n_esti...",0.515058,0.514683,0.486862,0.550232,0.52473,0.518313,0.020361,9
26,0.48315,0.003499,0.019131,0.000188,entropy,10,201,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.515058,0.523957,0.482998,0.536321,0.531685,0.518004,0.018932,10


## RFC grid_search CROSS_VALIDATION PLAYTIME

In [42]:
param_grid = {
    'max_depth': [5,8,10, 12,15],
    'n_estimators': [31, 55, 101,201,501],
    'criterion': ['gini', 'entropy'],
    'oob_score':['False','True']
}
gr_search = GridSearchCV(RandomForestClassifier(),
                      param_grid)

In [43]:
gr_search

In [44]:
gr_search.fit(X_train, y_train)

In [45]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(100, 17)

In [46]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head(11)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,param_oob_score,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
64,0.237945,0.000905,0.008432,9.4e-05,entropy,8,101,False,"{'criterion': 'entropy', 'max_depth': 8, 'n_es...",0.525869,0.523184,0.48609,0.545595,0.540958,0.524339,0.020951,1
21,0.0805,0.000524,0.003496,5.1e-05,gini,10,31,True,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.520463,0.520866,0.495363,0.547141,0.531685,0.523103,0.016929,2
69,1.199625,0.029358,0.040093,0.000274,entropy,8,501,True,"{'criterion': 'entropy', 'max_depth': 8, 'n_es...",0.505019,0.516229,0.488408,0.556414,0.540958,0.521406,0.024462,3
5,0.170038,0.000947,0.006598,7e-05,gini,5,101,True,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.511197,0.518547,0.497682,0.540958,0.535549,0.520787,0.015847,4
19,1.105652,0.007267,0.041551,0.000442,gini,8,501,True,"{'criterion': 'gini', 'max_depth': 8, 'n_estim...",0.503475,0.51391,0.484544,0.551777,0.548686,0.520479,0.026072,5
35,0.294543,0.003836,0.011981,0.00018,gini,12,101,True,"{'criterion': 'gini', 'max_depth': 12, 'n_esti...",0.504247,0.509274,0.497682,0.549459,0.539413,0.520015,0.020523,6
82,0.174119,0.001723,0.006596,0.000238,entropy,12,55,False,"{'criterion': 'entropy', 'max_depth': 12, 'n_e...",0.509653,0.510819,0.49459,0.556414,0.527048,0.519705,0.021034,7
4,0.172244,0.001118,0.006705,5.9e-05,gini,5,101,False,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.505019,0.527821,0.506182,0.537094,0.51932,0.519087,0.01237,8
32,0.159588,0.001082,0.00668,0.000141,gini,12,55,False,"{'criterion': 'gini', 'max_depth': 12, 'n_esti...",0.528185,0.51391,0.482998,0.543277,0.527048,0.519084,0.0203,9
28,1.277852,0.006054,0.049111,0.000489,gini,10,501,False,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.514286,0.518547,0.479907,0.547913,0.534003,0.518931,0.022855,10


## NBC grid_search CROSS_VALIDATION k=5

In [47]:
param_grid = {
    'var_smoothing': [.000000001,.000000002,.000000003]
}
gr_search = GridSearchCV(GaussianNB(),
                      param_grid)

In [48]:
gr_search

In [49]:
gr_search.fit(X_train, y_train)

In [50]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(3, 14)

In [51]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004008,0.00216,0.001453,0.000356,0.0,{'var_smoothing': 1e-09},0.517375,0.497682,0.503864,0.527821,0.525502,0.514449,0.011855,1
1,0.002031,0.000123,0.000955,0.000105,0.0,{'var_smoothing': 2e-09},0.517375,0.497682,0.503864,0.527821,0.525502,0.514449,0.011855,1
2,0.001655,8.6e-05,0.000743,4.6e-05,0.0,{'var_smoothing': 3e-09},0.517375,0.497682,0.503864,0.527821,0.525502,0.514449,0.011855,1


## GBC grid_search CROSS_VALIDATION k=5

In [52]:
param_grid = {
    'learning_rate': [0.1,0.2,0.5,1.0, 5.0],
    'n_estimators': [55, 101, 201],
    'min_samples_leaf': [1,5,10,20],
}
gr_search = GridSearchCV(GradientBoostingClassifier(),
                      param_grid)

In [53]:
gr_search

In [54]:
gr_search.fit(X_train, y_train)

In [55]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(60, 16)

In [56]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
12,0.161948,0.000715,0.001126,1.5e-05,0.2,1,55,"{'learning_rate': 0.2, 'min_samples_leaf': 1, ...",0.525869,0.53323,0.496136,0.531685,0.530139,0.523412,0.013857,1
4,0.292017,0.004093,0.001675,5.5e-05,0.1,5,101,"{'learning_rate': 0.1, 'min_samples_leaf': 5, ...",0.530502,0.515456,0.489181,0.543277,0.534003,0.522484,0.018909,2
0,0.164831,0.013543,0.001132,7.1e-05,0.1,1,55,"{'learning_rate': 0.1, 'min_samples_leaf': 1, ...",0.538996,0.527821,0.483771,0.528594,0.529366,0.52171,0.019398,3
22,0.297993,0.001284,0.00184,5.4e-05,0.2,20,101,"{'learning_rate': 0.2, 'min_samples_leaf': 20,...",0.508108,0.512365,0.482226,0.563369,0.522411,0.517696,0.026408,4
9,0.163661,0.00311,0.001321,0.000114,0.1,20,55,"{'learning_rate': 0.1, 'min_samples_leaf': 20,...",0.521236,0.527048,0.488408,0.520093,0.522411,0.515839,0.013918,5


## GBC grid_search CROSS_VALIDATION PLAYTIME

In [57]:
param_grid = {
    'learning_rate': [0.02,0.05,0.1],
    'n_estimators': [21,31,55, 101],
    'min_samples_leaf': [2,5,10],
}
gr_search = GridSearchCV(GradientBoostingClassifier(),
                      param_grid)

In [58]:
gr_search

In [59]:
gr_search.fit(X_train, y_train)

In [60]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(36, 16)

In [61]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
31,0.296887,0.001743,0.001704,5.1e-05,0.1,5,101,"{'learning_rate': 0.1, 'min_samples_leaf': 5, ...",0.530502,0.515456,0.489181,0.543277,0.534003,0.522484,0.018909,1
1,0.089248,0.000589,0.000878,6.1e-05,0.02,2,31,"{'learning_rate': 0.02, 'min_samples_leaf': 2,...",0.527413,0.525502,0.501546,0.52473,0.530139,0.521866,0.01033,2
5,0.088754,0.000313,0.00082,3e-05,0.02,5,31,"{'learning_rate': 0.02, 'min_samples_leaf': 5,...",0.523552,0.525502,0.501546,0.523957,0.527821,0.520475,0.009583,3
9,0.090672,0.002445,0.000891,8.3e-05,0.02,10,31,"{'learning_rate': 0.02, 'min_samples_leaf': 10...",0.523552,0.525502,0.501546,0.523957,0.526275,0.520166,0.009363,4
4,0.060525,0.000337,0.000718,3.6e-05,0.02,5,21,"{'learning_rate': 0.02, 'min_samples_leaf': 5,...",0.528958,0.525502,0.50541,0.522411,0.510819,0.51862,0.008989,5


## CATboost grid_search CROSS_VALIDATION k=5

In [62]:
# Create and fit the thing
CATb = CatBoostClassifier(verbose=False,depth=10,learning_rate=1.0)
CATb.fit(X_train,y_train)
CATb_preds = CATb.predict(X_train)
pd.crosstab(CATb_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3178,17
1,10,3266


In [63]:
print(f'Accuracy-Train {round(CATb.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(CATb.score(X_validate,y_validate),4)}')
print(classification_report(y_train,CATb_preds))
print(classification_report(y_validate,CATb.predict(X_validate)))

Accuracy-Train 0.9958
Accuracy-Validate 0.5
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3188
           1       1.00      0.99      1.00      3283

    accuracy                           1.00      6471
   macro avg       1.00      1.00      1.00      6471
weighted avg       1.00      1.00      1.00      6471

              precision    recall  f1-score   support

           0       0.49      0.48      0.49      1179
           1       0.51      0.52      0.51      1215

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



In [64]:
param_grid = {
    'verbose': [False],
    'depth': [5, 10, 12],
    'learning_rate': [0.1,0.2, 0.5],
}
gr_search = GridSearchCV(CatBoostClassifier(),
                      param_grid)

In [65]:
gr_search

In [66]:
gr_search.fit(X_train, y_train)

In [67]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(9, 16)

In [68]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_learning_rate,param_verbose,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,7.078878,0.028331,0.002407,8.9e-05,12,0.1,False,"{'depth': 12, 'learning_rate': 0.1, 'verbose':...",0.512741,0.545595,0.49459,0.53323,0.511592,0.51955,0.017878,1
8,6.519551,0.063714,0.00221,1.1e-05,12,0.5,False,"{'depth': 12, 'learning_rate': 0.5, 'verbose':...",0.498842,0.535549,0.491499,0.51391,0.509274,0.509815,0.015068,2
5,2.700013,0.031329,0.001873,0.000227,10,0.5,False,"{'depth': 10, 'learning_rate': 0.5, 'verbose':...",0.515058,0.521638,0.482226,0.527048,0.501546,0.509503,0.016083,3
3,2.835837,0.026854,0.001684,2.6e-05,10,0.1,False,"{'depth': 10, 'learning_rate': 0.1, 'verbose':...",0.491892,0.529366,0.477589,0.535549,0.509274,0.508734,0.021903,4
7,6.969984,0.034486,0.00228,3.3e-05,12,0.2,False,"{'depth': 12, 'learning_rate': 0.2, 'verbose':...",0.501158,0.528594,0.47527,0.527821,0.510046,0.508578,0.019685,5


## SCALE for Logistic regression, MLP, etc.

In [69]:
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_validate_scaled = sc_X.transform(X_validate)
X_test_scaled = sc_X.transform(X_test)

In [70]:
mlp = MLPClassifier(hidden_layer_sizes=(1024,512,128),activation="relu",random_state=2013,
                    batch_size=200,solver='adam')
mlp.fit(X_train_scaled, y_train)
mlp_preds = mlp.predict(X_train_scaled)
print(mlp.score(X_train_scaled, y_train))
pd.crosstab(mlp_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.9528666357595426


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3011,128
1,177,3155


In [71]:
mlp.n_layers_

5

In [72]:
print(f'Accuracy-Train {round(mlp.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(mlp.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,mlp_preds))
print(classification_report(y_validate,mlp.predict(X_validate_scaled)))

Accuracy-Train 0.9529
Accuracy-Validate 0.5046
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      3188
           1       0.95      0.96      0.95      3283

    accuracy                           0.95      6471
   macro avg       0.95      0.95      0.95      6471
weighted avg       0.95      0.95      0.95      6471

              precision    recall  f1-score   support

           0       0.50      0.51      0.50      1179
           1       0.51      0.50      0.51      1215

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



## KNN grid_search CROSS_VALIDATION k=5

In [73]:
param_grid = {
    'n_neighbors': [70],
    'weights': ['uniform','distance'],
    'algorithm':['ball_tree','brute'],
    'leaf_size': [3,4,5,6,7,8,9],
}
gr_search = GridSearchCV(KNeighborsClassifier(),
                      param_grid)

In [74]:
gr_search

In [75]:
gr_search.fit(X_train, y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [76]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(28, 17)

In [77]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head(31)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
13,0.002132,5.1e-05,0.043323,0.000648,ball_tree,9,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 9, 'n_...",0.528958,0.51932,0.467543,0.527048,0.527821,0.514138,0.023544,1
7,0.002101,2.3e-05,0.043425,0.000718,ball_tree,6,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 6, 'n_...",0.528958,0.51932,0.467543,0.527048,0.527821,0.514138,0.023544,1
11,0.002086,2.4e-05,0.043292,0.000646,ball_tree,8,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 8, 'n_...",0.528958,0.51932,0.467543,0.527048,0.527821,0.514138,0.023544,1
9,0.002112,4.1e-05,0.043653,0.000564,ball_tree,7,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 7, 'n_...",0.528958,0.51932,0.467543,0.527048,0.527821,0.514138,0.023544,1
25,0.000814,2.2e-05,0.026051,0.001828,brute,8,70,distance,"{'algorithm': 'brute', 'leaf_size': 8, 'n_neig...",0.528185,0.518547,0.470634,0.52473,0.527821,0.513983,0.021948,5
23,0.000824,3.3e-05,0.025263,0.001209,brute,7,70,distance,"{'algorithm': 'brute', 'leaf_size': 7, 'n_neig...",0.528185,0.518547,0.470634,0.52473,0.527821,0.513983,0.021948,5
21,0.000832,5.6e-05,0.024931,0.001153,brute,6,70,distance,"{'algorithm': 'brute', 'leaf_size': 6, 'n_neig...",0.528185,0.518547,0.470634,0.52473,0.527821,0.513983,0.021948,5
19,0.000811,1.7e-05,0.024731,0.000951,brute,5,70,distance,"{'algorithm': 'brute', 'leaf_size': 5, 'n_neig...",0.528185,0.518547,0.470634,0.52473,0.527821,0.513983,0.021948,5
17,0.000817,2.6e-05,0.024566,0.000843,brute,4,70,distance,"{'algorithm': 'brute', 'leaf_size': 4, 'n_neig...",0.528185,0.518547,0.470634,0.52473,0.527821,0.513983,0.021948,5
15,0.000829,1.8e-05,0.024888,0.000706,brute,3,70,distance,"{'algorithm': 'brute', 'leaf_size': 3, 'n_neig...",0.528185,0.518547,0.470634,0.52473,0.527821,0.513983,0.021948,5


## LOG grid_search CROSS_VALIDATION k=5

In [78]:
param_grid = {
    'penalty': ['none','l1','l2','elasticnet'],
    'class_weight': ['balanced',None],
    'max_iter':[100,50,200]
}
gr_search = GridSearchCV(LogisticRegression(),
                      param_grid)

In [79]:
gr_search

In [80]:
gr_search.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [81]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(24, 16)

In [82]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head(11)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_iter,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
20,0.067161,0.010093,0.001061,0.000409,,200,none,"{'class_weight': None, 'max_iter': 200, 'penal...",0.525869,0.507728,0.497682,0.51932,0.503091,0.510738,0.010398,1
22,0.081435,0.016794,0.001249,0.000339,,200,l2,"{'class_weight': None, 'max_iter': 200, 'penal...",0.526641,0.509274,0.493818,0.517002,0.504637,0.510274,0.011106,2
14,0.037878,0.006469,0.000835,0.000591,,100,l2,"{'class_weight': None, 'max_iter': 100, 'penal...",0.525869,0.509274,0.493818,0.518547,0.495363,0.508574,0.01258,3
12,0.037593,0.010323,0.000949,0.000445,,100,none,"{'class_weight': None, 'max_iter': 100, 'penal...",0.519691,0.511592,0.492272,0.515456,0.502318,0.508266,0.009843,4
6,0.018426,0.003536,0.000762,0.000377,balanced,50,l2,"{'class_weight': 'balanced', 'max_iter': 50, '...",0.511969,0.500773,0.492272,0.525502,0.496909,0.505485,0.011942,5
16,0.018579,0.004576,0.000705,0.000391,,50,none,"{'class_weight': None, 'max_iter': 50, 'penalt...",0.509653,0.511592,0.493818,0.517774,0.493045,0.505176,0.009961,6
10,0.073666,0.035551,0.000729,0.000388,balanced,200,l2,"{'class_weight': 'balanced', 'max_iter': 200, ...",0.521236,0.514683,0.484544,0.512365,0.493045,0.505174,0.013949,7
4,0.023744,0.008862,0.000923,0.000448,balanced,50,none,"{'class_weight': 'balanced', 'max_iter': 50, '...",0.519691,0.508501,0.482226,0.517774,0.496909,0.50502,0.013967,8
18,0.019453,0.003739,0.000781,0.000357,,50,l2,"{'class_weight': None, 'max_iter': 50, 'penalt...",0.514286,0.510046,0.484544,0.515456,0.498454,0.504557,0.011673,9
8,0.068718,0.007945,0.001915,0.001656,balanced,200,none,"{'class_weight': 'balanced', 'max_iter': 200, ...",0.514286,0.511592,0.489181,0.508501,0.493045,0.503321,0.010208,10


## SVM grid_search CROSS_VALIDATION k=5

In [83]:
param_grid = {
    'shrinking': [True,False],
    'C':[1,5,10]
}
gr_search = GridSearchCV(SVC(),
                      param_grid)

In [84]:
gr_search

In [85]:
gr_search.fit(X_train, y_train)

In [86]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(6, 15)

In [87]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head(11)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_shrinking,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.404305,0.005469,0.222049,0.005807,10,True,"{'C': 10, 'shrinking': True}",0.527413,0.507728,0.509274,0.534003,0.527048,0.521093,0.010587,1
5,0.41554,0.009967,0.219741,0.001375,10,False,"{'C': 10, 'shrinking': False}",0.527413,0.507728,0.509274,0.534003,0.527048,0.521093,0.010587,1
2,0.39793,0.002617,0.219404,0.000763,5,True,"{'C': 5, 'shrinking': True}",0.525869,0.508501,0.50541,0.525502,0.531685,0.519393,0.010436,3
3,0.398342,0.001694,0.219931,0.001258,5,False,"{'C': 5, 'shrinking': False}",0.525869,0.508501,0.504637,0.525502,0.531685,0.519239,0.010645,4
0,0.404234,0.012649,0.223019,0.000948,1,True,"{'C': 1, 'shrinking': True}",0.52278,0.514683,0.502318,0.513138,0.528594,0.516303,0.008958,5
1,0.394556,0.00083,0.222639,0.001194,1,False,"{'C': 1, 'shrinking': False}",0.52278,0.514683,0.502318,0.513138,0.527821,0.516148,0.008749,6


# NOW return to single iterations of each model with optimized hyperparameters.

### RFC: entropy, max_d=5,n_est=55

In [88]:
# create the Random Forest model 
rf1 = RandomForestClassifier(criterion='entropy',n_estimators=55,max_depth=5)
# fit the model to the TRAIN dataset1
rf1.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
rf1_preds = rf1.predict(X_train)
pd.crosstab(rf1_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1449,919
1,1739,2364


In [89]:
print(f'Accuracy-Train {round(rf1.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(rf1.score(X_validate,y_validate),4)}')
print(classification_report(y_train,rf1_preds))
print(classification_report(y_validate,rf1.predict(X_validate)))

Accuracy-Train 0.5892
Accuracy-Validate 0.5054
              precision    recall  f1-score   support

           0       0.61      0.45      0.52      3188
           1       0.58      0.72      0.64      3283

    accuracy                           0.59      6471
   macro avg       0.59      0.59      0.58      6471
weighted avg       0.59      0.59      0.58      6471

              precision    recall  f1-score   support

           0       0.50      0.37      0.43      1179
           1       0.51      0.63      0.57      1215

    accuracy                           0.51      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.51      0.50      2394



### KNN: n_neighbors=70, "distance" based weighting ,"brute" computation

In [90]:
knn = KNeighborsClassifier(n_neighbors=70,weights='distance',algorithm='brute')
knn.fit(X_train_scaled, y_train)
knn_preds = knn.predict(X_train_scaled)
print(knn.score(X_train_scaled, y_train))
pd.crosstab(knn_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.9958275382475661


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3184,23
1,4,3260


In [91]:
print(f'Accuracy-Train {round(knn.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(knn.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,knn_preds))
print(classification_report(y_validate,knn.predict(X_validate_scaled)))

Accuracy-Train 0.9958
Accuracy-Validate 0.5058
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3188
           1       1.00      0.99      1.00      3283

    accuracy                           1.00      6471
   macro avg       1.00      1.00      1.00      6471
weighted avg       1.00      1.00      1.00      6471

              precision    recall  f1-score   support

           0       0.50      0.48      0.49      1179
           1       0.51      0.53      0.52      1215

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.51      0.51      2394



### GBC: learning_rate=0.05, n_est=31,min_samples_leaf=2

In [92]:
# create the Random Forest model 
gbc = GradientBoostingClassifier(learning_rate=0.05,n_estimators=31,min_samples_leaf=2)
# fit the model to the TRAIN dataset:
gbc.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
gbc_preds = gbc.predict(X_train)
pd.crosstab(gbc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1059,814
1,2129,2469


In [93]:
print(f'Accuracy-Train {round(gbc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(gbc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,gbc_preds))
print(classification_report(y_validate,gbc.predict(X_validate)))

Accuracy-Train 0.5452
Accuracy-Validate 0.5205
              precision    recall  f1-score   support

           0       0.57      0.33      0.42      3188
           1       0.54      0.75      0.63      3283

    accuracy                           0.55      6471
   macro avg       0.55      0.54      0.52      6471
weighted avg       0.55      0.55      0.52      6471

              precision    recall  f1-score   support

           0       0.52      0.32      0.39      1179
           1       0.52      0.72      0.60      1215

    accuracy                           0.52      2394
   macro avg       0.52      0.52      0.50      2394
weighted avg       0.52      0.52      0.50      2394



In [94]:
cols = X_train.columns

In [95]:
cols

Index(['week_num', 'temp', 'humidity', 'wind', 'spread', 'ou',
       'abnormal_start', 'is_playoff', 'playoff_implications', 'is_turf',
       'is_outdoor'],
      dtype='object')

In [96]:
gini = gbc.feature_importances_

In [97]:
gini

array([0.06697706, 0.10438619, 0.14205592, 0.30226101, 0.16706896,
       0.20315575, 0.00778298, 0.        , 0.        , 0.00631212,
       0.        ])

In [98]:
feat_df = pd.DataFrame({'Feature':cols, 'Coef':gini})

In [99]:
feat_df.sort_values('Coef',ascending=False)

Unnamed: 0,Feature,Coef
3,wind,0.302261
5,ou,0.203156
4,spread,0.167069
2,humidity,0.142056
1,temp,0.104386
0,week_num,0.066977
6,abnormal_start,0.007783
9,is_turf,0.006312
7,is_playoff,0.0
8,playoff_implications,0.0
