In [1]:
#standard ds imports
import pandas as pd
import numpy as np
#viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# .py imports
#import wranglerer as wr
#import modeling as md
import os
#sklearn imports
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix,balanced_accuracy_score,roc_auc_score,make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#CATboost imports
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import explore_r as ex
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import xgboost as xgb

### This notebook builds upon the previous notebook (model_r_round3) and uses an ultra-slim version of the original df. Cols: wind, ou, is_outdoor, is_turf

# OK, enough of that...
## Let's work with a hi-speed,low-drag version of the df
### Cols: wind, ou, is_outdoor, is_turf (abnormal_start and is_playoff are dropped below as well; the target is is_under)

In [2]:
# Load the prepped dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('prepped_data_stad.csv')

In [3]:
# List every available column before the frame is slimmed down.
df.columns

Index(['date', 'day_of_week', 'start_time', 'week_num', 'home_score',
       'home_wins', 'away_score', 'away_wins', 'stadium', 'temp', 'humidity',
       'wind', 'spread', 'ou', 'is_under', 'abnormal_start', 'total_scores',
       'is_playoff', 'playoff_implications', 'is_turf', 'is_outdoor'],
      dtype='object')

In [4]:
# Keep only the slim feature set plus the target (is_under).
# Selecting the columns to keep, instead of dropping the other sixteen, makes
# this cell idempotent: the previous drop() raised KeyError when the cell was
# re-run because the dropped columns were already gone from `df`.
# The result is the same five columns, in the same order, as the drop produced.
df = df[['wind', 'ou', 'is_under', 'is_turf', 'is_outdoor']].copy()

In [5]:
# Peek at the first rows of the slimmed frame.
df.head()

Unnamed: 0,wind,ou,is_under,is_turf,is_outdoor
0,0,51.0,0,0,0
1,14,45.5,1,0,1
2,13,48.0,1,0,1
3,0,49.0,1,1,1
4,19,47.0,1,0,1


In [6]:
# Split the frame into train/validate/test features and targets with the
# project helper from explore_r, using 'is_under' as the target column.
# NOTE(review): split sizes, stratification and seeding are decided inside
# ex.train_validate_test -- confirm there that the split is reproducible.
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df,'is_under')

In [7]:
# Shapes of every split, in the order they were unpacked above.
tuple(
    split.shape
    for split in (X_train, y_train, X_validate, y_validate, X_test, y_test)
)

((6471, 4), (6471,), (2394, 4), (2394,), (1946, 4), (1946,))

In [8]:
# Class balance of the target in each split, as proportions rather than counts.
for _target in (y_train, y_validate, y_test):
    print(_target.value_counts(normalize=True))

1    0.50734
0    0.49266
Name: is_under, dtype: float64
1    0.507519
0    0.492481
Name: is_under, dtype: float64
1    0.507194
0    0.492806
Name: is_under, dtype: float64


In [9]:
# Confirm the training features no longer contain the target column.
X_train.head()

Unnamed: 0,wind,ou,is_turf,is_outdoor
894,9,46.5,0,1
6913,11,41.5,1,1
886,0,48.0,1,0
7630,20,37.0,1,1
5466,5,36.5,0,1


In [10]:
# Check dtypes and non-null counts of the held-out test features.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1946 entries, 6593 to 94
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   wind        1946 non-null   int64  
 1   ou          1946 non-null   float64
 2   is_turf     1946 non-null   int64  
 3   is_outdoor  1946 non-null   int64  
dtypes: float64(1), int64(3)
memory usage: 76.0 KB


# MODELING with CV on slim df

## DTC with CV

In [11]:
# Hyperparameter search space for a single decision tree.
param_grid = {
    'max_depth': [None, 18, 12, 10, 5],
    'min_samples_leaf': [1, 5, 10, 20],
    'criterion': ['gini', 'entropy'],
}

# Seed the tree so the search is repeatable after a kernel restart: sklearn
# randomly permutes the features at every split, so an unseeded tree can rank
# near-tied candidates differently from run to run. 2013 is the seed already
# used for the MLP in this notebook.
# GridSearchCV itself is left on its defaults (CV folds and scoring).
gr_search = GridSearchCV(DecisionTreeClassifier(random_state=2013), param_grid)

In [12]:
# Display the search object to double-check the estimator and grid.
gr_search

In [13]:
# Run the cross-validated grid search on the training split only.
gr_search.fit(X_train, y_train)

In [14]:
# Tabulate the raw cross-validation results of the fitted search; the
# trailing .shape shows (rows, columns) of that table.
results = gr_search.cv_results_
results_df_init = pd.DataFrame.from_dict(results)
results_df_init.shape

(40, 16)

In [15]:
# Parameter dicts of every candidate as a frame (kept for ad-hoc inspection).
params = pd.DataFrame(data=results['params'])
# Five best-ranked candidates; rank 1 has the highest mean CV score.
results_df_init.sort_values('rank_test_score', ascending=True).iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
28,0.003355,7.1e-05,0.000478,9e-06,entropy,12.0,1,"{'criterion': 'entropy', 'max_depth': 12, 'min...",0.508108,0.522411,0.527821,0.523184,0.517774,0.51986,0.006684,1
8,0.003294,6.5e-05,0.000485,6e-06,gini,12.0,1,"{'criterion': 'gini', 'max_depth': 12, 'min_sa...",0.508108,0.522411,0.52473,0.529366,0.512365,0.519396,0.007922,2
20,0.003931,0.000166,0.00051,7e-06,entropy,,1,"{'criterion': 'entropy', 'max_depth': None, 'm...",0.517375,0.527821,0.516229,0.513138,0.51932,0.518776,0.004947,3
0,0.008385,0.001941,0.001333,0.000544,gini,,1,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.513514,0.527048,0.516229,0.512365,0.516229,0.517077,0.005211,4
30,0.003093,8.7e-05,0.000461,6e-06,entropy,12.0,10,"{'criterion': 'entropy', 'max_depth': 12, 'min...",0.501931,0.515456,0.514683,0.528594,0.517002,0.515533,0.008467,5


## DTC with CV PLAYTIME

In [None]:
# Narrowed decision-tree search: depth fixed at 5, wider range of leaf sizes.
param_grid = {
    'max_depth': [5],
    'min_samples_leaf': [10, 20, 30, 50, 60],
    'criterion': ['gini', 'entropy'],
}

# Seed the tree so the search is repeatable after a kernel restart: sklearn
# randomly permutes the features at every split, so an unseeded tree can rank
# near-tied candidates differently from run to run. 2013 is the seed already
# used for the MLP in this notebook.
gr_search = GridSearchCV(DecisionTreeClassifier(random_state=2013), param_grid)

In [None]:
# Display the search object to double-check the estimator and grid.
gr_search

In [None]:
# Run the cross-validated grid search on the training split only.
gr_search.fit(X_train, y_train)

In [None]:
# Tabulate the raw cross-validation results of the fitted search; the
# trailing .shape shows (rows, columns) of that table.
results = gr_search.cv_results_
results_df_init = pd.DataFrame.from_dict(results)
results_df_init.shape

In [None]:
# Parameter dicts of every candidate as a frame (kept for ad-hoc inspection).
params = pd.DataFrame(data=results['params'])
# Five best-ranked candidates of this search.
results_df_init.sort_values('rank_test_score', ascending=True).iloc[:5]

## RFC with CV

In [None]:
# Hyperparameter search space for the random forest.
param_grid = {
    'max_depth': [None, 18, 12, 10, 5],
    'n_estimators': [55, 101, 201],
    'criterion': ['gini', 'entropy'],
}

# A random forest bootstraps rows and samples features, so without a seed
# every run of this search scores (and may rank) the candidates differently.
# 2013 is the seed already used for the MLP in this notebook.
gr_search = GridSearchCV(RandomForestClassifier(random_state=2013), param_grid)

In [None]:
# Display the search object to double-check the estimator and grid.
gr_search

In [None]:
# Run the cross-validated grid search on the training split only.
gr_search.fit(X_train, y_train)

In [None]:
# Tabulate the raw cross-validation results of the fitted search; the
# trailing .shape shows (rows, columns) of that table.
results = gr_search.cv_results_
results_df_init = pd.DataFrame.from_dict(results)
results_df_init.shape

In [None]:
# Parameter dicts of every candidate as a frame (kept for ad-hoc inspection).
params = pd.DataFrame(data=results['params'])
# Five best-ranked candidates of this search.
results_df_init.sort_values('rank_test_score', ascending=True).iloc[:5]

## RFC with CV PLAYTIME

In [None]:
# Second random-forest search: shallower trees, wider range of forest sizes.
param_grid = {
    'max_depth': [2, 3, 5, 10],
    'n_estimators': [31, 101, 201, 501],
    'criterion': ['gini', 'entropy'],
}

# A random forest bootstraps rows and samples features, so without a seed
# every run of this search scores (and may rank) the candidates differently.
# 2013 is the seed already used for the MLP in this notebook.
gr_search = GridSearchCV(RandomForestClassifier(random_state=2013), param_grid)

In [None]:
# Display the search object to double-check the estimator and grid.
gr_search

In [None]:
# Run the cross-validated grid search on the training split only.
gr_search.fit(X_train, y_train)

In [None]:
# Tabulate the raw cross-validation results of the fitted search; the
# trailing .shape shows (rows, columns) of that table.
results = gr_search.cv_results_
results_df_init = pd.DataFrame.from_dict(results)
results_df_init.shape

In [None]:
# Parameter dicts of every candidate as a frame (kept for ad-hoc inspection).
params = pd.DataFrame(data=results['params'])
# Five best-ranked candidates of this search.
results_df_init.sort_values('rank_test_score', ascending=True).iloc[:5]

## NBC with CV

In [None]:
# Search space for Gaussian naive Bayes: only the variance-smoothing term,
# tried at 1x, 2x and 3x of 1e-9.
param_grid = {'var_smoothing': [1e-9, 2e-9, 3e-9]}
gr_search = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid)

In [None]:
# Display the search object to double-check the estimator and grid.
gr_search

In [None]:
# Run the cross-validated grid search on the training split only.
gr_search.fit(X_train, y_train)

In [None]:
# Tabulate the raw cross-validation results of the fitted search; the
# trailing .shape shows (rows, columns) of that table.
results = gr_search.cv_results_
results_df_init = pd.DataFrame.from_dict(results)
results_df_init.shape

In [None]:
# Parameter dicts of every candidate as a frame (kept for ad-hoc inspection).
params = pd.DataFrame(data=results['params'])
# Five best-ranked candidates of this search.
results_df_init.sort_values('rank_test_score', ascending=True).iloc[:5]

## GBC with CV

In [None]:
# Hyperparameter search space for gradient boosting.
param_grid = {
    'learning_rate': [0.1, 0.2, 0.5, 1.0, 5.0],
    'n_estimators': [55, 101, 201],
    'min_samples_leaf': [1, 5, 10, 20],
}

# Seed the booster so repeated runs of the search give the same scores; its
# trees permute features at each split, which is otherwise unseeded.
# 2013 is the seed already used for the MLP in this notebook.
gr_search = GridSearchCV(GradientBoostingClassifier(random_state=2013), param_grid)

In [None]:
# Display the search object to double-check the estimator and grid.
gr_search

In [None]:
# Run the cross-validated grid search on the training split only.
gr_search.fit(X_train, y_train)

In [None]:
# Tabulate the raw cross-validation results of the fitted search; the
# trailing .shape shows (rows, columns) of that table.
results = gr_search.cv_results_
results_df_init = pd.DataFrame.from_dict(results)
results_df_init.shape

In [None]:
# Parameter dicts of every candidate as a frame (kept for ad-hoc inspection).
params = pd.DataFrame(data=results['params'])
# Five best-ranked candidates of this search.
results_df_init.sort_values('rank_test_score', ascending=True).iloc[:5]

## CATb with CV

In [None]:
# NOTE(review): disabled, wider CatBoost grid (extra depth 15 and higher
# learning rates); the next cell defines the grid that actually runs.
# Presumably kept for reference -- delete if it is no longer needed.
# param_grid = {
#     'verbose': [False],
#     'depth': [5, 10, 12,15],
#     'learning_rate': [None,0.1, 0.3,0.7,1.0],
# }
# gr_search = GridSearchCV(CatBoostClassifier(),
#                       param_grid)

In [None]:
# Hyperparameter search space for CatBoost.
# 'verbose' is a single-value entry, so verbose=False is simply passed to
# every candidate (presumably to silence CatBoost's training log).
# NOTE(review): learning_rate=None is handed to CatBoost unchanged --
# presumably it selects the library default; confirm.
param_grid = {
    'depth': [5, 10, 12],
    'learning_rate': [None, 0.1, 0.2, 0.5],
    'verbose': [False],
}
gr_search = GridSearchCV(estimator=CatBoostClassifier(), param_grid=param_grid)

In [None]:
# Display the search object to double-check the estimator and grid.
gr_search

In [None]:
# Run the cross-validated grid search on the training split only.
gr_search.fit(X_train, y_train)

In [None]:
# Tabulate the raw cross-validation results of the fitted search; the
# trailing .shape shows (rows, columns) of that table.
results = gr_search.cv_results_
results_df_init = pd.DataFrame.from_dict(results)
results_df_init.shape

In [None]:
# Parameter dicts of every candidate as a frame (kept for ad-hoc inspection).
params = pd.DataFrame(data=results['params'])
# Five best-ranked candidates of this search.
results_df_init.sort_values('rank_test_score', ascending=True).iloc[:5]

## SCALE for Logistic regression, MLP, etc.

In [None]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to all three splits so validate/test never
# influence the statistics.
sc_X = StandardScaler().fit(X_train)
X_train_scaled, X_validate_scaled, X_test_scaled = (
    sc_X.transform(split) for split in (X_train, X_validate, X_test)
)

## MLP

In [None]:
# Three-hidden-layer perceptron trained on the standardized features.
mlp = MLPClassifier(
    hidden_layer_sizes=(1024, 512, 128),
    activation="relu",
    solver='adam',
    batch_size=500,
    random_state=2013,
)
mlp.fit(X_train_scaled, y_train)

# In-sample predictions and accuracy.
mlp_preds = mlp.predict(X_train_scaled)
print(mlp.score(X_train_scaled, y_train))

# Confusion matrix: rows are PREDICTIONS, columns are ACTUALS.
pd.crosstab(mlp_preds, y_train)

In [None]:
# Accuracy on train vs. validate, to spot overfitting at a glance.
mlp_train_acc = round(mlp.score(X_train_scaled, y_train), 4)
mlp_validate_acc = round(mlp.score(X_validate_scaled, y_validate), 4)
print(f'Accuracy-Train {mlp_train_acc}')
print(f'Accuracy-Validate {mlp_validate_acc}')

# Per-class precision/recall/F1: train first (reusing the predictions from
# the cell above), then validate.
mlp_validate_preds = mlp.predict(X_validate_scaled)
print(classification_report(y_train, mlp_preds))
print(classification_report(y_validate, mlp_validate_preds))

## KNN with CV

In [None]:
# Search space for k-nearest neighbours with k fixed at 70.
# NOTE(review): per the sklearn docs, leaf_size only affects tree build/query
# speed and memory, and does not apply to 'brute'; this grid therefore
# effectively compares only the two `weights` settings -- confirm that is
# the intent.
param_grid = {
    'algorithm': ['ball_tree', 'brute'],
    'leaf_size': [3, 4, 5, 6, 7, 8, 9],
    'n_neighbors': [70],
    'weights': ['uniform', 'distance'],
}
gr_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid)

In [None]:
# Display the search object to double-check the estimator and grid.
gr_search

In [None]:
# KNN is distance-based, so run the search on the standardized features
# (X_train_scaled from the scaling section) rather than the raw ones;
# otherwise the wide-range columns (wind, ou) swamp the 0/1 flags
# (is_turf, is_outdoor) in every distance computation.
gr_search.fit(X_train_scaled, y_train)

In [None]:
# Tabulate the raw cross-validation results of the fitted search; the
# trailing .shape shows (rows, columns) of that table.
results = gr_search.cv_results_
results_df_init = pd.DataFrame.from_dict(results)
results_df_init.shape

In [None]:
# Parameter dicts of every candidate as a frame (kept for ad-hoc inspection).
params = pd.DataFrame(data=results['params'])
# Five best-ranked candidates of this search.
results_df_init.sort_values('rank_test_score', ascending=True).iloc[:5]

## LOG with CV

In [None]:
# Hyperparameter search space for logistic regression.
#
# The previous grid paired every penalty with the default 'lbfgs' solver,
# which only supports 'l2' and 'none': all 'l1' and 'elasticnet' candidates
# failed to fit (FitFailedWarning, NaN scores) and were never really searched.
# 'saga' supports all four penalties. 'elasticnet' additionally requires an
# l1_ratio, so it gets its own sub-grid.
#
# NOTE: 'none' is the spelling accepted by scikit-learn < 1.2, the version
# range implied by this notebook's plot_confusion_matrix import; newer
# releases expect penalty=None instead.
# NOTE: saga converges best on standardized features (X_train_scaled from the
# scaling section); on raw features the smaller max_iter values may stop early.
param_grid = [
    {
        'penalty': ['none', 'l1', 'l2'],
        'class_weight': ['balanced', None],
        'max_iter': [100, 50, 200, 500],
    },
    {
        'penalty': ['elasticnet'],
        'l1_ratio': [0.25, 0.5, 0.75],
        'class_weight': ['balanced', None],
        'max_iter': [100, 50, 200, 500],
    },
]
# saga is a stochastic solver, so seed it (2013 is the seed the MLP cell uses).
gr_search = GridSearchCV(LogisticRegression(solver='saga', random_state=2013),
                      param_grid)

In [None]:
# Display the search object to double-check the estimator and grid.
gr_search

In [None]:
# Fit on the standardized features: the scaling section builds X_train_scaled
# specifically for logistic regression, and regularized/iterative solvers
# converge poorly when features sit on very different scales.
gr_search.fit(X_train_scaled, y_train)

In [None]:
# Tabulate the raw cross-validation results of the fitted search; the
# trailing .shape shows (rows, columns) of that table.
results = gr_search.cv_results_
results_df_init = pd.DataFrame.from_dict(results)
results_df_init.shape

In [None]:
# Parameter dicts of every candidate as a frame (kept for ad-hoc inspection).
params = pd.DataFrame(data=results['params'])
# Five best-ranked candidates of this search.
results_df_init.sort_values('rank_test_score', ascending=True).iloc[:5]

## SVM with CV

In [None]:
# Search space for the support-vector classifier: regularization strength C
# and whether the shrinking heuristic is enabled.
param_grid = {
    'C': [1, 5, 10],
    'shrinking': [True, False],
}
gr_search = GridSearchCV(estimator=SVC(), param_grid=param_grid)

In [None]:
# Display the search object to double-check the estimator and grid.
gr_search

In [None]:
# Search on the standardized features so the tuned SVC sees the same kind of
# input as the standalone SVC later in this notebook, which is trained on
# X_train_scaled; SVC is sensitive to feature scale.
gr_search.fit(X_train_scaled, y_train)

In [None]:
# Tabulate the raw cross-validation results of the fitted search; the
# trailing .shape shows (rows, columns) of that table.
results = gr_search.cv_results_
results_df_init = pd.DataFrame.from_dict(results)
results_df_init.shape

In [None]:
# Parameter dicts of every candidate as a frame (kept for ad-hoc inspection).
params = pd.DataFrame(data=results['params'])
# Five best-ranked candidates of this search.
results_df_init.sort_values('rank_test_score', ascending=True).iloc[:5]

# NOW return to single iterations of each model with optimized hyperparameters.

### DTC: gini, max_d=5,min_samples_leaf=50

In [16]:
# create the Decision Tree Classifier model 
# Create the Decision Tree Classifier with the chosen hyperparameters.
# random_state pins sklearn's per-split feature permutation so the tree (and
# the metrics reported below) can be reproduced after a kernel restart;
# 2013 is the seed already used for the MLP in this notebook.
dtc = DecisionTreeClassifier(max_depth=5, min_samples_leaf=50, random_state=2013)
# Fit the model to the TRAIN dataset.
dtc.fit(X_train, y_train)
# In-sample predictions on the TRAIN dataset.
dtc_preds = dtc.predict(X_train)
# Confusion matrix with ACTUALS as columns and PREDICTIONS as rows.
pd.crosstab(dtc_preds, y_train)

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,693,564
1,2495,2719


In [17]:
# Accuracy on train vs. validate, to spot overfitting at a glance.
dtc_train_acc = round(dtc.score(X_train, y_train), 4)
dtc_validate_acc = round(dtc.score(X_validate, y_validate), 4)
print(f'Accuracy-Train {dtc_train_acc}')
print(f'Accuracy-Validate {dtc_validate_acc}')

# Per-class precision/recall/F1: train first (reusing the predictions from
# the cell above), then validate.
dtc_validate_preds = dtc.predict(X_validate)
print(classification_report(y_train, dtc_preds))
print(classification_report(y_validate, dtc_validate_preds))

Accuracy-Train 0.5273
Accuracy-Validate 0.5184
              precision    recall  f1-score   support

           0       0.55      0.22      0.31      3188
           1       0.52      0.83      0.64      3283

    accuracy                           0.53      6471
   macro avg       0.54      0.52      0.48      6471
weighted avg       0.54      0.53      0.48      6471

              precision    recall  f1-score   support

           0       0.53      0.21      0.30      1179
           1       0.52      0.82      0.63      1215

    accuracy                           0.52      2394
   macro avg       0.52      0.51      0.47      2394
weighted avg       0.52      0.52      0.47      2394



### RFC: entropy, max_d=3,n_est=201

In [21]:
# create the Random Forest model 
# Create the Random Forest model with the chosen hyperparameters.
# random_state pins the bootstrap and feature sampling; without it every
# re-run trains a different forest and the train/validate/test metrics
# reported below cannot be reproduced. 2013 is the seed already used for the
# MLP in this notebook.
rf1 = RandomForestClassifier(criterion='entropy', n_estimators=201, max_depth=3,
                             random_state=2013)
# Fit the model to the TRAIN dataset.
rf1.fit(X_train, y_train)
# In-sample predictions on the TRAIN dataset.
rf1_preds = rf1.predict(X_train)
# Confusion matrix with ACTUALS as columns and PREDICTIONS as rows.
pd.crosstab(rf1_preds, y_train)

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1092,902
1,2096,2381


In [22]:
# Accuracy on train vs. validate, to spot overfitting at a glance.
rf1_train_acc = round(rf1.score(X_train, y_train), 4)
rf1_validate_acc = round(rf1.score(X_validate, y_validate), 4)
print(f'Accuracy-Train {rf1_train_acc}')
print(f'Accuracy-Validate {rf1_validate_acc}')

# Per-class precision/recall/F1: train first (reusing the predictions from
# the cell above), then validate.
rf1_validate_preds = rf1.predict(X_validate)
print(classification_report(y_train, rf1_preds))
print(classification_report(y_validate, rf1_validate_preds))

Accuracy-Train 0.5367
Accuracy-Validate 0.5238
              precision    recall  f1-score   support

           0       0.55      0.34      0.42      3188
           1       0.53      0.73      0.61      3283

    accuracy                           0.54      6471
   macro avg       0.54      0.53      0.52      6471
weighted avg       0.54      0.54      0.52      6471

              precision    recall  f1-score   support

           0       0.53      0.34      0.41      1179
           1       0.52      0.70      0.60      1215

    accuracy                           0.52      2394
   macro avg       0.52      0.52      0.51      2394
weighted avg       0.52      0.52      0.51      2394



In [23]:
# Evaluation of the random forest on the held-out TEST split, with the train
# figures repeated alongside for comparison.
rf1_train_acc = round(rf1.score(X_train, y_train), 4)
rf1_test_acc = round(rf1.score(X_test, y_test), 4)
print(f'Accuracy-Train {rf1_train_acc}')
print(f'Accuracy-test {rf1_test_acc}')

# Per-class precision/recall/F1: train first, then test.
rf1_test_preds = rf1.predict(X_test)
print(classification_report(y_train, rf1_preds))
print(classification_report(y_test, rf1_test_preds))

Accuracy-Train 0.5367
Accuracy-test 0.5221
              precision    recall  f1-score   support

           0       0.55      0.34      0.42      3188
           1       0.53      0.73      0.61      3283

    accuracy                           0.54      6471
   macro avg       0.54      0.53      0.52      6471
weighted avg       0.54      0.54      0.52      6471

              precision    recall  f1-score   support

           0       0.52      0.35      0.42       959
           1       0.52      0.69      0.60       987

    accuracy                           0.52      1946
   macro avg       0.52      0.52      0.51      1946
weighted avg       0.52      0.52      0.51      1946



### SVM: vanilla

In [None]:
# Default-parameter support-vector classifier trained on the standardized
# features (fit() returns the estimator itself, so it can be chained).
svm = SVC().fit(X_train_scaled, y_train)

# In-sample predictions and accuracy.
svm_preds = svm.predict(X_train_scaled)
print(svm.score(X_train_scaled, y_train))

# Confusion matrix: rows are PREDICTIONS, columns are ACTUALS.
pd.crosstab(svm_preds, y_train)

In [None]:
# Accuracy on train vs. validate, to spot overfitting at a glance.
svm_train_acc = round(svm.score(X_train_scaled, y_train), 4)
svm_validate_acc = round(svm.score(X_validate_scaled, y_validate), 4)
print(f'Accuracy-Train {svm_train_acc}')
print(f'Accuracy-Validate {svm_validate_acc}')

# Per-class precision/recall/F1: train first (reusing the predictions from
# the cell above), then validate.
svm_validate_preds = svm.predict(X_validate_scaled)
print(classification_report(y_train, svm_preds))
print(classification_report(y_validate, svm_validate_preds))