In [50]:
#standard ds imports
import pandas as pd
import numpy as np
#viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# .py imports
#import wranglerer as wr
#import modeling as md
import os
#sklearn imports
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#CATboost imports
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import explore_r as ex

In [2]:
# Load the prepared dataset. The path is relative, so the notebook must be run
# from the directory that contains prepped_data.csv.
df = pd.read_csv('prepped_data.csv')

In [3]:
# Remove the date/time and score/win columns, then replace the signed spread
# with its absolute value.
# NOTE(review): presumably the score/win columns are dropped because they are
# only known after the game is played — confirm.
df = (
    df
    .drop(columns=['date', 'day_of_week', 'start_time', 'home_score',
                   'home_wins', 'away_score', 'away_wins', 'total_scores'])
    .assign(spread=lambda frame: frame['spread'].abs())
)

In [4]:
# Preview the first rows of the trimmed frame.
df.head()

Unnamed: 0,week_num,temp,humidity,wind,spread,ou,is_under,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
0,19,72,0,0,1.0,51.0,0,0,1,1,0,0
1,19,52,48,14,2.5,45.5,1,0,1,1,0,1
2,19,22,55,13,1.5,48.0,1,0,1,1,0,1
3,19,32,10,0,5.5,49.0,1,0,1,1,1,1
4,19,55,47,19,4.0,47.0,1,0,1,1,0,1


In [13]:
# Column dtypes and non-null counts, to confirm there are no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10811 entries, 0 to 10810
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   week_num              10811 non-null  int64  
 1   temp                  10811 non-null  int64  
 2   humidity              10811 non-null  int64  
 3   wind                  10811 non-null  int64  
 4   spread                10811 non-null  float64
 5   ou                    10811 non-null  float64
 6   is_under              10811 non-null  int64  
 7   abnormal_start        10811 non-null  int64  
 8   is_playoff            10811 non-null  int64  
 9   playoff_implications  10811 non-null  int64  
 10  is_turf               10811 non-null  int64  
 11  is_outdoor            10811 non-null  int64  
dtypes: float64(2), int64(10)
memory usage: 1013.7 KB


## ESTABLISH BASELINE 50.7%

In [5]:
# Class balance of the target, expressed as proportions.
df['is_under'].value_counts(normalize=True)

1    0.507354
0    0.492646
Name: is_under, dtype: float64

In [6]:
# Baseline "model": predict class 1 for every row. This helper column is
# temporary and is dropped again before the train/validate/test split.
df['baseline'] = 1

In [7]:
# Accuracy: share of all rows where the constant prediction equals the target.
baseline_accuracy = df['baseline'].eq(df['is_under']).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')

baseline accuracy: 50.74%


In [8]:
# Recall: among rows whose ACTUAL label is 1, how often the baseline agrees.
subset = df.loc[df['is_under'] == 1]
baseline_recall = subset['baseline'].eq(subset['is_under']).mean()
print(f'baseline recall: {baseline_recall:.2%}')

baseline recall: 100.00%


In [9]:
# Precision: among rows PREDICTED as 1 (every row, for this baseline), how
# often the actual label is also 1.
subset = df.loc[df['baseline'] == 1]
baseline_precision = subset['baseline'].eq(subset['is_under']).mean()
print(f'baseline precision: {baseline_precision:.2%}')

baseline precision: 50.74%


In [10]:
# Remove the temporary baseline column so it is not passed to the models as a
# feature. Rebinding (rather than inplace=True) together with errors='ignore'
# keeps this cell safe to re-run when the column has already been removed.
df = df.drop(columns='baseline', errors='ignore')

In [11]:
# Split the frame into train / validate / test features and targets, passing
# 'is_under' as the target column name.
# NOTE(review): ex.train_validate_test is a project helper not shown here —
# confirm it uses a fixed random seed (and stratifies on the target, if that is
# intended) so the split is reproducible.
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df,'is_under')

In [12]:
# Shapes of every piece of the split, in the order they were returned.
tuple(
    part.shape
    for part in (X_train, y_train, X_validate, y_validate, X_test, y_test)
)

((6471, 11), (6471,), (2394, 11), (2394,), (1946, 11), (1946,))

In [15]:
# Preview the training features; the target column should not appear here.
X_train.head()

Unnamed: 0,week_num,temp,humidity,wind,spread,ou,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
1713,12,72,0,0,4.0,49.0,0,0,1,1,0
2164,17,72,0,0,7.5,38.0,0,0,1,0,0
2554,9,72,0,0,1.0,42.5,1,0,0,0,0
3117,6,58,52,10,12.5,40.0,0,0,0,0,1
9819,13,51,49,14,5.0,39.5,0,0,1,0,1


# MODELING

## DTC VANILLA

In [18]:
# Create the Decision Tree Classifier with default (unconstrained) settings.
# random_state is fixed because the tree shuffles candidate features at each
# split; without it, Restart & Run All can produce a different tree and scores.
dtc = DecisionTreeClassifier(random_state=42)

In [19]:
# Fit the model to the TRAIN dataset only; validate/test stay unseen.
# As the last expression in the cell, the fitted estimator is also displayed.
dtc.fit(X_train, y_train)

In [20]:
# Predict on the TRAIN dataset with the fitted tree.
dtc_preds = dtc.predict(X_train)
# Confusion matrix: rows are PREDICTIONS, columns are ACTUALS.
pd.crosstab(index=dtc_preds, columns=y_train)

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3180,20
1,0,3271


In [21]:
# Accuracy on train vs. validate; a large gap between the two indicates overfitting.
accuracy_by_split = {
    'Train': dtc.score(X_train, y_train),
    'Validate': dtc.score(X_validate, y_validate),
}
for split_name, accuracy in accuracy_by_split.items():
    print(f'Accuracy-{split_name} {round(accuracy, 4)}')

# Per-class precision / recall / f1: train first (reusing dtc_preds from the
# previous cell), then validate.
train_report = classification_report(y_train, dtc_preds)
validate_report = classification_report(y_validate, dtc.predict(X_validate))
print(train_report)
print(validate_report)

Accuracy-Train 0.9969
Accuracy-Validate 0.5134
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3180
           1       1.00      0.99      1.00      3291

    accuracy                           1.00      6471
   macro avg       1.00      1.00      1.00      6471
weighted avg       1.00      1.00      1.00      6471

              precision    recall  f1-score   support

           0       0.51      0.50      0.50      1178
           1       0.52      0.53      0.52      1216

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.51      0.51      2394



## DTC max_depth=5, min_samples_leaf=5

In [70]:
# Create a regularised Decision Tree Classifier: depth capped at 5 and at least
# 5 samples per leaf, to curb the overfitting seen with the unconstrained tree.
# random_state is fixed so that the fitted tree and its scores are reproducible.
dtc = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5, random_state=42)

In [71]:
# Fit the depth-limited model to the TRAIN dataset only.
# As the last expression in the cell, the fitted estimator is also displayed.
dtc.fit(X_train, y_train)

In [72]:
# Predict on the TRAIN dataset with the depth-limited tree.
dtc_preds = dtc.predict(X_train)
# Confusion matrix: rows are PREDICTIONS, columns are ACTUALS.
pd.crosstab(index=dtc_preds, columns=y_train)

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1604,1365
1,1576,1926


In [73]:
# Accuracy on train vs. validate for the depth-limited tree.
accuracy_by_split = {
    'Train': dtc.score(X_train, y_train),
    'Validate': dtc.score(X_validate, y_validate),
}
for split_name, accuracy in accuracy_by_split.items():
    print(f'Accuracy-{split_name} {round(accuracy, 4)}')

# Per-class precision / recall / f1: train first (reusing dtc_preds from the
# previous cell), then validate.
train_report = classification_report(y_train, dtc_preds)
validate_report = classification_report(y_validate, dtc.predict(X_validate))
print(train_report)
print(validate_report)

Accuracy-Train 0.5455
Accuracy-Validate 0.5029
              precision    recall  f1-score   support

           0       0.54      0.50      0.52      3180
           1       0.55      0.59      0.57      3291

    accuracy                           0.55      6471
   macro avg       0.55      0.54      0.54      6471
weighted avg       0.55      0.55      0.54      6471

              precision    recall  f1-score   support

           0       0.49      0.46      0.48      1178
           1       0.51      0.55      0.53      1216

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



## CROSS_VALIDATION (k=10) AND GRID SEARCH (k=5)

In [54]:
# 10-fold cross-validated accuracy of the current `dtc` on the TRAIN data.
# NOTE(review): the execution counts show this cell (In [54]) was run before
# the max_depth=5 tree cell (In [70]), so the recorded scores may belong to a
# different `dtc` — re-run the notebook top to bottom to confirm.
cross_val_score(estimator=dtc, X=X_train, y=y_train, cv=10)

array([0.49382716, 0.51159196, 0.49304482, 0.46522411, 0.50695518,
       0.51931994, 0.52086553, 0.50695518, 0.51468315, 0.49149923])

In [55]:
# Hyperparameter candidates for the decision-tree grid search:
# 5 depths x 4 leaf sizes x 2 split criteria = 40 combinations.
param_grid = dict(
    max_depth=[None, 18, 12, 10, 5],
    min_samples_leaf=[1, 5, 10, 20],
    criterion=['gini', 'entropy'],
)

In [56]:
# Exhaustive search over param_grid using GridSearchCV's default
# cross-validation. The base tree gets a fixed random_state so the per-fold
# scores, and therefore the candidate ranking, are reproducible across runs.
gr_search = GridSearchCV(DecisionTreeClassifier(random_state=42),
                      param_grid)

In [57]:
# Display the search object's configuration before fitting.
gr_search

In [59]:
# Run the grid search on the TRAIN data only; every parameter combination is
# cross-validated.
gr_search.fit(X_train, y_train)

In [61]:
# Per-candidate cross-validation results from the fitted search (timings,
# parameters, per-split and mean test scores, rank).
results = gr_search.cv_results_

In [62]:
# Tabulate the results: one row per parameter combination.
results_df_init = pd.DataFrame(results)

In [63]:
# Sanity check: expect 40 rows (5 max_depth x 4 min_samples_leaf x 2 criterion).
results_df_init.shape

(40, 16)

In [64]:
# Expand the list of per-candidate parameter dicts into one column per
# hyperparameter.
params = pd.DataFrame(results['params'])

In [65]:
# Display the parameter combinations that were evaluated.
params

Unnamed: 0,criterion,max_depth,min_samples_leaf
0,gini,,1
1,gini,,5
2,gini,,10
3,gini,,20
4,gini,18.0,1
5,gini,18.0,5
6,gini,18.0,10
7,gini,18.0,20
8,gini,12.0,1
9,gini,12.0,5


In [69]:
# Order candidates from best (rank 1) to worst by mean cross-validated score.
# The top mean_test_score values sit around 0.51, barely above the 50.7%
# baseline established earlier.
results_df_init.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
37,0.004833,4e-05,0.000461,7e-06,entropy,5.0,5,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.491892,0.510046,0.522411,0.520093,0.508501,0.510589,0.01081,1
2,0.010725,0.000403,0.000534,3e-06,gini,,10,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.501931,0.492272,0.502318,0.517774,0.532457,0.509351,0.01415,2
38,0.004807,3.4e-05,0.000457,3e-06,entropy,5.0,10,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.492664,0.504637,0.520866,0.51932,0.507728,0.509043,0.010342,3
17,0.00476,4e-05,0.000455,7e-06,gini,5.0,5,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.494981,0.506955,0.510046,0.521638,0.511592,0.509043,0.008582,4
16,0.004739,5.6e-05,0.00046,5e-06,gini,5.0,1,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.494208,0.502318,0.511592,0.520866,0.511592,0.508115,0.009097,5
36,0.004812,3e-05,0.000455,5e-06,entropy,5.0,1,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.491892,0.506955,0.523184,0.520093,0.497682,0.507961,0.012197,6
29,0.01016,0.000134,0.000505,6e-06,entropy,12.0,5,"{'criterion': 'entropy', 'max_depth': 12, 'min...",0.496525,0.5,0.520866,0.498454,0.523184,0.507806,0.011685,7
20,0.01685,0.000423,0.000569,3e-06,entropy,,1,"{'criterion': 'entropy', 'max_depth': None, 'm...",0.492664,0.496136,0.508501,0.511592,0.528594,0.507497,0.012743,8
28,0.010505,0.000105,0.00051,9e-06,entropy,12.0,1,"{'criterion': 'entropy', 'max_depth': 12, 'min...",0.494981,0.479907,0.51932,0.51391,0.529366,0.507497,0.01776,9
30,0.009582,0.000154,0.000505,2e-06,entropy,12.0,10,"{'criterion': 'entropy', 'max_depth': 12, 'min...",0.475676,0.506955,0.520866,0.511592,0.521638,0.507345,0.016782,10
