In [1]:
#standard ds imports
import pandas as pd
import numpy as np
#viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# .py imports
#import wranglerer as wr
#import modeling as md
import os
#sklearn imports
from sklearn.model_selection import train_test_split
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#CATboost imports
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import explore_r as ex
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

### This notebook contains the basic building blocks for running a single instance of each ML algorithm.  There is some minor tinkering with hyperparameters, however, this is more of an exploratory step-through of each of the algorithms (vanila-ish).

In [2]:
df = pd.read_csv('prepped_data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10811 entries, 0 to 10810
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   date                  10811 non-null  object 
 1   day_of_week           10811 non-null  object 
 2   start_time            10811 non-null  int64  
 3   week_num              10811 non-null  int64  
 4   home_score            10811 non-null  int64  
 5   home_wins             10811 non-null  int64  
 6   away_score            10811 non-null  int64  
 7   away_wins             10811 non-null  int64  
 8   stadium               10811 non-null  object 
 9   temp                  10811 non-null  int64  
 10  humidity              10811 non-null  int64  
 11  wind                  10811 non-null  int64  
 12  spread                10811 non-null  float64
 13  ou                    10811 non-null  float64
 14  is_under              10811 non-null  int64  
 15  abnormal_start     

In [4]:
df.columns

Index(['date', 'day_of_week', 'start_time', 'week_num', 'home_score',
       'home_wins', 'away_score', 'away_wins', 'stadium', 'temp', 'humidity',
       'wind', 'spread', 'ou', 'is_under', 'abnormal_start', 'total_scores',
       'is_playoff', 'playoff_implications', 'is_turf', 'is_outdoor'],
      dtype='object')

In [5]:
df = df.drop(columns=['date', 'day_of_week', 'start_time','home_score',
       'home_wins', 'away_score', 'away_wins','stadium','total_scores'])

In [6]:
df

Unnamed: 0,week_num,temp,humidity,wind,spread,ou,is_under,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
0,19,72,0,0,1.0,51.0,0,0,1,1,0,0
1,19,52,48,14,2.5,45.5,1,0,1,1,0,1
2,19,22,55,13,1.5,48.0,1,0,1,1,0,1
3,19,32,10,0,5.5,49.0,1,0,1,1,1,1
4,19,55,47,19,4.0,47.0,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10806,1,76,71,8,1.0,37.0,1,0,0,0,1,1
10807,1,73,76,10,2.0,41.0,0,0,0,0,0,1
10808,1,70,77,10,4.0,36.5,0,0,0,0,0,1
10809,1,72,0,0,2.0,42.5,0,0,0,0,1,0


In [7]:
df['spread'] = abs(df['spread'])

In [8]:
df

Unnamed: 0,week_num,temp,humidity,wind,spread,ou,is_under,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
0,19,72,0,0,1.0,51.0,0,0,1,1,0,0
1,19,52,48,14,2.5,45.5,1,0,1,1,0,1
2,19,22,55,13,1.5,48.0,1,0,1,1,0,1
3,19,32,10,0,5.5,49.0,1,0,1,1,1,1
4,19,55,47,19,4.0,47.0,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10806,1,76,71,8,1.0,37.0,1,0,0,0,1,1
10807,1,73,76,10,2.0,41.0,0,0,0,0,0,1
10808,1,70,77,10,4.0,36.5,0,0,0,0,0,1
10809,1,72,0,0,2.0,42.5,0,0,0,0,1,0


## ok, finally with proper df, establish baseline

In [9]:
df.is_under.value_counts(normalize=True)

1    0.507354
0    0.492646
Name: is_under, dtype: float64

In [10]:
df['baseline'] = 1

In [11]:
baseline_accuracy = (df.baseline == df.is_under).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')

baseline accuracy: 50.74%


In [12]:
subset = df[df.is_under == 1]
baseline_recall = (subset.baseline == subset.is_under).mean()
print(f'baseline recall: {baseline_recall:.2%}')

baseline recall: 100.00%


In [13]:
subset = df[df.baseline == 1]
baseline_precision = (subset.baseline == subset.is_under).mean()
print(f'baseline precision: {baseline_precision:.2%}')

baseline precision: 50.74%


In [14]:
df.drop(columns='baseline',inplace=True)

In [15]:
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df,'is_under')


In [16]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 


((6471, 11), (6471,), (2394, 11), (2394,), (1946, 11), (1946,))

In [17]:
print(y_train.value_counts(normalize=True))
print(y_validate.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

1    0.50734
0    0.49266
Name: is_under, dtype: float64
1    0.507519
0    0.492481
Name: is_under, dtype: float64
1    0.507194
0    0.492806
Name: is_under, dtype: float64


In [18]:
X_train.head()

Unnamed: 0,week_num,temp,humidity,wind,spread,ou,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
894,14,78,58,9,3.0,46.5,0,0,1,0,1
6913,11,32,73,11,4.0,41.5,0,0,1,1,1
886,14,72,0,0,3.5,48.0,0,0,0,1,0
7630,12,37,49,20,13.0,37.0,0,0,1,1,1
5466,7,81,24,5,3.5,36.5,0,0,0,0,1


## MODELING

## DTC maxDepth=4

In [19]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier(max_depth=4)

In [20]:
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)

In [21]:
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,764,630
1,2424,2653


In [22]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.528
Accuracy-Validate 0.5267
              precision    recall  f1-score   support

           0       0.55      0.24      0.33      3188
           1       0.52      0.81      0.63      3283

    accuracy                           0.53      6471
   macro avg       0.54      0.52      0.48      6471
weighted avg       0.54      0.53      0.49      6471

              precision    recall  f1-score   support

           0       0.54      0.25      0.34      1179
           1       0.52      0.80      0.63      1215

    accuracy                           0.53      2394
   macro avg       0.53      0.52      0.48      2394
weighted avg       0.53      0.53      0.49      2394



## DTC maxDepth=X

In [23]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier(max_depth=9,min_samples_leaf=10)

In [24]:
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)

In [25]:
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1625,991
1,1563,2292


In [26]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.6053
Accuracy-Validate 0.4912
              precision    recall  f1-score   support

           0       0.62      0.51      0.56      3188
           1       0.59      0.70      0.64      3283

    accuracy                           0.61      6471
   macro avg       0.61      0.60      0.60      6471
weighted avg       0.61      0.61      0.60      6471

              precision    recall  f1-score   support

           0       0.48      0.40      0.44      1179
           1       0.50      0.58      0.54      1215

    accuracy                           0.49      2394
   macro avg       0.49      0.49      0.49      2394
weighted avg       0.49      0.49      0.49      2394



## RFC maxDepth =6

In [27]:
# create the Random Forest model 
rf1 = RandomForestClassifier(n_estimators=201,max_depth=5)
# fit the model to the TRAIN dataset1
rf1.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
rf1_preds = rf1.predict(X_train)
pd.crosstab(rf1_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1532,901
1,1656,2382


In [28]:
print(f'Accuracy-Train {round(rf1.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(rf1.score(X_validate,y_validate),4)}')
print(classification_report(y_train,rf1_preds))
print(classification_report(y_validate,rf1.predict(X_validate)))

Accuracy-Train 0.6049
Accuracy-Validate 0.505
              precision    recall  f1-score   support

           0       0.63      0.48      0.55      3188
           1       0.59      0.73      0.65      3283

    accuracy                           0.60      6471
   macro avg       0.61      0.60      0.60      6471
weighted avg       0.61      0.60      0.60      6471

              precision    recall  f1-score   support

           0       0.50      0.39      0.44      1179
           1       0.51      0.62      0.56      1215

    accuracy                           0.51      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.51      0.50      2394



## NB classifier

In [29]:
# create the Random Forest model 
nbc = GaussianNB()
# fit the model to the TRAIN dataset1
nbc.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
nbc_preds = nbc.predict(X_train)
pd.crosstab(nbc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,969,848
1,2219,2435


In [30]:
print(f'Accuracy-Train {round(nbc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(nbc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,nbc_preds))
print(classification_report(y_validate,nbc.predict(X_validate)))

Accuracy-Train 0.526
Accuracy-Validate 0.5276
              precision    recall  f1-score   support

           0       0.53      0.30      0.39      3188
           1       0.52      0.74      0.61      3283

    accuracy                           0.53      6471
   macro avg       0.53      0.52      0.50      6471
weighted avg       0.53      0.53      0.50      6471

              precision    recall  f1-score   support

           0       0.54      0.30      0.39      1179
           1       0.52      0.74      0.62      1215

    accuracy                           0.53      2394
   macro avg       0.53      0.52      0.50      2394
weighted avg       0.53      0.53      0.50      2394



## GBC

In [31]:
# create the Random Forest model 
gbc = GradientBoostingClassifier()
# fit the model to the TRAIN dataset:
gbc.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
gbc_preds = gbc.predict(X_train)
pd.crosstab(gbc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1815,1183
1,1373,2100


In [32]:
print(f'Accuracy-Train {round(gbc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(gbc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,gbc_preds))
print(classification_report(y_validate,gbc.predict(X_validate)))

Accuracy-Train 0.605
Accuracy-Validate 0.5008
              precision    recall  f1-score   support

           0       0.61      0.57      0.59      3188
           1       0.60      0.64      0.62      3283

    accuracy                           0.61      6471
   macro avg       0.61      0.60      0.60      6471
weighted avg       0.61      0.61      0.60      6471

              precision    recall  f1-score   support

           0       0.49      0.48      0.48      1179
           1       0.51      0.52      0.52      1215

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



## CATBoost

In [33]:
# Create and fit the thing
CATb = CatBoostClassifier(verbose=False,depth=5)
CATb.fit(X_train,y_train)
CATb_preds = CATb.predict(X_train)
pd.crosstab(CATb_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2160,917
1,1028,2366


In [34]:
print(f'Accuracy-Train {round(CATb.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(CATb.score(X_validate,y_validate),4)}')
print(classification_report(y_train,CATb_preds))
print(classification_report(y_validate,CATb.predict(X_validate)))

Accuracy-Train 0.6994
Accuracy-Validate 0.5
              precision    recall  f1-score   support

           0       0.70      0.68      0.69      3188
           1       0.70      0.72      0.71      3283

    accuracy                           0.70      6471
   macro avg       0.70      0.70      0.70      6471
weighted avg       0.70      0.70      0.70      6471

              precision    recall  f1-score   support

           0       0.49      0.47      0.48      1179
           1       0.51      0.53      0.52      1215

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



## SCALE for Logistic regression, MLP, etc.

In [35]:
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_validate_scaled = sc_X.transform(X_validate)
X_test_scaled = sc_X.transform(X_test)

In [36]:
mlp = MLPClassifier(hidden_layer_sizes=(256,128,64),activation="logistic",random_state=2013,
                    batch_size=100,solver='adam')
mlp.fit(X_train_scaled, y_train)
mlp_preds = mlp.predict(X_train_scaled)
print(mlp.score(X_train_scaled, y_train))
pd.crosstab(mlp_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.5241848246020708


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,584,475
1,2604,2808


In [37]:
mlp.n_layers_

5

In [38]:
print(f'Accuracy-Train {round(mlp.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(mlp.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,mlp_preds))
print(classification_report(y_validate,mlp.predict(X_validate_scaled)))

Accuracy-Train 0.5242
Accuracy-Validate 0.5071
              precision    recall  f1-score   support

           0       0.55      0.18      0.28      3188
           1       0.52      0.86      0.65      3283

    accuracy                           0.52      6471
   macro avg       0.54      0.52      0.46      6471
weighted avg       0.53      0.52      0.46      6471

              precision    recall  f1-score   support

           0       0.50      0.18      0.27      1179
           1       0.51      0.82      0.63      1215

    accuracy                           0.51      2394
   macro avg       0.50      0.50      0.45      2394
weighted avg       0.50      0.51      0.45      2394



## KNN

In [39]:
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_scaled, y_train)
knn_preds = knn.predict(X_train_scaled)
print(knn.score(X_train_scaled, y_train))
pd.crosstab(knn_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.7494977592335034


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3188,1621
1,0,1662


In [40]:
print(f'Accuracy-Train {round(knn.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(knn.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,knn_preds))
print(classification_report(y_validate,knn.predict(X_validate_scaled)))

Accuracy-Train 0.7495
Accuracy-Validate 0.4954
              precision    recall  f1-score   support

           0       0.66      1.00      0.80      3188
           1       1.00      0.51      0.67      3283

    accuracy                           0.75      6471
   macro avg       0.83      0.75      0.73      6471
weighted avg       0.83      0.75      0.73      6471

              precision    recall  f1-score   support

           0       0.49      0.74      0.59      1179
           1       0.51      0.26      0.34      1215

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.47      2394
weighted avg       0.50      0.50      0.46      2394



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


## LOGISTIC REGRESSION

In [41]:
#C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs'
log = LogisticRegression()
log.fit(X_train_scaled, y_train)
log_preds = log.predict(X_train_scaled)
print(log.score(X_train_scaled, y_train))
pd.crosstab(log_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.5186215422654922


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1411,1338
1,1777,1945


In [42]:
print(f'Accuracy-Train {round(log.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(log.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,log_preds))
print(classification_report(y_validate,log.predict(X_validate_scaled)))

Accuracy-Train 0.5186
Accuracy-Validate 0.5255
              precision    recall  f1-score   support

           0       0.51      0.44      0.48      3188
           1       0.52      0.59      0.56      3283

    accuracy                           0.52      6471
   macro avg       0.52      0.52      0.52      6471
weighted avg       0.52      0.52      0.52      6471

              precision    recall  f1-score   support

           0       0.52      0.45      0.48      1179
           1       0.53      0.60      0.56      1215

    accuracy                           0.53      2394
   macro avg       0.52      0.52      0.52      2394
weighted avg       0.52      0.53      0.52      2394



In [43]:
X_train.columns

Index(['week_num', 'temp', 'humidity', 'wind', 'spread', 'ou',
       'abnormal_start', 'is_playoff', 'playoff_implications', 'is_turf',
       'is_outdoor'],
      dtype='object')

In [44]:
log.coef_[0]

array([-0.01032114,  0.00455406, -0.04278349,  0.11488901, -0.01088508,
        0.08384507, -0.0063011 , -0.03250017,  0.0075701 ,  0.00361857,
       -0.0063397 ])

## SVM

In [45]:
#C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs'
svm = SVC()
svm.fit(X_train_scaled, y_train)
svm_preds = svm.predict(X_train_scaled)
print(svm.score(X_train_scaled, y_train))
pd.crosstab(svm_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.58383557409983


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1874,1379
1,1314,1904


In [46]:
print(f'Accuracy-Train {round(svm.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(svm.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,svm_preds))
print(classification_report(y_validate,svm.predict(X_validate_scaled)))

Accuracy-Train 0.5838
Accuracy-Validate 0.51
              precision    recall  f1-score   support

           0       0.58      0.59      0.58      3188
           1       0.59      0.58      0.59      3283

    accuracy                           0.58      6471
   macro avg       0.58      0.58      0.58      6471
weighted avg       0.58      0.58      0.58      6471

              precision    recall  f1-score   support

           0       0.50      0.53      0.52      1179
           1       0.52      0.49      0.50      1215

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.51      0.51      2394

