In [51]:
#standard ds imports
import pandas as pd
import numpy as np
#viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# .py imports
#import wranglerer as wr
#import modeling as md
import os
#sklearn imports
from sklearn.model_selection import train_test_split
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#CATboost imports
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import explore_r as ex
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

### This notebook contains the basic building blocks for running a single instance of each ML algorithm.  There is some minor tinkering with hyperparameters, however, this is more of an exploratory step-through of each of the algorithms (vanila-ish).

In [2]:
df = pd.read_csv('prepped_data.csv')

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10811 entries, 0 to 10810
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   week_num              10811 non-null  int64  
 1   temp                  10811 non-null  int64  
 2   humidity              10811 non-null  int64  
 3   wind                  10811 non-null  int64  
 4   spread                10811 non-null  float64
 5   ou                    10811 non-null  float64
 6   is_under              10811 non-null  int64  
 7   abnormal_start        10811 non-null  int64  
 8   is_playoff            10811 non-null  int64  
 9   playoff_implications  10811 non-null  int64  
 10  is_turf               10811 non-null  int64  
 11  is_outdoor            10811 non-null  int64  
dtypes: float64(2), int64(10)
memory usage: 1013.7 KB


In [4]:
df.columns

Index(['date', 'day_of_week', 'start_time', 'week_num', 'home_score',
       'home_wins', 'away_score', 'away_wins', 'stadium', 'temp', 'humidity',
       'wind', 'spread', 'ou', 'is_under', 'abnormal_start', 'total_scores',
       'is_playoff', 'playoff_implications', 'is_turf', 'is_outdoor'],
      dtype='object')

In [5]:
df = df.drop(columns=['date', 'day_of_week', 'start_time','home_score',
       'home_wins', 'away_score', 'away_wins','stadium','total_scores'])

In [6]:
df

Unnamed: 0,week_num,temp,humidity,wind,spread,ou,is_under,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
0,19,72,0,0,1.0,51.0,0,0,1,1,0,0
1,19,52,48,14,2.5,45.5,1,0,1,1,0,1
2,19,22,55,13,1.5,48.0,1,0,1,1,0,1
3,19,32,10,0,5.5,49.0,1,0,1,1,1,1
4,19,55,47,19,4.0,47.0,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10806,1,76,71,8,1.0,37.0,1,0,0,0,1,1
10807,1,73,76,10,2.0,41.0,0,0,0,0,0,1
10808,1,70,77,10,4.0,36.5,0,0,0,0,0,1
10809,1,72,0,0,2.0,42.5,0,0,0,0,1,0


In [7]:
df['spread'] = abs(df['spread'])

In [8]:
df

Unnamed: 0,week_num,temp,humidity,wind,spread,ou,is_under,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
0,19,72,0,0,1.0,51.0,0,0,1,1,0,0
1,19,52,48,14,2.5,45.5,1,0,1,1,0,1
2,19,22,55,13,1.5,48.0,1,0,1,1,0,1
3,19,32,10,0,5.5,49.0,1,0,1,1,1,1
4,19,55,47,19,4.0,47.0,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10806,1,76,71,8,1.0,37.0,1,0,0,0,1,1
10807,1,73,76,10,2.0,41.0,0,0,0,0,0,1
10808,1,70,77,10,4.0,36.5,0,0,0,0,0,1
10809,1,72,0,0,2.0,42.5,0,0,0,0,1,0


## ok, finally with proper df, establish baseline

In [9]:
df.is_under.value_counts(normalize=True)

1    0.507354
0    0.492646
Name: is_under, dtype: float64

In [10]:
df['baseline'] = 1

In [11]:
baseline_accuracy = (df.baseline == df.is_under).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')

baseline accuracy: 50.74%


In [12]:
subset = df[df.is_under == 1]
baseline_recall = (subset.baseline == subset.is_under).mean()
print(f'baseline recall: {baseline_recall:.2%}')

baseline recall: 100.00%


In [13]:
subset = df[df.baseline == 1]
baseline_precision = (subset.baseline == subset.is_under).mean()
print(f'baseline precision: {baseline_precision:.2%}')

baseline precision: 50.74%


In [14]:
df.drop(columns='baseline',inplace=True)

In [15]:
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df,'is_under')


In [16]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 


((6471, 11), (6471,), (2394, 11), (2394,), (1946, 11), (1946,))

In [None]:
print(y_train.value_counts(normalize=True))
print(y_validate.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

In [17]:
X_train.head()

Unnamed: 0,week_num,temp,humidity,wind,spread,ou,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
1713,12,72,0,0,4.0,49.0,0,0,1,1,0
2164,17,72,0,0,7.5,38.0,0,0,1,0,0
2554,9,72,0,0,1.0,42.5,1,0,0,0,0
3117,6,58,52,10,12.5,40.0,0,0,0,0,1
9819,13,51,49,14,5.0,39.5,0,0,1,0,1


## MODELING

## DTC maxDepth=4

In [18]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier(max_depth=4)

In [19]:
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)

In [20]:
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1348,1187
1,1832,2104


In [21]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.5335
Accuracy-Validate 0.505
              precision    recall  f1-score   support

           0       0.53      0.42      0.47      3180
           1       0.53      0.64      0.58      3291

    accuracy                           0.53      6471
   macro avg       0.53      0.53      0.53      6471
weighted avg       0.53      0.53      0.53      6471

              precision    recall  f1-score   support

           0       0.50      0.40      0.44      1178
           1       0.51      0.61      0.55      1216

    accuracy                           0.51      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.51      0.50      2394



## DTC maxDepth=X

In [22]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier(max_depth=9,min_samples_leaf=10)

In [23]:
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)

In [24]:
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1593,1021
1,1587,2270


In [25]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.597
Accuracy-Validate 0.5159
              precision    recall  f1-score   support

           0       0.61      0.50      0.55      3180
           1       0.59      0.69      0.64      3291

    accuracy                           0.60      6471
   macro avg       0.60      0.60      0.59      6471
weighted avg       0.60      0.60      0.59      6471

              precision    recall  f1-score   support

           0       0.51      0.41      0.45      1178
           1       0.52      0.62      0.57      1216

    accuracy                           0.52      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.52      0.51      2394



## RFC maxDepth =6

In [26]:
# create the Random Forest model 
rf1 = RandomForestClassifier(n_estimators=201,max_depth=5)
# fit the model to the TRAIN dataset1
rf1.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
rf1_preds = rf1.predict(X_train)
pd.crosstab(rf1_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1697,1103
1,1483,2188


In [27]:
print(f'Accuracy-Train {round(rf1.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(rf1.score(X_validate,y_validate),4)}')
print(classification_report(y_train,rf1_preds))
print(classification_report(y_validate,rf1.predict(X_validate)))

Accuracy-Train 0.6004
Accuracy-Validate 0.5109
              precision    recall  f1-score   support

           0       0.61      0.53      0.57      3180
           1       0.60      0.66      0.63      3291

    accuracy                           0.60      6471
   macro avg       0.60      0.60      0.60      6471
weighted avg       0.60      0.60      0.60      6471

              precision    recall  f1-score   support

           0       0.50      0.44      0.47      1178
           1       0.52      0.58      0.55      1216

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.51      0.51      2394



## NB classifier

In [43]:
# create the Random Forest model 
nbc = GaussianNB()
# fit the model to the TRAIN dataset1
nbc.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
nbc_preds = nbc.predict(X_train)
pd.crosstab(nbc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1318,1208
1,1862,2083


In [44]:
print(f'Accuracy-Train {round(nbc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(nbc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,nbc_preds))
print(classification_report(y_validate,nbc.predict(X_validate)))

Accuracy-Train 0.5256
Accuracy-Validate 0.5121
              precision    recall  f1-score   support

           0       0.52      0.41      0.46      3180
           1       0.53      0.63      0.58      3291

    accuracy                           0.53      6471
   macro avg       0.52      0.52      0.52      6471
weighted avg       0.52      0.53      0.52      6471

              precision    recall  f1-score   support

           0       0.51      0.39      0.44      1178
           1       0.52      0.63      0.57      1216

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.50      2394
weighted avg       0.51      0.51      0.51      2394



## GBC

In [28]:
# create the Random Forest model 
gbc = GradientBoostingClassifier()
# fit the model to the TRAIN dataset:
gbc.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
gbc_preds = gbc.predict(X_train)
pd.crosstab(gbc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1858,1251
1,1322,2040


In [29]:
print(f'Accuracy-Train {round(gbc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(gbc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,gbc_preds))
print(classification_report(y_validate,gbc.predict(X_validate)))

Accuracy-Train 0.6024
Accuracy-Validate 0.4962
              precision    recall  f1-score   support

           0       0.60      0.58      0.59      3180
           1       0.61      0.62      0.61      3291

    accuracy                           0.60      6471
   macro avg       0.60      0.60      0.60      6471
weighted avg       0.60      0.60      0.60      6471

              precision    recall  f1-score   support

           0       0.49      0.47      0.48      1178
           1       0.50      0.52      0.51      1216

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



## CATBoost

In [30]:
# Create and fit the thing
CATb = CatBoostClassifier(verbose=False,depth=5)
CATb.fit(X_train,y_train)
CATb_preds = CATb.predict(X_train)
pd.crosstab(CATb_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2213,953
1,967,2338


In [31]:
print(f'Accuracy-Train {round(CATb.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(CATb.score(X_validate,y_validate),4)}')
print(classification_report(y_train,CATb_preds))
print(classification_report(y_validate,CATb.predict(X_validate)))

Accuracy-Train 0.7033
Accuracy-Validate 0.4992
              precision    recall  f1-score   support

           0       0.70      0.70      0.70      3180
           1       0.71      0.71      0.71      3291

    accuracy                           0.70      6471
   macro avg       0.70      0.70      0.70      6471
weighted avg       0.70      0.70      0.70      6471

              precision    recall  f1-score   support

           0       0.49      0.50      0.49      1178
           1       0.51      0.50      0.50      1216

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



## SCALE for Logistic regression, MLP, etc.

In [32]:
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_validate_scaled = sc_X.transform(X_validate)
X_test_scaled = sc_X.transform(X_test)

In [33]:
mlp = MLPClassifier(hidden_layer_sizes=(256,128,64),activation="logistic",random_state=2013,
                    batch_size=100,solver='adam')
mlp.fit(X_train_scaled, y_train)
mlp_preds = mlp.predict(X_train_scaled)
print(mlp.score(X_train_scaled, y_train))
pd.crosstab(mlp_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.5189306135064132


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2481,2414
1,699,877


In [34]:
mlp.n_layers_

5

In [35]:
print(f'Accuracy-Train {round(mlp.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(mlp.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,mlp_preds))
print(classification_report(y_validate,mlp.predict(X_validate_scaled)))

Accuracy-Train 0.5189
Accuracy-Validate 0.5029
              precision    recall  f1-score   support

           0       0.51      0.78      0.61      3180
           1       0.56      0.27      0.36      3291

    accuracy                           0.52      6471
   macro avg       0.53      0.52      0.49      6471
weighted avg       0.53      0.52      0.49      6471

              precision    recall  f1-score   support

           0       0.50      0.75      0.60      1178
           1       0.52      0.26      0.35      1216

    accuracy                           0.50      2394
   macro avg       0.51      0.51      0.47      2394
weighted avg       0.51      0.50      0.47      2394



## KNN

In [36]:
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_scaled, y_train)
knn_preds = knn.predict(X_train_scaled)
print(knn.score(X_train_scaled, y_train))
pd.crosstab(knn_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.7566063977746871


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3180,1575
1,0,1716


In [37]:
print(f'Accuracy-Train {round(knn.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(knn.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,knn_preds))
print(classification_report(y_validate,knn.predict(X_validate_scaled)))

Accuracy-Train 0.7566
Accuracy-Validate 0.5008
              precision    recall  f1-score   support

           0       0.67      1.00      0.80      3180
           1       1.00      0.52      0.69      3291

    accuracy                           0.76      6471
   macro avg       0.83      0.76      0.74      6471
weighted avg       0.84      0.76      0.74      6471

              precision    recall  f1-score   support

           0       0.50      0.77      0.60      1178
           1       0.52      0.24      0.33      1216

    accuracy                           0.50      2394
   macro avg       0.51      0.51      0.47      2394
weighted avg       0.51      0.50      0.46      2394



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


## LOGISTIC REGRESSION

In [38]:
#C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs'
log = LogisticRegression()
log.fit(X_train_scaled, y_train)
log_preds = log.predict(X_train_scaled)
print(log.score(X_train_scaled, y_train))
pd.crosstab(log_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.5243393602225312


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1434,1332
1,1746,1959


In [39]:
print(f'Accuracy-Train {round(log.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(log.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,log_preds))
print(classification_report(y_validate,log.predict(X_validate_scaled)))

Accuracy-Train 0.5243
Accuracy-Validate 0.5104
              precision    recall  f1-score   support

           0       0.52      0.45      0.48      3180
           1       0.53      0.60      0.56      3291

    accuracy                           0.52      6471
   macro avg       0.52      0.52      0.52      6471
weighted avg       0.52      0.52      0.52      6471

              precision    recall  f1-score   support

           0       0.50      0.43      0.46      1178
           1       0.52      0.59      0.55      1216

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.51      0.51      2394



In [40]:
X_train.columns

Index(['week_num', 'temp', 'humidity', 'wind', 'spread', 'ou',
       'abnormal_start', 'is_playoff', 'playoff_implications', 'is_turf',
       'is_outdoor'],
      dtype='object')

In [41]:
log.coef_[0]

array([ 0.01886066,  0.02391496, -0.00496279,  0.12738364,  0.00574551,
        0.06961586,  0.03178671, -0.01791491, -0.01296095, -0.0362064 ,
       -0.00978648])

## SVM

In [46]:
#C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs'
svm = SVC()
svm.fit(X_train_scaled, y_train)
svm_preds = svm.predict(X_train_scaled)
print(svm.score(X_train_scaled, y_train))
pd.crosstab(svm_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.5753361149745017


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1659,1227
1,1521,2064


In [47]:
print(f'Accuracy-Train {round(svm.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(svm.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,svm_preds))
print(classification_report(y_validate,svm.predict(X_validate_scaled)))

Accuracy-Train 0.5753
Accuracy-Validate 0.5067
              precision    recall  f1-score   support

           0       0.57      0.52      0.55      3180
           1       0.58      0.63      0.60      3291

    accuracy                           0.58      6471
   macro avg       0.58      0.57      0.57      6471
weighted avg       0.58      0.58      0.57      6471

              precision    recall  f1-score   support

           0       0.50      0.46      0.48      1178
           1       0.51      0.55      0.53      1216

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.51      0.51      2394

