In [1]:
#standard ds imports
import pandas as pd
import numpy as np
#viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# .py imports
import wranglerer as wr
#import modeling as md
import os
#sklearn imports
from sklearn.model_selection import train_test_split
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#CATboost imports
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

## ACQUIRE

In [2]:
df = pd.read_csv('nba.csv')
df = df.drop(columns='team')

In [3]:
df.head()

Unnamed: 0,fg_pct,opp_fg_pct,three_pt_pct,opp_three_pt_pct,ft_pct,rebounds,opp_rebounds,assists,steals,opp_steals,trnovrs_commited,pts,opp_pts,opp_fouls,prev_season,playoffs
0,0.464,0.458,0.173,0.251,0.771,46.3,44.2,23.5,9.6,8.4,18.3,105.2,102.3,26.6,1,1
1,0.49,0.47,0.384,0.286,0.779,43.3,40.7,25.9,9.5,8.1,18.1,109.4,101.9,24.2,0,1
2,0.484,0.496,0.255,0.271,0.779,42.4,41.5,25.5,8.3,10.0,19.9,104.3,107.0,26.1,0,0
3,0.474,0.501,0.193,0.291,0.772,42.0,44.1,24.0,8.7,8.1,15.6,106.7,106.4,23.2,0,0
4,0.463,0.492,0.325,0.327,0.737,44.7,44.1,24.2,8.7,9.5,17.9,103.4,107.6,23.7,1,0


## BASELINE

In [4]:
df.playoffs.value_counts()

1    692
0    532
Name: playoffs, dtype: int64

In [5]:
df['baseline'] = 1

In [6]:
baseline_accuracy = (df.baseline == df.playoffs).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')


baseline accuracy: 56.54%


In [7]:
subset = df[df.playoffs == 1]
baseline_recall = (subset.baseline == subset.playoffs).mean()
print(f'baseline recall: {baseline_recall:.2%}')


baseline recall: 100.00%


In [8]:
subset = df[df.baseline == 1]
baseline_precision = (subset.baseline == subset.playoffs).mean()
print(f'baseline precision: {baseline_precision:.2%}')


baseline precision: 56.54%


In [9]:
df.drop(columns='baseline',inplace=True)

## SPLIT

In [10]:
X_train, y_train, X_validate, y_validate, X_test, y_test = wr.train_validate_test(df,'playoffs')

In [11]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((685, 15), (685,), (294, 15), (294,), (245, 15), (245,))

In [12]:
X_train.head()

Unnamed: 0,fg_pct,opp_fg_pct,three_pt_pct,opp_three_pt_pct,ft_pct,rebounds,opp_rebounds,assists,steals,opp_steals,trnovrs_commited,pts,opp_pts,opp_fouls,prev_season
1075,0.465,0.448,0.365,0.344,0.802,44.5,45.8,26.2,8.6,6.8,12.8,112.2,107.8,19.5,1
293,0.507,0.482,0.333,0.331,0.746,41.0,43.9,24.1,10.0,9.3,15.8,113.6,109.9,23.6,1
123,0.476,0.495,0.194,0.32,0.744,44.1,42.5,22.5,7.2,9.0,18.7,102.5,108.4,23.9,0
411,0.482,0.454,0.378,0.366,0.691,44.3,45.5,27.4,8.7,8.5,15.3,112.9,106.9,24.1,1
462,0.438,0.474,0.358,0.362,0.723,43.5,47.8,20.1,8.2,9.5,17.4,103.3,111.2,22.9,1


## MODEL

## DTC maxDepth=4

In [127]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier(max_depth=4)

In [128]:
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)

In [129]:
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

playoffs,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,265,38
1,26,356


In [130]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.9066
Accuracy-Validate 0.881
              precision    recall  f1-score   support

           0       0.87      0.91      0.89       291
           1       0.93      0.90      0.92       394

    accuracy                           0.91       685
   macro avg       0.90      0.91      0.90       685
weighted avg       0.91      0.91      0.91       685

              precision    recall  f1-score   support

           0       0.88      0.86      0.87       139
           1       0.88      0.90      0.89       155

    accuracy                           0.88       294
   macro avg       0.88      0.88      0.88       294
weighted avg       0.88      0.88      0.88       294



In [122]:
print(f'Accuracy-Test {round(dtc.score(X_test,y_test),4)}')

print(classification_report(y_test,dtc.predict(X_test)))

Accuracy-Test 0.8531
              precision    recall  f1-score   support

           0       0.81      0.84      0.83       102
           1       0.88      0.86      0.87       143

    accuracy                           0.85       245
   macro avg       0.85      0.85      0.85       245
weighted avg       0.85      0.85      0.85       245



In [17]:
dtc_feat_df = pd.DataFrame(
    {'feat': X_train.columns.to_list(),
     'coef': dtc.feature_importances_
    }
)
dtc_feat_df

Unnamed: 0,feat,coef
0,fg_pct,0.0
1,opp_fg_pct,0.0
2,three_pt_pct,0.0
3,opp_three_pt_pct,0.032585
4,ft_pct,0.0
5,rebounds,0.0
6,opp_rebounds,0.007713
7,assists,0.008113
8,steals,0.0
9,opp_steals,0.020279


In [18]:
dtc_feat_df['coef'].sum()

1.0

In [19]:
dtc_feat_df.sort_values('coef',ascending=False)

Unnamed: 0,feat,coef
12,opp_pts,0.46932
11,pts,0.441069
3,opp_three_pt_pct,0.032585
13,opp_fouls,0.020922
9,opp_steals,0.020279
7,assists,0.008113
6,opp_rebounds,0.007713
0,fg_pct,0.0
1,opp_fg_pct,0.0
2,three_pt_pct,0.0


## RFC maxDepth=6

In [53]:
# create the Random Forest model 
rf1 = RandomForestClassifier(n_estimators=201,max_depth=6,min_samples_leaf=1)
# fit the model to the TRAIN dataset1
rf1.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
rf1_preds = rf1.predict(X_train)
pd.crosstab(rf1_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

playoffs,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,280,5
1,11,389


In [49]:
print(f'Accuracy-Train {round(rf1.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(rf1.score(X_validate,y_validate),4)}')
print(classification_report(y_train,rf1_preds))
print(classification_report(y_validate,rf1.predict(X_validate)))

Accuracy-Train 0.946
Accuracy-Validate 0.8469
              precision    recall  f1-score   support

           0       0.95      0.92      0.94       291
           1       0.94      0.97      0.95       394

    accuracy                           0.95       685
   macro avg       0.95      0.94      0.94       685
weighted avg       0.95      0.95      0.95       685

              precision    recall  f1-score   support

           0       0.92      0.74      0.82       139
           1       0.80      0.94      0.87       155

    accuracy                           0.85       294
   macro avg       0.86      0.84      0.84       294
weighted avg       0.86      0.85      0.84       294



In [40]:
X_train.columns.to_list()

['fg_pct',
 'opp_fg_pct',
 'three_pt_pct',
 'opp_three_pt_pct',
 'ft_pct',
 'rebounds',
 'opp_rebounds',
 'assists',
 'steals',
 'opp_steals',
 'trnovrs_commited',
 'pts',
 'opp_pts',
 'opp_fouls',
 'prev_season']

In [41]:
rf1.feature_importances_

array([0.08912893, 0.09976036, 0.01968614, 0.05575968, 0.0207448 ,
       0.03001656, 0.0394854 , 0.06168372, 0.02077952, 0.01242297,
       0.02310539, 0.20428185, 0.17479695, 0.0239347 , 0.12441304])

In [42]:
# build a quick df to compare results...will append to this df as needed
feat_df = pd.DataFrame(
    {'feat': X_train.columns.to_list(),
     'coef': rf1.feature_importances_
    }
)
feat_df

Unnamed: 0,feat,coef
0,fg_pct,0.089129
1,opp_fg_pct,0.09976
2,three_pt_pct,0.019686
3,opp_three_pt_pct,0.05576
4,ft_pct,0.020745
5,rebounds,0.030017
6,opp_rebounds,0.039485
7,assists,0.061684
8,steals,0.02078
9,opp_steals,0.012423


In [43]:
feat_df['coef'].sum()

1.0

In [44]:
feat_df.sort_values('coef',ascending=False)

Unnamed: 0,feat,coef
11,pts,0.204282
12,opp_pts,0.174797
14,prev_season,0.124413
1,opp_fg_pct,0.09976
0,fg_pct,0.089129
7,assists,0.061684
3,opp_three_pt_pct,0.05576
6,opp_rebounds,0.039485
5,rebounds,0.030017
13,opp_fouls,0.023935


In [45]:
model_list = []

for i in range (1, 9):
    for j in range(3,19,3):
        rf = RandomForestClassifier(max_depth=i, min_samples_leaf=j, random_state=2013)
        rf = rf.fit(X_train, y_train)
        train_accuracy = rf.score(X_train, y_train)
        validate_accuracy = rf.score(X_validate, y_validate)
        model_preds = rf.predict(X_train)

        output = {
            "min_samples_per_leaf": j,
            "max_depth": i,
            "train_accuracy": train_accuracy,
            "validate_accuracy": validate_accuracy,
        }
        model_list.append(output)

In [46]:
output_df = pd.DataFrame(model_list)

In [47]:
output_df.sort_values('validate_accuracy',ascending=False)

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy
24,3,5,0.945985,0.870748
32,9,6,0.938686,0.870748
33,12,6,0.928467,0.863946
45,12,8,0.922628,0.860544
44,9,8,0.934307,0.860544
30,3,6,0.966423,0.860544
39,12,7,0.922628,0.860544
38,9,7,0.934307,0.860544
25,6,5,0.932847,0.857143
21,12,4,0.90365,0.857143


## GradientBoostClassifier

In [54]:
# create the Random Forest model 
gbc = GradientBoostingClassifier()
# fit the model to the TRAIN dataset:
gbc.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
gbc_preds = gbc.predict(X_train)
pd.crosstab(gbc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

playoffs,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,287,2
1,4,392


In [55]:
print(f'Accuracy-Train {round(gbc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(gbc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,gbc_preds))
print(classification_report(y_validate,gbc.predict(X_validate)))

Accuracy-Train 0.9912
Accuracy-Validate 0.8844
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       291
           1       0.99      0.99      0.99       394

    accuracy                           0.99       685
   macro avg       0.99      0.99      0.99       685
weighted avg       0.99      0.99      0.99       685

              precision    recall  f1-score   support

           0       0.91      0.84      0.87       139
           1       0.87      0.92      0.89       155

    accuracy                           0.88       294
   macro avg       0.89      0.88      0.88       294
weighted avg       0.89      0.88      0.88       294



## CATboost

In [56]:
# Create and fit the thing
CATb = CatBoostClassifier(verbose=False,depth=5)
CATb.fit(X_train,y_train)
CATb_preds = CATb.predict(X_train)
pd.crosstab(CATb_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

playoffs,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,283,3
1,8,391


In [57]:
print(f'Accuracy-Train {round(CATb.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(CATb.score(X_validate,y_validate),4)}')
print(classification_report(y_train,CATb_preds))
print(classification_report(y_validate,CATb.predict(X_validate)))

Accuracy-Train 0.9839
Accuracy-Validate 0.8707
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       291
           1       0.98      0.99      0.99       394

    accuracy                           0.98       685
   macro avg       0.98      0.98      0.98       685
weighted avg       0.98      0.98      0.98       685

              precision    recall  f1-score   support

           0       0.92      0.79      0.85       139
           1       0.83      0.94      0.88       155

    accuracy                           0.87       294
   macro avg       0.88      0.87      0.87       294
weighted avg       0.88      0.87      0.87       294



## MULTILAYER PERCEPTRON

## SCALE

In [58]:
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_validate_scaled = sc_X.transform(X_validate)
X_test_scaled = sc_X.transform(X_test)

In [101]:
mlp = MLPClassifier(hidden_layer_sizes=(256,128,64),activation="relu",random_state=2013,
                    batch_size=100,solver='sgd')
mlp.fit(X_train_scaled, y_train)
mlp_preds = mlp.predict(X_train_scaled)
print(mlp.score(X_train_scaled, y_train))
pd.crosstab(mlp_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.9357664233576642




playoffs,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,269,22
1,22,372


In [99]:
mlp.n_layers_

5

In [102]:
print(f'Accuracy-Train {round(mlp.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(mlp.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,mlp_preds))
print(classification_report(y_validate,mlp.predict(X_validate_scaled)))

Accuracy-Train 0.9358
Accuracy-Validate 0.881
              precision    recall  f1-score   support

           0       0.92      0.92      0.92       291
           1       0.94      0.94      0.94       394

    accuracy                           0.94       685
   macro avg       0.93      0.93      0.93       685
weighted avg       0.94      0.94      0.94       685

              precision    recall  f1-score   support

           0       0.91      0.83      0.87       139
           1       0.86      0.92      0.89       155

    accuracy                           0.88       294
   macro avg       0.88      0.88      0.88       294
weighted avg       0.88      0.88      0.88       294



In [103]:
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_scaled, y_train)
knn_preds = knn.predict(X_train_scaled)
print(knn.score(X_train_scaled, y_train))
pd.crosstab(knn_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.8978102189781022


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


playoffs,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,291,70
1,0,324


In [104]:
print(f'Accuracy-Train {round(knn.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(knn.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,knn_preds))
print(classification_report(y_validate,knn.predict(X_validate_scaled)))

Accuracy-Train 0.8978
Accuracy-Validate 0.7891
              precision    recall  f1-score   support

           0       0.81      1.00      0.89       291
           1       1.00      0.82      0.90       394

    accuracy                           0.90       685
   macro avg       0.90      0.91      0.90       685
weighted avg       0.92      0.90      0.90       685

              precision    recall  f1-score   support

           0       0.75      0.83      0.79       139
           1       0.83      0.75      0.79       155

    accuracy                           0.79       294
   macro avg       0.79      0.79      0.79       294
weighted avg       0.79      0.79      0.79       294



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [109]:
#C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs'
log = LogisticRegression()
log.fit(X_train_scaled, y_train)
log_preds = log.predict(X_train_scaled)
print(log.score(X_train_scaled, y_train))
pd.crosstab(log_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.9036496350364963


playoffs,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,254,29
1,37,365


In [110]:
print(f'Accuracy-Train {round(log.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(log.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,log_preds))
print(classification_report(y_validate,log.predict(X_validate_scaled)))

Accuracy-Train 0.9036
Accuracy-Validate 0.8844
              precision    recall  f1-score   support

           0       0.90      0.87      0.89       291
           1       0.91      0.93      0.92       394

    accuracy                           0.90       685
   macro avg       0.90      0.90      0.90       685
weighted avg       0.90      0.90      0.90       685

              precision    recall  f1-score   support

           0       0.93      0.82      0.87       139
           1       0.85      0.94      0.90       155

    accuracy                           0.88       294
   macro avg       0.89      0.88      0.88       294
weighted avg       0.89      0.88      0.88       294



In [114]:
log.coef_[0]

array([ 0.43301913, -0.57269903, -0.05203796, -0.04233068,  0.29606291,
        0.25695718, -0.48054471,  0.40906484,  0.50168211, -0.05699655,
       -0.38835358,  2.67723263, -2.6370074 ,  0.26591438,  0.1474218 ])

In [115]:
# build a quick df to compare results...will append to this df as needed
feat_df = pd.DataFrame(
    {'feat': X_train.columns.to_list(),
     'coef': log.coef_[0]
    }
)
feat_df

Unnamed: 0,feat,coef
0,fg_pct,0.433019
1,opp_fg_pct,-0.572699
2,three_pt_pct,-0.052038
3,opp_three_pt_pct,-0.042331
4,ft_pct,0.296063
5,rebounds,0.256957
6,opp_rebounds,-0.480545
7,assists,0.409065
8,steals,0.501682
9,opp_steals,-0.056997


In [116]:
feat_df['coef'].sum()

0.757385063126709

In [119]:
feat_df['abs_val'] = abs(feat_df.coef)
#feat_df.sort_values('coef',ascending=False)

In [121]:
feat_df.sort_values('abs_val',ascending=False)

Unnamed: 0,feat,coef,abs_val
11,pts,2.677233,2.677233
12,opp_pts,-2.637007,2.637007
1,opp_fg_pct,-0.572699,0.572699
8,steals,0.501682,0.501682
6,opp_rebounds,-0.480545,0.480545
0,fg_pct,0.433019,0.433019
7,assists,0.409065,0.409065
10,trnovrs_commited,-0.388354,0.388354
4,ft_pct,0.296063,0.296063
13,opp_fouls,0.265914,0.265914
