In [1]:
#standard ds imports
import pandas as pd
import numpy as np
#viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# .py imports
import wranglerer as wr
#import modeling as md
import os
#sklearn imports
from sklearn.model_selection import train_test_split
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#CATboost imports
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

## ACQUIRE

In [2]:
df = pd.read_csv('nba.csv')
df = df.drop(columns='team')

In [3]:
df.head()

Unnamed: 0,fg_pct,opp_fg_pct,three_pt_pct,opp_three_pt_pct,ft_pct,rebounds,opp_rebounds,assists,steals,opp_steals,trnovrs_commited,pts,opp_pts,opp_fouls,prev_season,playoffs
0,0.464,0.458,0.173,0.251,0.771,46.3,44.2,23.5,9.6,8.4,18.3,105.2,102.3,26.6,1,1
1,0.49,0.47,0.384,0.286,0.779,43.3,40.7,25.9,9.5,8.1,18.1,109.4,101.9,24.2,0,1
2,0.484,0.496,0.255,0.271,0.779,42.4,41.5,25.5,8.3,10.0,19.9,104.3,107.0,26.1,0,0
3,0.474,0.501,0.193,0.291,0.772,42.0,44.1,24.0,8.7,8.1,15.6,106.7,106.4,23.2,0,0
4,0.463,0.492,0.325,0.327,0.737,44.7,44.1,24.2,8.7,9.5,17.9,103.4,107.6,23.7,1,0


## BASELINE

In [4]:
df.playoffs.value_counts()

1    692
0    532
Name: playoffs, dtype: int64

In [5]:
df['baseline'] = 1

In [6]:
baseline_accuracy = (df.baseline == df.playoffs).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')


baseline accuracy: 56.54%


In [7]:
subset = df[df.playoffs == 1]
baseline_recall = (subset.baseline == subset.playoffs).mean()
print(f'baseline recall: {baseline_recall:.2%}')


baseline recall: 100.00%


In [8]:
subset = df[df.baseline == 1]
baseline_precision = (subset.baseline == subset.playoffs).mean()
print(f'baseline precision: {baseline_precision:.2%}')


baseline precision: 56.54%


In [9]:
df.drop(columns='baseline',inplace=True)

## SPLIT

In [10]:
X_train, y_train, X_validate, y_validate, X_test, y_test = wr.train_validate_test(df,'playoffs')

In [11]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((732, 15), (732,), (271, 15), (271,), (221, 15), (221,))

In [12]:
X_train.head()

Unnamed: 0,fg_pct,opp_fg_pct,three_pt_pct,opp_three_pt_pct,ft_pct,rebounds,opp_rebounds,assists,steals,opp_steals,trnovrs_commited,pts,opp_pts,opp_fouls,prev_season
827,0.492,0.452,0.412,0.355,0.77,45.0,44.3,24.4,6.1,8.2,15.4,115.3,110.2,22.8,0
756,0.476,0.445,0.378,0.362,0.769,45.9,44.5,25.4,8.3,8.0,14.7,113.0,105.5,23.5,1
626,0.443,0.437,0.346,0.362,0.75,43.0,47.0,22.0,10.1,9.4,17.4,102.1,103.7,23.8,1
55,0.495,0.5,0.2,0.304,0.719,39.9,44.2,24.4,8.8,8.4,17.9,104.2,107.2,25.3,1
117,0.471,0.478,0.287,0.277,0.749,43.4,43.6,25.0,7.4,7.3,16.5,106.0,108.7,25.0,0


## MODEL

## DTC maxDepth=4

In [13]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier(max_depth=4)

In [14]:
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)

In [15]:
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

playoffs,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,279,55
1,35,363


In [16]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.877
Accuracy-Validate 0.8192
              precision    recall  f1-score   support

           0       0.84      0.89      0.86       314
           1       0.91      0.87      0.89       418

    accuracy                           0.88       732
   macro avg       0.87      0.88      0.88       732
weighted avg       0.88      0.88      0.88       732

              precision    recall  f1-score   support

           0       0.77      0.81      0.79       114
           1       0.86      0.83      0.84       157

    accuracy                           0.82       271
   macro avg       0.81      0.82      0.82       271
weighted avg       0.82      0.82      0.82       271



In [17]:
print(f'Accuracy-Test {round(dtc.score(X_test,y_test),4)}')

print(classification_report(y_test,dtc.predict(X_test)))

Accuracy-Test 0.81
              precision    recall  f1-score   support

           0       0.80      0.79      0.80       104
           1       0.82      0.83      0.82       117

    accuracy                           0.81       221
   macro avg       0.81      0.81      0.81       221
weighted avg       0.81      0.81      0.81       221



In [18]:
dtc_feat_df = pd.DataFrame(
    {'feat': X_train.columns.to_list(),
     'coef': dtc.feature_importances_
    }
)
dtc_feat_df

Unnamed: 0,feat,coef
0,fg_pct,0.0
1,opp_fg_pct,0.0
2,three_pt_pct,0.023362
3,opp_three_pt_pct,0.0
4,ft_pct,0.0
5,rebounds,0.0
6,opp_rebounds,0.013489
7,assists,0.0
8,steals,0.0
9,opp_steals,0.0


In [19]:
dtc_feat_df['coef'].sum()

1.0

In [20]:
dtc_feat_df.sort_values('coef',ascending=False)

Unnamed: 0,feat,coef
11,pts,0.356871
12,opp_pts,0.303726
14,prev_season,0.290094
2,three_pt_pct,0.023362
6,opp_rebounds,0.013489
13,opp_fouls,0.012459
0,fg_pct,0.0
1,opp_fg_pct,0.0
3,opp_three_pt_pct,0.0
4,ft_pct,0.0


## RFC maxDepth=6

In [21]:
# create the Random Forest model 
rf1 = RandomForestClassifier(n_estimators=201,max_depth=6,min_samples_leaf=1)
# fit the model to the TRAIN dataset1
rf1.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
rf1_preds = rf1.predict(X_train)
pd.crosstab(rf1_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

playoffs,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,303,7
1,11,411


In [22]:
print(f'Accuracy-Train {round(rf1.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(rf1.score(X_validate,y_validate),4)}')
print(classification_report(y_train,rf1_preds))
print(classification_report(y_validate,rf1.predict(X_validate)))

Accuracy-Train 0.9754
Accuracy-Validate 0.8635
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       314
           1       0.97      0.98      0.98       418

    accuracy                           0.98       732
   macro avg       0.98      0.97      0.97       732
weighted avg       0.98      0.98      0.98       732

              precision    recall  f1-score   support

           0       0.88      0.78      0.83       114
           1       0.85      0.92      0.89       157

    accuracy                           0.86       271
   macro avg       0.87      0.85      0.86       271
weighted avg       0.86      0.86      0.86       271



In [23]:
X_train.columns.to_list()

['fg_pct',
 'opp_fg_pct',
 'three_pt_pct',
 'opp_three_pt_pct',
 'ft_pct',
 'rebounds',
 'opp_rebounds',
 'assists',
 'steals',
 'opp_steals',
 'trnovrs_commited',
 'pts',
 'opp_pts',
 'opp_fouls',
 'prev_season']

In [24]:
rf1.feature_importances_

array([0.10687662, 0.07707102, 0.03149438, 0.05343988, 0.03077943,
       0.0368399 , 0.04604335, 0.06147485, 0.03482507, 0.02512046,
       0.03469452, 0.15972013, 0.17704936, 0.03591142, 0.08865962])

In [25]:
# build a quick df to compare results...will append to this df as needed
feat_df = pd.DataFrame(
    {'feat': X_train.columns.to_list(),
     'coef': rf1.feature_importances_
    }
)
feat_df

Unnamed: 0,feat,coef
0,fg_pct,0.106877
1,opp_fg_pct,0.077071
2,three_pt_pct,0.031494
3,opp_three_pt_pct,0.05344
4,ft_pct,0.030779
5,rebounds,0.03684
6,opp_rebounds,0.046043
7,assists,0.061475
8,steals,0.034825
9,opp_steals,0.02512


In [26]:
feat_df['coef'].sum()

1.0

In [27]:
feat_df.sort_values('coef',ascending=False)

Unnamed: 0,feat,coef
12,opp_pts,0.177049
11,pts,0.15972
0,fg_pct,0.106877
14,prev_season,0.08866
1,opp_fg_pct,0.077071
7,assists,0.061475
3,opp_three_pt_pct,0.05344
6,opp_rebounds,0.046043
5,rebounds,0.03684
13,opp_fouls,0.035911


In [28]:
model_list = []

for i in range (1, 9):
    for j in range(3,19,3):
        rf = RandomForestClassifier(max_depth=i, min_samples_leaf=j, random_state=2013)
        rf = rf.fit(X_train, y_train)
        train_accuracy = rf.score(X_train, y_train)
        validate_accuracy = rf.score(X_validate, y_validate)
        model_preds = rf.predict(X_train)

        output = {
            "min_samples_per_leaf": j,
            "max_depth": i,
            "train_accuracy": train_accuracy,
            "validate_accuracy": validate_accuracy,
        }
        model_list.append(output)

In [29]:
output_df = pd.DataFrame(model_list)

In [30]:
output_df.sort_values('validate_accuracy',ascending=False)

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy
26,9,5,0.920765,0.870849
30,3,6,0.961749,0.867159
42,3,8,0.984973,0.867159
39,12,7,0.924863,0.867159
32,9,6,0.927596,0.867159
36,3,7,0.978142,0.867159
27,12,5,0.919399,0.863469
25,6,5,0.930328,0.863469
23,18,4,0.896175,0.863469
38,9,7,0.935792,0.863469


## GradientBoostClassifier

In [31]:
# create the Random Forest model 
gbc = GradientBoostingClassifier()
# fit the model to the TRAIN dataset:
gbc.fit(X_train, y_train)
# use the model by calling for the predictions made via the TRAIN dataset
gbc_preds = gbc.predict(X_train)
pd.crosstab(gbc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

playoffs,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,310,3
1,4,415


In [32]:
print(f'Accuracy-Train {round(gbc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(gbc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,gbc_preds))
print(classification_report(y_validate,gbc.predict(X_validate)))

Accuracy-Train 0.9904
Accuracy-Validate 0.8819
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       314
           1       0.99      0.99      0.99       418

    accuracy                           0.99       732
   macro avg       0.99      0.99      0.99       732
weighted avg       0.99      0.99      0.99       732

              precision    recall  f1-score   support

           0       0.87      0.85      0.86       114
           1       0.89      0.90      0.90       157

    accuracy                           0.88       271
   macro avg       0.88      0.88      0.88       271
weighted avg       0.88      0.88      0.88       271



## CATboost

In [33]:
# Create and fit the thing
CATb = CatBoostClassifier(verbose=False,depth=5)
CATb.fit(X_train,y_train)
CATb_preds = CATb.predict(X_train)
pd.crosstab(CATb_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

playoffs,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,309,6
1,5,412


In [34]:
print(f'Accuracy-Train {round(CATb.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(CATb.score(X_validate,y_validate),4)}')
print(classification_report(y_train,CATb_preds))
print(classification_report(y_validate,CATb.predict(X_validate)))

Accuracy-Train 0.985
Accuracy-Validate 0.8745
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       314
           1       0.99      0.99      0.99       418

    accuracy                           0.98       732
   macro avg       0.98      0.98      0.98       732
weighted avg       0.98      0.98      0.98       732

              precision    recall  f1-score   support

           0       0.88      0.81      0.84       114
           1       0.87      0.92      0.90       157

    accuracy                           0.87       271
   macro avg       0.88      0.87      0.87       271
weighted avg       0.88      0.87      0.87       271



## MULTILAYER PERCEPTRON

## SCALE

In [35]:
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_validate_scaled = sc_X.transform(X_validate)
X_test_scaled = sc_X.transform(X_test)

In [36]:
mlp = MLPClassifier(hidden_layer_sizes=(256,128,64),activation="relu",random_state=2013,
                    batch_size=100,solver='sgd')
mlp.fit(X_train_scaled, y_train)
mlp_preds = mlp.predict(X_train_scaled)
print(mlp.score(X_train_scaled, y_train))
pd.crosstab(mlp_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.9344262295081968




playoffs,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,287,21
1,27,397


In [37]:
mlp.n_layers_

5

In [38]:
print(f'Accuracy-Train {round(mlp.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(mlp.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,mlp_preds))
print(classification_report(y_validate,mlp.predict(X_validate_scaled)))

Accuracy-Train 0.9344
Accuracy-Validate 0.8856
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       314
           1       0.94      0.95      0.94       418

    accuracy                           0.93       732
   macro avg       0.93      0.93      0.93       732
weighted avg       0.93      0.93      0.93       732

              precision    recall  f1-score   support

           0       0.86      0.87      0.86       114
           1       0.90      0.90      0.90       157

    accuracy                           0.89       271
   macro avg       0.88      0.88      0.88       271
weighted avg       0.89      0.89      0.89       271



In [39]:
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_scaled, y_train)
knn_preds = knn.predict(X_train_scaled)
print(knn.score(X_train_scaled, y_train))
pd.crosstab(knn_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.9016393442622951


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


playoffs,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,314,72
1,0,346


In [40]:
print(f'Accuracy-Train {round(knn.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(knn.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,knn_preds))
print(classification_report(y_validate,knn.predict(X_validate_scaled)))

Accuracy-Train 0.9016
Accuracy-Validate 0.7897
              precision    recall  f1-score   support

           0       0.81      1.00      0.90       314
           1       1.00      0.83      0.91       418

    accuracy                           0.90       732
   macro avg       0.91      0.91      0.90       732
weighted avg       0.92      0.90      0.90       732

              precision    recall  f1-score   support

           0       0.69      0.89      0.78       114
           1       0.90      0.71      0.80       157

    accuracy                           0.79       271
   macro avg       0.80      0.80      0.79       271
weighted avg       0.82      0.79      0.79       271



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [41]:
#C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs'
log = LogisticRegression()
log.fit(X_train_scaled, y_train)
log_preds = log.predict(X_train_scaled)
print(log.score(X_train_scaled, y_train))
pd.crosstab(log_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.9057377049180327


playoffs,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,275,30
1,39,388


In [42]:
print(f'Accuracy-Train {round(log.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(log.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,log_preds))
print(classification_report(y_validate,log.predict(X_validate_scaled)))

Accuracy-Train 0.9057
Accuracy-Validate 0.8819
              precision    recall  f1-score   support

           0       0.90      0.88      0.89       314
           1       0.91      0.93      0.92       418

    accuracy                           0.91       732
   macro avg       0.91      0.90      0.90       732
weighted avg       0.91      0.91      0.91       732

              precision    recall  f1-score   support

           0       0.87      0.85      0.86       114
           1       0.89      0.90      0.90       157

    accuracy                           0.88       271
   macro avg       0.88      0.88      0.88       271
weighted avg       0.88      0.88      0.88       271



In [43]:
log.coef_[0]

array([ 0.63543681, -0.51279769, -0.20033177, -0.16341588,  0.12776366,
        0.03828152, -0.30299356,  0.2833693 ,  0.33302948,  0.16652471,
       -0.70891624,  2.77406991, -2.90244622,  0.4528879 ,  0.09634467])

In [44]:
# build a quick df to compare results...will append to this df as needed
feat_df = pd.DataFrame(
    {'feat': X_train.columns.to_list(),
     'coef': log.coef_[0]
    }
)
feat_df

Unnamed: 0,feat,coef
0,fg_pct,0.635437
1,opp_fg_pct,-0.512798
2,three_pt_pct,-0.200332
3,opp_three_pt_pct,-0.163416
4,ft_pct,0.127764
5,rebounds,0.038282
6,opp_rebounds,-0.302994
7,assists,0.283369
8,steals,0.333029
9,opp_steals,0.166525


In [45]:
feat_df['coef'].sum()

0.11680659358476925

In [46]:
feat_df['abs_val'] = abs(feat_df.coef)
#feat_df.sort_values('coef',ascending=False)

In [47]:
feat_df.sort_values('abs_val',ascending=False)

Unnamed: 0,feat,coef,abs_val
12,opp_pts,-2.902446,2.902446
11,pts,2.77407,2.77407
10,trnovrs_commited,-0.708916,0.708916
0,fg_pct,0.635437,0.635437
1,opp_fg_pct,-0.512798,0.512798
13,opp_fouls,0.452888,0.452888
8,steals,0.333029,0.333029
6,opp_rebounds,-0.302994,0.302994
7,assists,0.283369,0.283369
2,three_pt_pct,-0.200332,0.200332
