In [21]:
# Import libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 25)
import matplotlib.pyplot as plt
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, plot_confusion_matrix

In [22]:
# Load the trained model and the list of feature names it expects.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# .sav files that come from a trusted source.
# The context managers close each file handle as soon as it has been read.
with open("Model.sav", 'rb') as model_file:
    model = pickle.load(model_file)
with open("Features.sav", 'rb') as features_file:
    features = pickle.load(features_file)

In [23]:
# Show the feature names loaded from Features.sav
features

['MIN',
 'OREB',
 '%FTA',
 '%PTS',
 '3PA',
 '%TOV',
 'PF',
 '%OREB',
 'STL',
 'DD2',
 '%PFD',
 '%3PA',
 'BLK',
 '%AST',
 'GP',
 '+/-',
 '%BLK',
 '%DREB',
 '%PF',
 'NETRTG',
 'Loss%',
 '%STL',
 'COUNTRY',
 'COLLEGE',
 '%BLKA',
 'TD3',
 'BMI',
 'AGE']

## Testing
Let us test our model with two scenarios:
1. A player who did not sustain a season-ending injury
2. A player who sustained a season-ending injury

In [24]:
# Example 1: Mikal Bridges, 2021-2022 season (no injury).
# One entry per model input; keys are listed in the same order as `features`.
example_1 = {
    'MIN': 2854,
    'OREB': 0.9,
    '%FTA': 13.3,
    '%PTS': 16.7,
    '3PA': 3.8,
    '%TOV': 9.7,
    'PF': 1.8,
    '%OREB': 14.1,
    'STL': 1.2,
    'DD2': 0,
    '%PFD': 13.9,
    '%3PA': 17.2,
    'BLK': 0.4,
    '%AST': 10.9,
    'GP': 82,
    '+/-': 7,
    '%BLK': 15.1,
    '%DREB': 13.1,
    '%PF': 13.4,
    'NETRTG': 9.8,
    'Loss%': 0.22,
    '%STL': 18.8,
    'COUNTRY': 1,
    'COLLEGE': 1,
    '%BLKA': 14,
    'TD3': 0,
    'BMI': 24.23,
    'AGE': 25,
}

In [25]:
# Example 2: Klay Thompson, 2017-2018 season; sustained a season-ending
# left ACL injury in Jun. 2019.
# One entry per model input; keys are listed in the same order as `features`.
example_2 = {
    'MIN': 2504,
    'OREB': 0.4,
    '%FTA': 10.1,
    '%PTS': 24.5,
    '3PA': 7.1,
    '%TOV': 15.5,
    'PF': 1.6,
    '%OREB': 7.3,
    'STL': 0.8,
    'DD2': 2,
    '%PFD': 13.1,
    '%3PA': 34.2,
    'BLK': 0.5,
    '%AST': 11.2,
    'GP': 73,
    '+/-': 5.4,
    '%BLK': 9,
    '%DREB': 13.2,
    '%PF': 11.6,
    'NETRTG': 9.8,
    'Loss%': 0.29,
    '%STL': 13.1,
    'COUNTRY': 1,
    'COLLEGE': 1,
    '%BLKA': 9,
    'TD3': 0,
    'BMI': 25.51,
    'AGE': 27,
}

In [26]:
# Turn the two example dicts into a two-row DataFrame (row 0 = example_1,
# row 1 = example_2).
# Selecting `features` pins the column order to the list saved with the model
# instead of relying on the order the dict keys happen to be written in, and
# raises a KeyError if an example is missing one of the expected columns.
new_test_example = pd.DataFrame([example_1, example_2])[features]
new_test_example

Unnamed: 0,MIN,OREB,%FTA,%PTS,3PA,%TOV,PF,%OREB,STL,DD2,%PFD,%3PA,...,%BLK,%DREB,%PF,NETRTG,Loss%,%STL,COUNTRY,COLLEGE,%BLKA,TD3,BMI,AGE
0,2854,0.9,13.3,16.7,3.8,9.7,1.8,14.1,1.2,0,13.9,17.2,...,15.1,13.1,13.4,9.8,0.22,18.8,1,1,14,0,24.23,25
1,2504,0.4,10.1,24.5,7.1,15.5,1.6,7.3,0.8,2,13.1,34.2,...,9.0,13.2,11.6,9.8,0.29,13.1,1,1,9,0,25.51,27


In [27]:
# Pull row 1 (example_2) out as a one-row DataFrame: the row is taken as a
# Series, turned into a single-column frame, then transposed back into a row.
# NOTE(review): after this round-trip the integer values are displayed as
# floats (e.g. 2504.0 in the output below).
new_test_example.iloc[1, :].to_frame().T


Unnamed: 0,MIN,OREB,%FTA,%PTS,3PA,%TOV,PF,%OREB,STL,DD2,%PFD,%3PA,...,%BLK,%DREB,%PF,NETRTG,Loss%,%STL,COUNTRY,COLLEGE,%BLKA,TD3,BMI,AGE
1,2504.0,0.4,10.1,24.5,7.1,15.5,1.6,7.3,0.8,2.0,13.1,34.2,...,9.0,13.2,11.6,9.8,0.29,13.1,1.0,1.0,9.0,0.0,25.51,27.0


In [28]:
# Select the Klay Thompson example (row position 1) as a one-row DataFrame.
# Indexing with a list keeps the result two-dimensional and preserves each
# column's dtype; going through a Series (`iloc[1, :].to_frame().T`) upcasts
# every column to float, and re-wrapping the result in pd.DataFrame was a no-op.
klay = new_test_example.iloc[[1]]
klay

Unnamed: 0,MIN,OREB,%FTA,%PTS,3PA,%TOV,PF,%OREB,STL,DD2,%PFD,%3PA,...,%BLK,%DREB,%PF,NETRTG,Loss%,%STL,COUNTRY,COLLEGE,%BLKA,TD3,BMI,AGE
1,2504.0,0.4,10.1,24.5,7.1,15.5,1.6,7.3,0.8,2.0,13.1,34.2,...,9.0,13.2,11.6,9.8,0.29,13.1,1.0,1.0,9.0,0.0,25.51,27.0


In [29]:
# Sanity check: the loaded model can predict a class label for the
# single-row `klay` frame built above.
model.predict(klay)

array([1])

In [30]:
# Probability in column 1 for the first (only) row of `klay`.
# NOTE(review): column 1 presumably corresponds to the "injured" label 1 --
# confirm against model.classes_.
model.predict_proba(klay)[0, 1]

0.57

In [31]:
# Rank the input columns by the importance scores stored on `model[1]`,
# highest first.
# NOTE(review): presumably `model` is a pipeline whose item 1 is the fitted
# estimator -- confirm.
(
    pd.DataFrame(
        data=model[1].feature_importances_,
        index=new_test_example.columns,
    )
    .sort_values(by=0, ascending=False)
)

Unnamed: 0,0
BLK,0.075039
STL,0.06093
MIN,0.048432
GP,0.048421
PF,0.043772
Loss%,0.041701
OREB,0.040372
%DREB,0.040243
3PA,0.039786
BMI,0.038515


In [32]:
# Column names of the example frame as a plain Python list
new_test_example.columns.tolist()

['MIN',
 'OREB',
 '%FTA',
 '%PTS',
 '3PA',
 '%TOV',
 'PF',
 '%OREB',
 'STL',
 'DD2',
 '%PFD',
 '%3PA',
 'BLK',
 '%AST',
 'GP',
 '+/-',
 '%BLK',
 '%DREB',
 '%PF',
 'NETRTG',
 'Loss%',
 '%STL',
 'COUNTRY',
 'COLLEGE',
 '%BLKA',
 'TD3',
 'BMI',
 'AGE']

### Build Current Players Dataframe

In [33]:
# Load the 2022-2023 NBA player statistics from CSV and preview the first rows
players = pd.read_csv('Data/player_stats/2022_2023.csv')
players.head()

Unnamed: 0,Name,TEAM,AGE,HEIGHT,WEIGHT,COLLEGE,COUNTRY,GP,NETRTG,TS%,W,L,...,%FTA,%OREB,%DREB,%REB,%AST,%TOV,%STL,%BLK,%BLKA,%PF,%PFD,%PTS
0,Aaron Holiday,ATL,26,72,185,UCLA,USA,63,0.9,52.8,32,31,...,8.0,12.3,9.0,9.9,21.5,16.4,25.7,14.3,15.2,24.2,14.1,12.3
1,AJ Griffin,ATL,19,78,220,Duke,USA,72,1.5,57.7,34,38,...,7.6,11.6,11.9,11.9,9.8,11.0,21.1,8.2,15.3,15.3,7.7,18.7
2,Bogdan Bogdanovic,ATL,30,77,225,,Serbia,54,1.2,58.6,27,27,...,10.4,7.0,13.7,12.1,19.1,16.6,21.0,10.7,8.8,14.4,10.4,20.2
3,Bruno Fernando,ATL,24,82,240,Maryland,Angola,39,-8.9,57.1,13,26,...,25.6,42.2,28.1,32.4,17.8,16.4,12.8,59.6,23.3,38.5,27.3,17.2
4,Clint Capela,ATL,29,82,256,,Switzerland,65,-1.1,65.6,35,30,...,16.6,56.7,38.6,43.6,6.3,11.9,18.8,48.5,15.6,22.6,18.0,18.2


In [34]:
# Summarize the loaded frame: column names, non-null counts and dtypes
players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539 entries, 0 to 538
Data columns (total 54 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Name     539 non-null    object 
 1   TEAM     539 non-null    object 
 2   AGE      539 non-null    int64  
 3   HEIGHT   539 non-null    int64  
 4   WEIGHT   539 non-null    int64  
 5   COLLEGE  539 non-null    object 
 6   COUNTRY  539 non-null    object 
 7   GP       539 non-null    int64  
 8   NETRTG   539 non-null    float64
 9   TS%      539 non-null    float64
 10  W        539 non-null    int64  
 11  L        539 non-null    int64  
 12  MPG      539 non-null    float64
 13  PTS      539 non-null    float64
 14  FGM      539 non-null    float64
 15  FGA      539 non-null    float64
 16  FG%      539 non-null    float64
 17  3PM      539 non-null    float64
 18  3PA      539 non-null    float64
 19  3P%      539 non-null    float64
 20  FTM      539 non-null    float64
 21  FTA      539 non

In [35]:
# Count missing values in each column
players.isnull().sum()

Name       0
TEAM       0
AGE        0
HEIGHT     0
WEIGHT     0
COLLEGE    0
COUNTRY    0
GP         0
NETRTG     0
TS%        0
W          0
L          0
MPG        0
PTS        0
FGM        0
FGA        0
FG%        0
3PM        0
3PA        0
3P%        0
FTM        0
FTA        0
FT%        0
OREB       0
DREB       0
REB        0
AST        0
TOV        0
STL        0
BLK        0
PF         0
FP         0
DD2        0
TD3        0
+/-        0
MIN        0
USG%       0
%FGM       0
%FGA       0
%3PM       0
%3PA       0
%FTM       0
%FTA       0
%OREB      0
%DREB      0
%REB       0
%AST       0
%TOV       0
%STL       0
%BLK       0
%BLKA      0
%PF        0
%PFD       0
%PTS       0
dtype: int64

In [36]:
# TODO: load the 2023-2024 NBA roster (this cell has no code yet)

In [37]:
# TODO: inner join the player stats with the roster (this cell has no code yet)

In [38]:
# Process current player data

# COUNTRY -> 1 for 'USA', 0 for everything else.
# Assigning the column back is used instead of `replace(..., inplace=True)` on
# `players.COUNTRY`, which is not guaranteed to write through to `players`
# (it does not under pandas Copy-on-Write). A value that is already 1 stays 1,
# so the mapping gives the same result if it is applied twice.
players['COUNTRY'] = players['COUNTRY'].isin(['USA', 1]).astype(int)

# COLLEGE -> 0 when the player has no college, 1 otherwise.
# "No college" is either the literal string 'None' or a missing value: some
# pandas versions parse 'None' in a CSV as NaN, which the previous
# `!= 0` test would have turned into 1.
no_college = players['COLLEGE'].isna() | players['COLLEGE'].isin(['None', 0])
players['COLLEGE'] = (~no_college).astype(int)

# Dummify Teams (one indicator column per TEAM value; TEAM itself is dropped)
players = pd.get_dummies(players, columns=['TEAM'])

# Engineering features
# 703 is the conversion factor in this BMI formula.
# NOTE(review): assumes WEIGHT is in pounds and HEIGHT in inches -- confirm.
players['BMI'] = (players['WEIGHT'] / players['HEIGHT']**2) * 703
# Share of games played that were losses
players['Loss%'] = players['L'] / players['GP']

players.head()


Unnamed: 0,Name,AGE,HEIGHT,WEIGHT,COLLEGE,COUNTRY,GP,NETRTG,TS%,W,L,MPG,...,TEAM_OKC,TEAM_ORL,TEAM_PHI,TEAM_PHX,TEAM_POR,TEAM_SAC,TEAM_SAS,TEAM_TOR,TEAM_UTA,TEAM_WAS,BMI,Loss%
0,Aaron Holiday,26,72,185,1,1,63,0.9,52.8,32,31,13.4,...,0,0,0,0,0,0,0,0,0,0,25.08777,0.492063
1,AJ Griffin,19,78,220,1,1,72,1.5,57.7,34,38,19.5,...,0,0,0,0,0,0,0,0,0,0,25.420776,0.527778
2,Bogdan Bogdanovic,30,77,225,0,0,54,1.2,58.6,27,27,27.9,...,0,0,0,0,0,0,0,0,0,0,26.678192,0.5
3,Bruno Fernando,24,82,240,1,0,39,-8.9,57.1,13,26,10.4,...,0,0,0,0,0,0,0,0,0,0,25.092207,0.666667
4,Clint Capela,29,82,256,0,0,65,-1.1,65.6,35,30,26.6,...,0,0,0,0,0,0,0,0,0,0,26.765021,0.461538


In [39]:
# Identifier column carried alongside the model inputs
name = ['Name']

# The commented-out column lists below are not used by any live code in this cell.

# # Domain knowledge features
# good_features = ['AGE', 'BMI', 'Loss%', 'COLLEGE', 'COUNTRY', 'GP', 'NETRTG', 'MPG', 'PTS',
#                 'FGA', '3PA', 'FTA', 'OREB', 'DREB', 'AST', 'TOV', 'STL', 'BLK', 'PF',
#                 'DD2', 'TD3', '+/-', 'MIN', 'USG%', '%FGA', 
#                 '%3PA',  '%FTA', '%OREB', '%DREB', '%AST', '%TOV',
#                 '%STL', '%BLK', '%BLKA', '%PF', '%PFD', '%PTS']

# # Teams
# teams = ['TEAM_ATL', 'TEAM_BKN', 'TEAM_BOS', 'TEAM_CHA',
#        'TEAM_CHI', 'TEAM_CLE', 'TEAM_DAL', 'TEAM_DEN', 'TEAM_DET', 'TEAM_GSW',
#        'TEAM_HOU', 'TEAM_IND', 'TEAM_LAC', 'TEAM_LAL', 'TEAM_MEM', 'TEAM_MIA',
#        'TEAM_MIL', 'TEAM_MIN', 'TEAM_NOP', 'TEAM_NYK',
#        'TEAM_OKC', 'TEAM_ORL', 'TEAM_PHI', 'TEAM_PHX', 'TEAM_POR', 'TEAM_SAC',
#        'TEAM_SAS', 'TEAM_TOR', 'TEAM_UTA', 'TEAM_WAS']

# Columns to keep: the player name first, then the model's feature columns
name_features = [*name, *features]

In [40]:
# Keep only the columns in `name_features` (player name + model features),
# in that order
players = players.loc[:, name_features]
players

Unnamed: 0,Name,MIN,OREB,%FTA,%PTS,3PA,%TOV,PF,%OREB,STL,DD2,%PFD,...,%BLK,%DREB,%PF,NETRTG,Loss%,%STL,COUNTRY,COLLEGE,%BLKA,TD3,BMI,AGE
0,Aaron Holiday,845,0.4,8.0,12.3,1.4,16.4,1.3,12.3,0.6,0,14.1,...,14.3,9.0,24.2,0.9,0.492063,25.7,1,1,15.2,0,25.087770,26
1,AJ Griffin,1401,0.5,7.6,18.7,3.6,11.0,1.2,11.6,0.6,0,7.7,...,8.2,11.9,15.3,1.5,0.527778,21.1,1,1,15.3,0,25.420776,19
2,Bogdan Bogdanovic,1509,0.4,10.4,20.2,6.7,16.6,1.6,7.0,0.8,0,10.4,...,10.7,13.7,14.4,1.2,0.500000,21.0,0,0,8.8,0,26.678192,30
3,Bruno Fernando,405,1.4,25.6,17.2,0.1,16.4,1.9,42.2,0.2,1,27.3,...,59.6,28.1,38.5,-8.9,0.666667,12.8,0,1,23.3,0,25.092207,24
4,Clint Capela,1730,4.0,16.6,18.2,0.0,11.9,2.1,56.7,0.7,35,18.0,...,48.5,38.6,22.6,-1.1,0.461538,18.8,0,0,15.6,0,26.765021,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,Kyle Kuzma,2239,0.9,23.1,25.7,7.5,28.4,2.3,12.9,0.6,14,20.0,...,12.1,25.5,17.0,0.3,0.546875,11.6,1,1,26.8,1,23.679774,27
535,Monte Morris,1696,0.4,10.1,15.4,3.3,13.4,1.2,9.1,0.7,2,11.1,...,7.6,15.3,11.7,0.1,0.548387,21.3,1,1,11.7,0,23.493243,28
536,Quenton Jackson,135,0.2,31.0,17.7,1.3,8.3,1.1,6.3,0.4,0,35.6,...,7.7,6.6,16.9,-6.7,0.666667,13.8,1,1,18.8,0,20.512565,24
537,Taj Gibson,480,0.7,22.0,15.2,0.5,19.4,1.7,38.1,0.3,0,20.6,...,35.3,19.7,40.8,-1.1,0.571429,20.5,1,1,18.6,0,24.858406,38


In [41]:
# Save current player data to disk.
# The context manager closes the file as soon as the dump finishes, so the
# pickle is fully written before any later cell (or another process) reads it.
with open('CurrentPlayers.sav', 'wb') as current_players_file:
    pickle.dump(players, current_players_file)

In [42]:
# Model input matrix: every column of `players` except the name
testp = players.drop(columns=['Name'])
testp

Unnamed: 0,MIN,OREB,%FTA,%PTS,3PA,%TOV,PF,%OREB,STL,DD2,%PFD,%3PA,...,%BLK,%DREB,%PF,NETRTG,Loss%,%STL,COUNTRY,COLLEGE,%BLKA,TD3,BMI,AGE
0,845,0.4,8.0,12.3,1.4,16.4,1.3,12.3,0.6,0,14.1,16.1,...,14.3,9.0,24.2,0.9,0.492063,25.7,1,1,15.2,0,25.087770,26
1,1401,0.5,7.6,18.7,3.6,11.0,1.2,11.6,0.6,0,7.7,27.8,...,8.2,11.9,15.3,1.5,0.527778,21.1,1,1,15.3,0,25.420776,19
2,1509,0.4,10.4,20.2,6.7,16.6,1.6,7.0,0.8,0,10.4,36.8,...,10.7,13.7,14.4,1.2,0.500000,21.0,0,0,8.8,0,26.678192,30
3,405,1.4,25.6,17.2,0.1,16.4,1.9,42.2,0.2,1,27.3,1.4,...,59.6,28.1,38.5,-8.9,0.666667,12.8,0,1,23.3,0,25.092207,24
4,1730,4.0,16.6,18.2,0.0,11.9,2.1,56.7,0.7,35,18.0,0.1,...,48.5,38.6,22.6,-1.1,0.461538,18.8,0,0,15.6,0,26.765021,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,2239,0.9,23.1,25.7,7.5,28.4,2.3,12.9,0.6,14,20.0,32.1,...,12.1,25.5,17.0,0.3,0.546875,11.6,1,1,26.8,1,23.679774,27
535,1696,0.4,10.1,15.4,3.3,13.4,1.2,9.1,0.7,2,11.1,18.6,...,7.6,15.3,11.7,0.1,0.548387,21.3,1,1,11.7,0,23.493243,28
536,135,0.2,31.0,17.7,1.3,8.3,1.1,6.3,0.4,0,35.6,12.9,...,7.7,6.6,16.9,-6.7,0.666667,13.8,1,1,18.8,0,20.512565,24
537,480,0.7,22.0,15.2,0.5,19.4,1.7,38.1,0.3,0,20.6,7.3,...,35.3,19.7,40.8,-1.1,0.571429,20.5,1,1,18.6,0,24.858406,38


In [43]:
# Share of current players predicted as class 1.
# The predictions are 0/1 labels, so their mean equals sum / count; taking the
# mean runs inference once instead of twice.
model.predict(testp).mean()

0.09461966604823747

In [44]:
# Predicted class label (0 or 1) for each row of `testp`
model.predict(testp)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [45]:
# One-column frame holding just the player names
players[['Name']]

Unnamed: 0,Name
0,Aaron Holiday
1,AJ Griffin
2,Bogdan Bogdanovic
3,Bruno Fernando
4,Clint Capela
...,...
534,Kyle Kuzma
535,Monte Morris
536,Quenton Jackson
537,Taj Gibson


In [52]:
# Class probabilities for each row of `testp`: one row per player, one column per class
model.predict_proba(testp)

array([[0.85, 0.15],
       [0.76, 0.24],
       [0.28, 0.72],
       ...,
       [0.89, 0.11],
       [0.95, 0.05],
       [0.75, 0.25]])