This Random Forest Classifier Model is intended to determine the weight of categorical factors in determining the NBA MVP

In [261]:
# Import Statements
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [262]:
# pip commands
%pip install numpy pandas scikit-learn matplotlib

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [263]:
# Read CSV files
advanced = pd.read_csv("Advanced.csv")
per_thirty_six = pd.read_csv("Per 36 Minutes.csv")
player_award_shares = pd.read_csv("Player Award Shares.csv")
player_per_game = pd.read_csv("Player Per Game.csv")
player_shooting = pd.read_csv("Player Shooting.csv")
player_totals = pd.read_csv("Player Totals.csv")

In [264]:
advanced = advanced.dropna()
print(advanced.columns)
advanced.head()

Index(['seas_id', 'season', 'player_id', 'player', 'birth_year', 'pos', 'age',
       'experience', 'lg', 'tm', 'g', 'mp', 'per', 'ts_percent', 'x3p_ar',
       'f_tr', 'orb_percent', 'drb_percent', 'trb_percent', 'ast_percent',
       'stl_percent', 'blk_percent', 'tov_percent', 'usg_percent', 'ows',
       'dws', 'ws', 'ws_48', 'obpm', 'dbpm', 'bpm', 'vorp'],
      dtype='object')


Unnamed: 0,seas_id,season,player_id,player,birth_year,pos,age,experience,lg,tm,...,tov_percent,usg_percent,ows,dws,ws,ws_48,obpm,dbpm,bpm,vorp
249,31385,2024,5066,Johnny Davis,2002.0,SG,21.0,2,NBA,WAS,...,11.1,15.6,0.0,0.0,0.0,0.016,-4.0,-1.4,-5.4,-0.1
810,30769,2023,5066,Johnny Davis,2002.0,SG,20.0,1,NBA,WAS,...,8.0,20.4,-0.6,0.3,-0.3,-0.031,-5.8,-1.7,-7.4,-0.6
1238,29708,2022,4910,Brandon Williams,1999.0,PG,22.0,1,NBA,POR,...,15.2,25.9,-0.7,0.2,-0.5,-0.037,-2.6,-3.1,-5.7,-0.6
1443,29913,2022,4619,George King,1994.0,SF,28.0,2,NBA,DAL,...,14.5,16.3,-0.1,0.0,-0.1,-0.296,-16.5,-3.5,-20.1,-0.1
2469,29422,2021,4548,Mike James,1990.0,PG,30.0,2,NBA,BRK,...,16.1,22.9,0.0,0.1,0.1,0.028,-2.1,-2.3,-4.4,-0.1


In [265]:
per_thirty_six = per_thirty_six.dropna()
drop = ['seas_id', 'player_id', 'birth_year', 'pos', 'age', 'experience', 'lg', 'tm', 'g', 'mp']
per_thirty_six = per_thirty_six.drop(columns = drop)
print(per_thirty_six.columns)
per_thirty_six.head()

Index(['season', 'player', 'gs', 'fg_per_36_min', 'fga_per_36_min',
       'fg_percent', 'x3p_per_36_min', 'x3pa_per_36_min', 'x3p_percent',
       'x2p_per_36_min', 'x2pa_per_36_min', 'x2p_percent', 'ft_per_36_min',
       'fta_per_36_min', 'ft_percent', 'orb_per_36_min', 'drb_per_36_min',
       'trb_per_36_min', 'ast_per_36_min', 'stl_per_36_min', 'blk_per_36_min',
       'tov_per_36_min', 'pf_per_36_min', 'pts_per_36_min'],
      dtype='object')


Unnamed: 0,season,player,gs,fg_per_36_min,fga_per_36_min,fg_percent,x3p_per_36_min,x3pa_per_36_min,x3p_percent,x2p_per_36_min,...,ft_percent,orb_per_36_min,drb_per_36_min,trb_per_36_min,ast_per_36_min,stl_per_36_min,blk_per_36_min,tov_per_36_min,pf_per_36_min,pts_per_36_min
249,2024,Johnny Davis,0.0,5.6,11.2,0.5,0.3,0.9,0.333,5.3,...,0.8,2.1,3.2,5.3,1.5,2.1,0.3,1.5,2.7,12.7
810,2023,Johnny Davis,5.0,5.6,14.6,0.386,1.4,6.0,0.243,4.2,...,0.519,0.8,4.6,5.4,2.4,0.9,0.7,1.4,4.1,13.9
1238,2022,Brandon Williams,16.0,5.9,15.9,0.372,1.7,6.0,0.292,4.2,...,0.701,1.1,3.1,4.2,5.3,1.4,0.6,3.3,2.9,17.4
1443,2022,George King,0.0,0.0,9.5,0.0,0.0,7.6,0.0,0.0,...,0.5,0.0,9.5,9.5,0.0,0.0,0.0,1.9,7.6,1.9
2469,2021,Mike James,1.0,5.2,14.0,0.37,1.7,4.7,0.355,3.5,...,0.778,0.5,4.4,4.9,8.2,0.9,0.2,3.1,1.5,15.3


In [266]:
# Merge first two dfs
df = pd.merge(advanced, per_thirty_six, on=['season', 'player'], how='inner')
df.columns

Index(['seas_id', 'season', 'player_id', 'player', 'birth_year', 'pos', 'age',
       'experience', 'lg', 'tm', 'g', 'mp', 'per', 'ts_percent', 'x3p_ar',
       'f_tr', 'orb_percent', 'drb_percent', 'trb_percent', 'ast_percent',
       'stl_percent', 'blk_percent', 'tov_percent', 'usg_percent', 'ows',
       'dws', 'ws', 'ws_48', 'obpm', 'dbpm', 'bpm', 'vorp', 'gs',
       'fg_per_36_min', 'fga_per_36_min', 'fg_percent', 'x3p_per_36_min',
       'x3pa_per_36_min', 'x3p_percent', 'x2p_per_36_min', 'x2pa_per_36_min',
       'x2p_percent', 'ft_per_36_min', 'fta_per_36_min', 'ft_percent',
       'orb_per_36_min', 'drb_per_36_min', 'trb_per_36_min', 'ast_per_36_min',
       'stl_per_36_min', 'blk_per_36_min', 'tov_per_36_min', 'pf_per_36_min',
       'pts_per_36_min'],
      dtype='object')

In [267]:
player_award_shares = player_award_shares.dropna()
player_award_shares = player_award_shares[player_award_shares['award'] == 'nba mvp']
player_award_shares = player_award_shares[['season', 'award', 'player', 'winner']]
print(player_award_shares.columns)
player_award_shares.head()

Index(['season', 'award', 'player', 'winner'], dtype='object')


Unnamed: 0,season,award,player,winner
25,2023,nba mvp,Joel Embiid,True
26,2023,nba mvp,Nikola Jokić,False
27,2023,nba mvp,Giannis Antetokounmpo,False
28,2023,nba mvp,Jayson Tatum,False
29,2023,nba mvp,Shai Gilgeous-Alexander,False


In [268]:
df = pd.merge(df, player_award_shares, on=['season', 'player'], how='inner')
df.columns

Index(['seas_id', 'season', 'player_id', 'player', 'birth_year', 'pos', 'age',
       'experience', 'lg', 'tm', 'g', 'mp', 'per', 'ts_percent', 'x3p_ar',
       'f_tr', 'orb_percent', 'drb_percent', 'trb_percent', 'ast_percent',
       'stl_percent', 'blk_percent', 'tov_percent', 'usg_percent', 'ows',
       'dws', 'ws', 'ws_48', 'obpm', 'dbpm', 'bpm', 'vorp', 'gs',
       'fg_per_36_min', 'fga_per_36_min', 'fg_percent', 'x3p_per_36_min',
       'x3pa_per_36_min', 'x3p_percent', 'x2p_per_36_min', 'x2pa_per_36_min',
       'x2p_percent', 'ft_per_36_min', 'fta_per_36_min', 'ft_percent',
       'orb_per_36_min', 'drb_per_36_min', 'trb_per_36_min', 'ast_per_36_min',
       'stl_per_36_min', 'blk_per_36_min', 'tov_per_36_min', 'pf_per_36_min',
       'pts_per_36_min', 'award', 'winner'],
      dtype='object')

In [269]:
player_per_game = player_per_game.dropna()
drop = ['seas_id', 'player_id', 'birth_year', 'pos', 'age', 'experience', 'lg', 'tm', 'g', 'gs', 'fg_percent']
player_per_game = player_per_game.drop(columns = drop)
print(player_per_game.columns)
player_per_game.head()

Index(['season', 'player', 'mp_per_game', 'fg_per_game', 'fga_per_game',
       'x3p_per_game', 'x3pa_per_game', 'x3p_percent', 'x2p_per_game',
       'x2pa_per_game', 'x2p_percent', 'e_fg_percent', 'ft_per_game',
       'fta_per_game', 'ft_percent', 'orb_per_game', 'drb_per_game',
       'trb_per_game', 'ast_per_game', 'stl_per_game', 'blk_per_game',
       'tov_per_game', 'pf_per_game', 'pts_per_game'],
      dtype='object')


Unnamed: 0,season,player,mp_per_game,fg_per_game,fga_per_game,x3p_per_game,x3pa_per_game,x3p_percent,x2p_per_game,x2pa_per_game,...,ft_percent,orb_per_game,drb_per_game,trb_per_game,ast_per_game,stl_per_game,blk_per_game,tov_per_game,pf_per_game,pts_per_game
249,2024,Johnny Davis,9.4,1.5,2.9,0.1,0.2,0.333,1.4,2.7,...,0.8,0.5,0.8,1.4,0.4,0.5,0.1,0.4,0.7,3.3
810,2023,Johnny Davis,15.1,2.4,6.1,0.6,2.5,0.243,1.8,3.6,...,0.519,0.3,1.9,2.3,1.0,0.4,0.3,0.6,1.7,5.8
1238,2022,Brandon Williams,26.7,4.4,11.8,1.3,4.4,0.292,3.1,7.3,...,0.701,0.8,2.3,3.1,3.9,1.0,0.4,2.4,2.2,12.9
1443,2022,George King,4.8,0.0,1.3,0.0,1.0,0.0,0.0,0.3,...,0.5,0.0,1.3,1.3,0.0,0.0,0.0,0.3,1.0,0.3
2469,2021,Mike James,18.2,2.6,7.1,0.8,2.4,0.355,1.8,4.7,...,0.778,0.2,2.2,2.5,4.2,0.5,0.1,1.5,0.8,7.7


In [270]:
df = pd.merge(df, player_per_game, on=['season', 'player'], how='inner')
df.columns

Index(['seas_id', 'season', 'player_id', 'player', 'birth_year', 'pos', 'age',
       'experience', 'lg', 'tm', 'g', 'mp', 'per', 'ts_percent', 'x3p_ar',
       'f_tr', 'orb_percent', 'drb_percent', 'trb_percent', 'ast_percent',
       'stl_percent', 'blk_percent', 'tov_percent', 'usg_percent', 'ows',
       'dws', 'ws', 'ws_48', 'obpm', 'dbpm', 'bpm', 'vorp', 'gs',
       'fg_per_36_min', 'fga_per_36_min', 'fg_percent', 'x3p_per_36_min',
       'x3pa_per_36_min', 'x3p_percent_x', 'x2p_per_36_min', 'x2pa_per_36_min',
       'x2p_percent_x', 'ft_per_36_min', 'fta_per_36_min', 'ft_percent_x',
       'orb_per_36_min', 'drb_per_36_min', 'trb_per_36_min', 'ast_per_36_min',
       'stl_per_36_min', 'blk_per_36_min', 'tov_per_36_min', 'pf_per_36_min',
       'pts_per_36_min', 'award', 'winner', 'mp_per_game', 'fg_per_game',
       'fga_per_game', 'x3p_per_game', 'x3pa_per_game', 'x3p_percent_y',
       'x2p_per_game', 'x2pa_per_game', 'x2p_percent_y', 'e_fg_percent',
       'ft_per_game', 'ft

In [271]:
player_shooting.dropna()
drop = ['seas_id', 'player_id', 'birth_year', 'pos', 'age', 'experience', 'lg', 'tm', 'g', 'mp']
player_shooting = player_shooting.drop(columns = drop)
print(player_shooting.columns)
player_shooting.head()

Index(['season', 'player', 'fg_percent', 'avg_dist_fga',
       'percent_fga_from_x2p_range', 'percent_fga_from_x0_3_range',
       'percent_fga_from_x3_10_range', 'percent_fga_from_x10_16_range',
       'percent_fga_from_x16_3p_range', 'percent_fga_from_x3p_range',
       'fg_percent_from_x2p_range', 'fg_percent_from_x0_3_range',
       'fg_percent_from_x3_10_range', 'fg_percent_from_x10_16_range',
       'fg_percent_from_x16_3p_range', 'fg_percent_from_x3p_range',
       'percent_assisted_x2p_fg', 'percent_assisted_x3p_fg',
       'percent_dunks_of_fga', 'num_of_dunks', 'percent_corner_3s_of_3pa',
       'corner_3_point_percent', 'num_heaves_attempted', 'num_heaves_made'],
      dtype='object')


Unnamed: 0,season,player,fg_percent,avg_dist_fga,percent_fga_from_x2p_range,percent_fga_from_x0_3_range,percent_fga_from_x3_10_range,percent_fga_from_x10_16_range,percent_fga_from_x16_3p_range,percent_fga_from_x3p_range,...,fg_percent_from_x16_3p_range,fg_percent_from_x3p_range,percent_assisted_x2p_fg,percent_assisted_x3p_fg,percent_dunks_of_fga,num_of_dunks,percent_corner_3s_of_3pa,corner_3_point_percent,num_heaves_attempted,num_heaves_made
0,2024,A.J. Green,0.367,23.6,0.133,0.0,0.067,0.033,0.033,0.867,...,1.0,0.308,1.0,0.875,0.0,0,0.269,0.429,0,0
1,2024,AJ Griffin,0.303,21.4,0.273,0.03,0.121,0.061,0.061,0.727,...,0.0,0.333,0.5,0.875,0.03,1,0.25,0.333,0,0
2,2024,Aaron Gordon,0.486,8.6,0.801,0.481,0.215,0.083,0.022,0.199,...,0.5,0.222,0.613,0.625,0.204,35,0.25,0.111,1,0
3,2024,Aaron Holiday,0.452,16.8,0.493,0.137,0.164,0.137,0.055,0.507,...,0.25,0.351,0.35,0.846,0.0,0,0.243,0.444,0,0
4,2024,Aaron Nesmith,0.528,14.9,0.472,0.278,0.157,0.028,0.009,0.528,...,0.0,0.439,0.531,1.0,0.028,2,0.421,0.583,0,0


In [272]:
df = pd.merge(df, player_shooting, on=['season', 'player'], how='inner')
df.columns

Index(['seas_id', 'season', 'player_id', 'player', 'birth_year', 'pos', 'age',
       'experience', 'lg', 'tm', 'g', 'mp', 'per', 'ts_percent', 'x3p_ar',
       'f_tr', 'orb_percent', 'drb_percent', 'trb_percent', 'ast_percent',
       'stl_percent', 'blk_percent', 'tov_percent', 'usg_percent', 'ows',
       'dws', 'ws', 'ws_48', 'obpm', 'dbpm', 'bpm', 'vorp', 'gs',
       'fg_per_36_min', 'fga_per_36_min', 'fg_percent_x', 'x3p_per_36_min',
       'x3pa_per_36_min', 'x3p_percent_x', 'x2p_per_36_min', 'x2pa_per_36_min',
       'x2p_percent_x', 'ft_per_36_min', 'fta_per_36_min', 'ft_percent_x',
       'orb_per_36_min', 'drb_per_36_min', 'trb_per_36_min', 'ast_per_36_min',
       'stl_per_36_min', 'blk_per_36_min', 'tov_per_36_min', 'pf_per_36_min',
       'pts_per_36_min', 'award', 'winner', 'mp_per_game', 'fg_per_game',
       'fga_per_game', 'x3p_per_game', 'x3pa_per_game', 'x3p_percent_y',
       'x2p_per_game', 'x2pa_per_game', 'x2p_percent_y', 'e_fg_percent',
       'ft_per_game', '

In [273]:
player_totals.dropna()
drop = ['seas_id', 'player_id', 'birth_year', 'pos', 'age', 'experience', 'lg', 'tm', 'g', 'mp', 'gs']
player_totals = player_totals.drop(columns = drop)
print(player_totals.columns)
player_totals.head()

Index(['season', 'player', 'fg', 'fga', 'fg_percent', 'x3p', 'x3pa',
       'x3p_percent', 'x2p', 'x2pa', 'x2p_percent', 'e_fg_percent', 'ft',
       'fta', 'ft_percent', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov',
       'pf', 'pts'],
      dtype='object')


Unnamed: 0,season,player,fg,fga,fg_percent,x3p,x3pa,x3p_percent,x2p,x2pa,...,ft_percent,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,2024,A.J. Green,11,30,0.367,8.0,26.0,0.308,3,4,...,1.0,1.0,7.0,8.0,8,0.0,0.0,0.0,8,31
1,2024,AJ Griffin,10,33,0.303,8.0,24.0,0.333,2,9,...,1.0,2.0,8.0,10.0,3,1.0,0.0,4.0,5,30
2,2024,Aaron Gordon,88,181,0.486,8.0,36.0,0.222,80,145,...,0.52,42.0,73.0,115.0,61,18.0,14.0,29.0,31,210
3,2024,Aaron Holiday,33,73,0.452,13.0,37.0,0.351,20,36,...,0.857,2.0,24.0,26.0,28,8.0,1.0,6.0,26,85
4,2024,Aaron Nesmith,57,108,0.528,25.0,57.0,0.439,32,51,...,0.654,15.0,34.0,49.0,15,17.0,8.0,9.0,49,156


In [274]:
df = pd.merge(df, player_totals, on=['season', 'player'], how='inner')
df.columns

Index(['seas_id', 'season', 'player_id', 'player', 'birth_year', 'pos', 'age',
       'experience', 'lg', 'tm',
       ...
       'ft_percent', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
       'pts'],
      dtype='object', length=122)

In [286]:
# Load each CSV file into a DataFrame
files = {
    "Advanced": "/path/to/Advanced.csv",
    "Per36Minutes": "/path/to/Per 36 Minutes.csv",
    "AwardShares": "/path/to/Player Award Shares.csv",
    "PerGame": "/path/to/Player Per Game.csv",
    "Shooting": "/path/to/Player Shooting.csv",
    "Totals": "/path/to/Player Totals.csv"
}

dfs = {name: pd.read_csv(path) for name, path in files.items()}

# Merging all DataFrames on common keys
common_keys = {'age', 'player', 'player_id', 'seas_id', 'season', 'tm'}
merged_df = dfs['Advanced']
for name, df in dfs.items():
    if name != 'Advanced':
        merged_df = pd.merge(merged_df, df, on=list(common_keys), how='outer')

# Creating the Y target variable (MVP award)
merged_df['Y'] = merged_df['award'] == 'nba mvp'

# Identifying potential numerical columns for X and removing redundant/irrelevant ones
numerical_columns = merged_df.select_dtypes(include=['float64', 'int64']).columns.tolist()
irrelevant_or_duplicate_cols = ['birth_year_x', 'birth_year_y', 'experience_x', 'experience_y', 
                                'g_x', 'g_y', 'mp_x', 'mp_y', 'fg_percent_x', 'fg_percent_y', 
                                'e_fg_percent_x', 'e_fg_percent_y', 'ft_percent_x', 'ft_percent_y']
final_numerical_columns = [col for col in numerical_columns if col not in irrelevant_or_duplicate_cols]

# Creating the X features DataFrame and handling missing values
X = merged_df[final_numerical_columns]
missing_value_threshold = 0.5 * len(merged_df)
columns_to_exclude = X.columns[X.isnull().sum() > missing_value_threshold]
X = X.drop(columns=columns_to_exclude).fillna(X.mean())

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, merged_df['Y'], test_size=0.2, random_state=42)


FileNotFoundError: [Errno 2] No such file or directory: '/path/to/Advanced.csv'

In [275]:
# drop all values from this season for now
df = df[df['season'] != 2024]

In [285]:
# Begin with Random Forest. Split into X and Y
Y = df['award'] == 'nba mvp'
drop = ['seas_id', 'player_id', 'birth_year', 'pos', 'age', 'lg', 'tm', 'g', 'mp']
X = df.drop(columns = drop)
print(X.shape)
print(Y.shape)

(2, 120)
(2,)


In [279]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [280]:
# Parameters
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [10, 20, 30, None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

In [281]:
# Parameter Grid
param_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}

In [282]:
# Fit for Best Hyperparameters
rf_model = RandomForestClassifier()
rf_Grid = GridSearchCV(estimator = rf_model, param_grid = param_grid, cv = 5, verbose = 2, n_jobs = -1)
rf_Grid.fit(X_train, y_train)
rf_Grid.best_params_

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


ValueError: Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=1.

In [None]:
# Metrics
train_acc = rf_Grid.score(X_train, y_train)
test_acc = rf_Grid.score(X_test, y_test)
print(f'Train Accuracy: {train_acc}')
print(f'Test Accuracy: {test_acc}')

importances = rf_Grid.best_estimator_.feature_importances_
feature_names = X.columns
feature_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False)

In [None]:
# Feature importances
plt.figure(figsize=(10,6))
feature_importances.plot(kind='bar')
plt.title('Feature Importances')
plt.show()