# Predicting NBA MVP

The goal of this project is to predict the percentage share of MVP Votes that a player received in a given year.

In [60]:
import warnings
warnings.simplefilter(action='ignore')

# EDA Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
%matplotlib inline

# ML Libraries
from sklearn.model_selection import cross_val_score, train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer
from yellowbrick.model_selection import FeatureImportances
from sklearn.feature_selection import SequentialFeatureSelector

# Notebook display settings: never truncate rows or columns, and widen the
# text output so wide frames are not wrapped.
for display_option, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(display_option, option_value)

**Load Dataset**

In [13]:
# Read the player/team statistics from the CSV in the working directory,
# then preview the first five rows.
stats = pd.read_csv(filepath_or_buffer='nba_stats.csv')
stats.head(n=5)

Unnamed: 0,Player,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Pts Won,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,PF,27.0,82.0,21.0,26.4,3.1,6.6,0.476,0.1,0.7,0.2,3.0,5.9,0.507,0.486,2.7,3.7,0.738,2.5,3.8,6.3,0.9,0.7,0.3,1.2,1.4,9.1,1991,0.0,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
1,Byron Scott,SG,29.0,82.0,82.0,32.1,6.1,12.8,0.477,0.9,2.7,0.324,5.2,10.1,0.517,0.51,1.4,1.8,0.797,0.7,2.3,3.0,2.2,1.2,0.3,1.0,1.8,14.5,1991,0.0,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
2,Elden Campbell,PF,22.0,52.0,0.0,7.3,1.1,2.4,0.455,0.0,0.0,,1.1,2.4,0.455,0.455,0.6,0.9,0.653,0.8,1.1,1.8,0.2,0.2,0.7,0.3,1.4,2.8,1991,0.0,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
3,Irving Thomas,PF,25.0,26.0,0.0,4.2,0.7,1.9,0.34,0.0,0.0,,0.7,1.9,0.34,0.34,0.5,0.8,0.571,0.5,0.7,1.2,0.4,0.2,0.0,0.5,0.9,1.8,1991,0.0,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
4,James Worthy,SF,29.0,78.0,74.0,38.6,9.2,18.7,0.492,0.3,1.2,0.289,8.8,17.5,0.505,0.501,2.7,3.4,0.797,1.4,3.2,4.6,3.5,1.3,0.4,1.6,1.5,21.4,1991,0.0,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73


In [14]:
# Summarise column dtypes and non-null counts to spot missing data.
stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13043 entries, 0 to 13042
Data columns (total 40 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Player   13021 non-null  object 
 1   Pos      13014 non-null  object 
 2   Age      13014 non-null  float64
 3   G        13014 non-null  float64
 4   GS       13014 non-null  float64
 5   MP       13014 non-null  float64
 6   FG       13014 non-null  float64
 7   FGA      13014 non-null  float64
 8   FG%      12955 non-null  float64
 9   3P       13014 non-null  float64
 10  3PA      13014 non-null  float64
 11  3P%      11154 non-null  float64
 12  2P       13014 non-null  float64
 13  2PA      13014 non-null  float64
 14  2P%      12915 non-null  float64
 15  eFG%     12955 non-null  float64
 16  FT       13014 non-null  float64
 17  FTA      13014 non-null  float64
 18  FT%      12530 non-null  float64
 19  ORB      13014 non-null  float64
 20  DRB      13014 non-null  float64
 21  TRB      130

### Data Cleaning

**Find & Handle Missing Values**

In [15]:
# Number of missing values in each column.
stats.isna().sum()

Player       22
Pos          29
Age          29
G            29
GS           29
MP           29
FG           29
FGA          29
FG%          88
3P           29
3PA          29
3P%        1889
2P           29
2PA          29
2P%         128
eFG%         88
FT           29
FTA          29
FT%         513
ORB          29
DRB          29
TRB          29
AST          29
STL          29
BLK          29
TOV          29
PF           29
PTS          29
Year          0
Pts Won      22
Pts Max      22
Share        22
Team          7
W           323
L           323
W/L%        323
GB          323
PS/G        323
PA/G        323
SRS         323
dtype: int64

We'll delete all rows without player name data.

In [16]:
# Remove, in place, every row whose Player value is missing.
stats.dropna(subset=['Player'], inplace=True)

We'll also drop the rows that have no statistical data, identified by a missing `Pos` value.

In [17]:
# Remove, in place, every row whose Pos value is missing.
stats.dropna(subset=['Pos'], inplace=True)

We'll review the rows without Win/Loss data.

In [18]:
# Show the rows that have no value in the W (wins) column.
stats[stats['W'].isna()]

Unnamed: 0,Player,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Pts Won,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
2496,Alan Anderson,SF,31.0,78.0,26.0,22.7,2.5,6.2,0.4,1.1,3.2,0.339,1.4,3.0,0.464,0.487,1.2,1.5,0.78,0.5,1.7,2.2,1.0,0.6,0.1,0.8,1.9,7.2,2014,0.0,0.0,0.0,Brooklyn Nets,,,,,,,
2497,Andray Blatche,PF,27.0,73.0,7.0,22.2,4.4,9.2,0.476,0.2,0.7,0.278,4.2,8.5,0.494,0.487,2.2,3.0,0.742,1.6,3.7,5.3,1.5,1.0,0.5,1.5,2.3,11.2,2014,0.0,0.0,0.0,Brooklyn Nets,,,,,,,
2498,Andrei Kirilenko,PF,32.0,45.0,4.0,19.0,1.8,3.6,0.513,0.0,0.1,0.2,1.8,3.4,0.523,0.516,1.4,2.6,0.513,1.2,2.0,3.2,1.6,0.9,0.4,1.2,1.4,5.0,2014,0.0,0.0,0.0,Brooklyn Nets,,,,,,,
2499,Brook Lopez,C,25.0,17.0,17.0,31.4,7.6,13.5,0.563,0.0,0.1,0.0,7.6,13.4,0.566,0.563,5.5,6.8,0.817,2.3,3.7,6.0,0.9,0.5,1.8,1.6,3.1,20.7,2014,0.0,0.0,0.0,Brooklyn Nets,,,,,,,
2500,Deron Williams,PG,29.0,64.0,58.0,32.2,5.0,11.2,0.45,1.5,4.2,0.366,3.5,7.0,0.5,0.518,2.7,3.4,0.801,0.2,2.4,2.6,6.1,1.5,0.2,2.2,2.3,14.3,2014,0.0,0.0,0.0,Brooklyn Nets,,,,,,,
2501,Jason Collins,C,35.0,22.0,1.0,7.8,0.5,1.1,0.458,0.0,0.1,0.0,0.5,1.0,0.524,0.458,0.1,0.2,0.75,0.3,0.5,0.9,0.2,0.4,0.0,0.3,1.4,1.1,2014,0.0,0.0,0.0,Brooklyn Nets,,,,,,,
2502,Jason Terry,PG,36.0,35.0,0.0,16.3,1.6,4.3,0.362,1.1,2.9,0.379,0.5,1.4,0.327,0.49,0.3,0.4,0.667,0.1,0.9,1.1,1.6,0.4,0.0,0.8,1.4,4.5,2014,0.0,0.0,0.0,Brooklyn Nets,,,,,,,
2503,Joe Johnson,SG,32.0,79.0,79.0,32.6,5.8,12.9,0.454,2.1,5.1,0.401,3.8,7.8,0.489,0.533,2.0,2.5,0.815,0.6,2.8,3.4,2.7,0.6,0.1,1.5,1.6,15.8,2014,0.0,0.0,0.0,Brooklyn Nets,,,,,,,
2504,Jorge Gutiérrez,PG,25.0,15.0,2.0,16.3,1.7,3.6,0.463,0.2,0.8,0.25,1.5,2.8,0.524,0.491,0.6,0.8,0.75,0.2,1.3,1.5,2.0,0.7,0.1,0.9,2.5,4.1,2014,0.0,0.0,0.0,Brooklyn Nets,,,,,,,
2505,Kevin Garnett,C,37.0,54.0,54.0,20.5,2.9,6.6,0.441,0.0,0.1,0.0,2.9,6.5,0.445,0.441,0.7,0.9,0.809,1.1,5.5,6.6,1.5,0.8,0.7,1.3,2.3,6.5,2014,0.0,0.0,0.0,Brooklyn Nets,,,,,,,


In [21]:
# Inspect every season recorded for one affected player (Andray Blatche) to
# see which of his team/season rows lack win/loss data.
stats.loc[stats['Player'] == 'Andray Blatche', :]

Unnamed: 0,Player,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Pts Won,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
2497,Andray Blatche,PF,27.0,73.0,7.0,22.2,4.4,9.2,0.476,0.2,0.7,0.278,4.2,8.5,0.494,0.487,2.2,3.0,0.742,1.6,3.7,5.3,1.5,1.0,0.5,1.5,2.3,11.2,2014,0.0,0.0,0.0,Brooklyn Nets,,,,,,,
3734,Andray Blatche,PF,23.0,81.0,36.0,27.9,5.9,12.3,0.478,0.2,0.5,0.295,5.7,11.8,0.487,0.485,2.1,2.9,0.744,2.0,4.3,6.3,2.1,1.0,0.9,2.2,2.7,14.1,2010,0.0,0.0,0.0,Washington Wizards,26.0,56.0,0.317,33.0,96.2,101.0,-4.72
4703,Andray Blatche,PF,19.0,29.0,0.0,6.0,0.9,2.3,0.388,0.1,0.4,0.231,0.8,1.9,0.426,0.41,0.3,0.4,0.833,0.4,0.9,1.3,0.3,0.2,0.2,0.4,1.2,2.2,2006,0.0,0.0,0.0,Washington Wizards,42.0,40.0,0.512,10.0,101.7,99.8,1.57
4717,Andray Blatche,PF,20.0,56.0,13.0,12.2,1.5,3.5,0.437,0.1,0.5,0.148,1.5,3.0,0.482,0.447,0.5,0.9,0.612,1.4,2.0,3.4,0.7,0.3,0.6,0.9,1.7,3.7,2007,0.0,0.0,0.0,Washington Wizards,41.0,41.0,0.5,3.0,104.3,104.9,-0.8
4733,Andray Blatche,PF,21.0,82.0,15.0,20.4,3.1,6.5,0.474,0.0,0.2,0.231,3.1,6.4,0.48,0.477,1.3,1.9,0.695,2.0,3.2,5.2,1.1,0.6,1.4,1.4,3.1,7.5,2008,0.0,0.0,0.0,Washington Wizards,43.0,39.0,0.524,9.0,98.8,99.2,-0.61
4745,Andray Blatche,C,22.0,71.0,36.0,24.0,4.2,8.9,0.471,0.1,0.3,0.238,4.1,8.6,0.479,0.475,1.6,2.3,0.704,1.8,3.5,5.3,1.7,0.7,1.0,1.6,3.0,10.0,2009,0.0,0.0,0.0,Washington Wizards,19.0,63.0,0.232,40.0,96.1,103.5,-6.98
4758,Andray Blatche,PF,24.0,64.0,63.0,33.9,6.7,15.0,0.445,0.1,0.3,0.222,6.6,14.7,0.449,0.447,3.4,4.4,0.777,2.9,5.4,8.2,2.3,1.5,0.8,2.7,2.8,16.8,2011,0.0,0.0,0.0,Washington Wizards,23.0,59.0,0.28,35.0,97.3,104.7,-7.3
4770,Andray Blatche,PF,25.0,26.0,13.0,24.1,3.6,9.4,0.38,0.1,0.3,0.286,3.5,9.2,0.382,0.384,1.3,1.9,0.673,1.6,4.2,5.8,1.1,0.8,0.7,1.4,2.1,8.5,2012,0.0,0.0,0.0,Washington Wizards,20.0,46.0,0.303,26.0,93.6,98.4,-5.14
4786,Andray Blatche,C,26.0,82.0,8.0,19.0,4.2,8.2,0.512,0.0,0.3,0.136,4.2,8.0,0.524,0.514,1.8,2.6,0.685,2.0,3.1,5.1,1.0,1.0,0.7,1.5,2.0,10.3,2013,0.0,0.0,0.0,Brooklyn Nets,,,,,,,


In [22]:
# Tally the rows that lack win/loss data by team and by season, to see where
# the gaps come from.
team_name = dict()
year_name = dict()
for index, row in stats.loc[stats['W'].isnull()].iterrows():
    # Look the fields up by column label. Positional access such as row[-8]
    # depends on the column order and on pandas' deprecated integer-position
    # fallback for label-indexed Series.
    team = row['Team']
    year = row['Year']
    team_name[team] = team_name.get(team, 0) + 1
    year_name[year] = year_name.get(year, 0) + 1

In [24]:
# Print the per-team tally, then the per-season tally, one per line.
print(team_name, year_name, sep='\n')

{'Brooklyn Nets': 160, 'Charlotte Hornets': 156}
{2014: 15, 2015: 15, 2019: 17, 2021: 18, 2018: 15, 1993: 11, 1994: 12, 1995: 15, 2013: 16, 2016: 15, 2017: 14, 1992: 13, 1996: 13, 1997: 11, 1998: 12, 2000: 15, 2001: 15, 2002: 14, 2022: 14, 1999: 12, 2020: 21, 1991: 13}


All of the missing win/loss data comes from two teams, the Brooklyn Nets and the Charlotte Hornets, spread across many of the seasons in the dataset. We'll fill these missing values with a constant, 0.

In [49]:
# Rows with no W value: replace every missing value in those rows with 0.
missing_wl_rows = stats['W'].isna()
stats.loc[missing_wl_rows] = stats.loc[missing_wl_rows].fillna(0)

# Re-evaluate the filter; the result should now be empty.
stats.loc[stats['W'].isna()]

Unnamed: 0,Player,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Pts Won,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS


In [50]:
# Remaining missing values per column.
stats.isna().sum()

Player        0
Pos           0
Age           0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          56
3P            0
3PA           0
3P%        1832
2P            0
2PA           0
2P%          95
eFG%         56
FT            0
FTA           0
FT%         468
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

Let's review the missing values in FG%, 3P%, 2P%, eFG%, and FT%. Our working hypothesis is that a missing percentage means the player did not attempt any shots of that type (field goals, 3-pointers, 2-pointers, or free throws), so the percentage is undefined.

Let's test this hypothesis.

In [52]:
# Random sample of 20 rows with a missing 3P%, shown next to the 3P column.
stats.loc[stats['3P%'].isna(), ['Player', '3P', '3P%']].sample(n=20)

Unnamed: 0,Player,3P,3P%
2,Elden Campbell,0.0,
3,Irving Thomas,0.0,
17,Jack Haley,0.0,
19,Keith Owens,0.0,
32,James Edwards,0.0,
45,Jerrod Mustaf,0.0,
49,Mark West,0.0,
53,Aaron Swinson,0.0,
54,Antonio Lang,0.0,
65,Wayman Tisdale,0.0,


In [53]:
# Random sample of 20 rows with a missing FT%, shown next to the FT column.
# (`sample` was misspelled `sampel`, which raises AttributeError.)
stats.loc[stats['FT%'].isnull(), ['Player', 'FT', 'FT%']].sample(20)

Unnamed: 0,Player,FT,FT%
74,John Coker,0.0,
82,Adrian Caldwell,0.0,
96,Bruno Šundov,0.0,
134,Jamal Robinson,0.0,
138,A.J. Bramlett,0.0,
141,Benoit Benjamin,0.0,
208,A.J. Guyton,0.0,
218,Guy Rucker,0.0,
225,Ben Bentil,0.0,
328,Loren Woods,0.0,


Hypothesis confirmed. We'll replace the missing values with 0.

In [54]:
# Replace every remaining missing value with 0, then confirm that no column
# has any missing values left.
stats = stats.fillna(value=0)
stats.isna().sum()

Player     0
Pos        0
Age        0
G          0
GS         0
MP         0
FG         0
FGA        0
FG%        0
3P         0
3PA        0
3P%        0
2P         0
2PA        0
2P%        0
eFG%       0
FT         0
FTA        0
FT%        0
ORB        0
DRB        0
TRB        0
AST        0
STL        0
BLK        0
TOV        0
PF         0
PTS        0
Year       0
Pts Won    0
Pts Max    0
Share      0
Team       0
W          0
L          0
W/L%       0
GB         0
PS/G       0
PA/G       0
SRS        0
dtype: int64

In [55]:
# Re-check column dtypes and non-null counts after cleaning.
stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13014 entries, 0 to 13013
Data columns (total 40 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Player   13014 non-null  object 
 1   Pos      13014 non-null  object 
 2   Age      13014 non-null  float64
 3   G        13014 non-null  float64
 4   GS       13014 non-null  float64
 5   MP       13014 non-null  float64
 6   FG       13014 non-null  float64
 7   FGA      13014 non-null  float64
 8   FG%      13014 non-null  float64
 9   3P       13014 non-null  float64
 10  3PA      13014 non-null  float64
 11  3P%      13014 non-null  float64
 12  2P       13014 non-null  float64
 13  2PA      13014 non-null  float64
 14  2P%      13014 non-null  float64
 15  eFG%     13014 non-null  float64
 16  FT       13014 non-null  float64
 17  FTA      13014 non-null  float64
 18  FT%      13014 non-null  float64
 19  ORB      13014 non-null  float64
 20  DRB      13014 non-null  float64
 21  TRB      130

### Training a Machine Learning Model

We'll be using all numeric columns as predictors for our ML model. We'll also be removing the columns that give the algorithms the exact information we're trying to predict, i.e. Pts Won, Pts Max & Share (our target variable).

In [57]:
# Start from every column, then strip the identifier columns and the columns
# that reveal the target (Pts Won, Pts Max, Share).
excluded_columns = ['Player', 'Pos', 'Team', 'Pts Won', 'Pts Max', 'Share']
predictors = list(stats.columns)
for column in excluded_columns:
    predictors.remove(column)
predictors

['Age',
 'G',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 'Year',
 'W',
 'L',
 'W/L%',
 'GB',
 'PS/G',
 'PA/G',
 'SRS']

### Machine Learning

**Develop Train and Test Sets**

The data is split by year: the training set contains all seasons before 2021, and the remaining seasons (2021 onward) form the test set.

In [58]:
# Split on season: rows with Year below 2021 train the model, rows with Year
# of 2021 or later are held out for testing.
train = stats[stats['Year'].lt(2021)]
test = stats[stats['Year'].ge(2021)]

**Build 1st Model**

In [61]:
# Ridge regression with regularisation strength alpha = 0.1, fitted on the
# training predictors against the Share target.
reg = Ridge(alpha=0.1)
reg.fit(X=train[predictors], y=train['Share'])

In [62]:
# Predict Share for the held-out rows with the fitted model.
predictions = reg.predict(test[predictors])

In [67]:
# Collect actual vs. predicted Share per test row, with the residual
# (actual - predicted), ordered from highest to lowest actual Share.
test_pred = (
    pd.DataFrame(index=test.index)
    .assign(
        player=test['Player'],
        y_test=test['Share'],
        y_pred=predictions,
        residuals=test['Share'] - predictions,
    )
    .sort_values(by='y_test', ascending=False)
)
test_pred

Unnamed: 0,player,y_test,y_pred,residuals
10057,Nikola Jokić,0.961,0.160897,0.800103
523,Nikola Jokić,0.875,0.201656,0.673344
642,Joel Embiid,0.706,0.201924,0.504076
9825,Giannis Antetokounmpo,0.595,0.233275,0.361725
7364,Joel Embiid,0.58,0.171261,0.408739
3041,Stephen Curry,0.449,0.14963,0.29937
8444,Giannis Antetokounmpo,0.345,0.215697,0.129303
1170,Devin Booker,0.216,0.100563,0.115437
9632,Luka Dončić,0.146,0.171661,-0.025661
1151,Chris Paul,0.138,0.077804,0.060196


**Evaluate Model**

In [68]:
# Mean squared error between actual and predicted Share on the test rows.
mean_squared_error(y_true=test_pred['y_test'], y_pred=test_pred['y_pred'])

0.0028129088432197386

Using the MSE as a metric for this model is not useful given that more than half of the values in the test target are 0, and we're trying to make predictions for values greater than 0.

We'll rank players twice, once by their actual Share and once by their predicted Share, so that the two rankings can be compared.

In [69]:
# Number the rows 1..N in their current order. This yields the actual rank
# only because test_pred is currently sorted by y_test, descending.
# NOTE(review): ranks run across the whole test set, not per season.
row_count = len(test_pred)
test_pred['Rk'] = list(range(1, row_count + 1))
test_pred.head(n=10)

Unnamed: 0,player,y_test,y_pred,residuals,Rk
10057,Nikola Jokić,0.961,0.160897,0.800103,1
523,Nikola Jokić,0.875,0.201656,0.673344,2
642,Joel Embiid,0.706,0.201924,0.504076,3
9825,Giannis Antetokounmpo,0.595,0.233275,0.361725,4
7364,Joel Embiid,0.58,0.171261,0.408739,5
3041,Stephen Curry,0.449,0.14963,0.29937,6
8444,Giannis Antetokounmpo,0.345,0.215697,0.129303,7
1170,Devin Booker,0.216,0.100563,0.115437,8
9632,Luka Dončić,0.146,0.171661,-0.025661,9
1151,Chris Paul,0.138,0.077804,0.060196,10


In [71]:
# Reorder by predicted Share (highest first) and number the rows 1..N in
# that order to obtain the predicted rank.
test_pred = test_pred.sort_values('y_pred', ascending=False)
row_count = len(test_pred)
test_pred['Predicted_Rk'] = list(range(1, row_count + 1))
test_pred.head(n=5)

Unnamed: 0,player,y_test,y_pred,residuals,Rk,Predicted_Rk
9825,Giannis Antetokounmpo,0.595,0.233275,0.361725,4,1
8444,Giannis Antetokounmpo,0.345,0.215697,0.129303,7,2
642,Joel Embiid,0.706,0.201924,0.504076,3,3
523,Nikola Jokić,0.875,0.201656,0.673344,2,4
6773,LeBron James,0.001,0.171982,-0.170982,24,5


In [72]:
# Restore the ordering by actual Share (highest first) so Rk and
# Predicted_Rk can be compared side by side.
test_pred = test_pred.sort_values('y_test', ascending=False)
test_pred

Unnamed: 0,player,y_test,y_pred,residuals,Rk,Predicted_Rk
10057,Nikola Jokić,0.961,0.160897,0.800103,1,8
523,Nikola Jokić,0.875,0.201656,0.673344,2,4
642,Joel Embiid,0.706,0.201924,0.504076,3,3
9825,Giannis Antetokounmpo,0.595,0.233275,0.361725,4,1
7364,Joel Embiid,0.58,0.171261,0.408739,5,7
3041,Stephen Curry,0.449,0.14963,0.29937,6,12
8444,Giannis Antetokounmpo,0.345,0.215697,0.129303,7,2
1170,Devin Booker,0.216,0.100563,0.115437,8,31
9632,Luka Dončić,0.146,0.171661,-0.025661,9,6
1151,Chris Paul,0.138,0.077804,0.060196,10,56


We'll use a custom metric called average precision.