# Machine Learning

In this part we train and test machine learning models for NBA MVP prediction.

In [1]:
import pandas as pd

In [2]:
# Read the player / MVP statistics table from the CSV in the working directory.
csv_path = "player_mvp_stats.csv"
stats = pd.read_csv(csv_path)

In [3]:
# Show the freshly loaded frame (pandas elides the middle rows and columns).
stats

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
1,1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
2,2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
3,3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
4,4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14087,14087,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,...,0.0,0.0,Milwaukee Bucks,42.0,40.0,0.512,9.0,103.6,103.8,-0.45
14088,14088,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,...,0.0,0.0,Milwaukee Bucks,42.0,40.0,0.512,9.0,103.6,103.8,-0.45
14089,14089,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,...,0.0,0.0,Milwaukee Bucks,42.0,40.0,0.512,9.0,103.6,103.8,-0.45
14090,14090,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,...,0.0,0.0,Milwaukee Bucks,42.0,40.0,0.512,9.0,103.6,103.8,-0.45


In [4]:
# Drop the leftover index column that was written into the CSV.
# `drop(..., errors="ignore")` returns a new frame and is a no-op when the
# column is already gone, so re-running this cell no longer raises KeyError
# the way the in-place `del` did.
stats = stats.drop(columns=["Unnamed: 0"], errors="ignore")

Most machine learning models cannot work with missing values, so let's check whether the data contains any.

In [5]:
# Boolean frame of the same shape as `stats`: True wherever a value is missing.
stats.isna()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14087,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14088,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14089,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14090,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


`False` means that the value is not null.

An easier way to see this is to count the nulls in each column with `sum()`.

In [6]:
# Count the missing values in every column.
null_mask = stats.isna()
null_mask.sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          50
3P            0
3PA           0
3P%        2042
2P            0
2PA           0
2P%          84
eFG%         50
FT            0
FTA           0
FT%         462
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W           540
L           540
W/L%        540
GB          540
PS/G        540
PA/G        540
SRS         540
dtype: int64

It looks like some of the values are null because they are percentages, which might mean that the player never attempted that kind of shot. Let's check this hypothesis.

In [8]:
# Rows with no 3P% value, shown next to their three-point attempts (3PA).
missing_3p_pct = stats["3P%"].isna()
stats[missing_3p_pct][["Player", "3PA"]]

Unnamed: 0,Player,3PA
2,Elden Campbell,0.0
3,Irving Thomas,0.0
18,Jack Haley,0.0
20,Keith Owens,0.0
30,Benoit Benjamin,0.0
...,...,...
14061,Evan Eschmeyer,0.0
14062,Gheorghe Mureșan,0.0
14064,Jim McIlvaine,0.0
14070,Mark Hendrickson,0.0


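To solve this, we will change all the missing values to 0. Note that the fill is applied to the whole table, so the 540 rows with missing team statistics (W, L, W/L%, GB, PS/G, PA/G, SRS) are also set to 0.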

In [9]:
# Replace every missing value in the table with 0.
stats = stats.fillna(value=0)

### Training ML models

In [35]:
# List the available column names to choose predictors from.
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

We only want the numerical columns, and we don't want the ones that come from the MVP vote itself (`Share`, `Pts Won`, `Pts Max`), since `Share` is what we are trying to predict.

In [36]:
# Numeric feature columns fed to the model. The MVP-vote columns
# (Pts Won, Pts Max, Share) and the text columns are left out.
predictors = [
    # player profile and playing time
    "Age", "G", "GS", "MP",
    # shooting
    "FG", "FGA", "FG%",
    "3P", "3PA", "3P%",
    "2P", "2PA", "2P%",
    "eFG%",
    "FT", "FTA", "FT%",
    # rebounds and other box-score stats
    "ORB", "DRB", "TRB",
    "AST", "STL", "BLK", "TOV", "PF", "PTS",
    # season
    "Year",
    # team record and ratings
    "W", "L", "W/L%", "GB", "PS/G", "PA/G", "SRS",
]

In [37]:
# Every season before 2021 is used as training data.
is_training_year = stats["Year"] < 2021
train = stats[is_training_year]

In [38]:
# The 2021 season is held out to test the model.
is_test_year = stats["Year"] == 2021
test = stats[is_test_year]

In [39]:
from sklearn.linear_model import Ridge

# Ridge regression is chosen to avoid overfitting; alpha is its
# regularization strength.
reg = Ridge(alpha=0.1)

In [40]:
# Fit the model: predictor columns as features, MVP vote share as the target.
X_train = train[predictors]
y_train = train["Share"]
reg.fit(X_train, y_train)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [41]:
# Predict the vote share for every player in the held-out season.
X_test = test[predictors]
predictions = reg.predict(X_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [42]:
# Wrap the raw predictions in a one-column frame that shares the test set's
# index, so the rows can be aligned with the test rows afterwards.
predictions = pd.DataFrame(
    data=predictions,
    index=test.index,
    columns=["predictions"],
)

In [43]:
# Show the predicted shares, indexed like the test rows.
predictions

Unnamed: 0,predictions
630,0.087907
631,0.060584
632,0.076754
633,0.069919
634,0.085074
...,...
13897,0.066516
13898,0.067512
13899,0.095511
13900,0.058653


In [44]:
# Put each player's actual share side by side with the predicted share
# (the two frames are aligned on the row index).
actual = test[["Player", "Share"]]
combination = pd.concat([actual, predictions], axis="columns")

In [45]:
# Show actual vs. predicted share for the test season.
combination

Unnamed: 0,Player,Share,predictions
630,Aaron Gordon,0.0,0.087907
631,Austin Rivers,0.0,0.060584
632,Bol Bol,0.0,0.076754
633,Facundo Campazzo,0.0,0.069919
634,Greg Whittington,0.0,0.085074
...,...,...,...
13897,Patty Mills,0.0,0.066516
13898,Quinndary Weatherspoon,0.0,0.067512
13899,Rudy Gay,0.0,0.095511
13900,Tre Jones,0.0,0.058653


In [46]:
# Rank the players by their actual share, highest first, and show the top ten.
ranked = combination.sort_values(by="Share", ascending=False)
ranked.head(10)

Unnamed: 0,Player,Share,predictions
630,Aaron Gordon,0.0,0.087907
9912,Justin Jackson,0.0,0.068981
10670,Gordon Hayward,0.0,0.119075
10669,Devonte' Graham,0.0,0.093266
10668,Cody Zeller,0.0,0.092278
10667,Cody Martin,0.0,0.060236
10666,Caleb Martin,0.0,0.072745
10665,Brad Wanamaker,0.0,0.06918
10664,Bismack Biyombo,0.0,0.056324
9919,Thanasis Antetokounmpo,0.0,0.064415


In [60]:
# Show the full stats frame again (after the null fill) for inspection.
stats

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.340,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14087,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,0.484,...,0.0,0.0,Milwaukee Bucks,42.0,40.0,0.512,9.0,103.6,103.8,-0.45
14088,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,0.286,...,0.0,0.0,Milwaukee Bucks,42.0,40.0,0.512,9.0,103.6,103.8,-0.45
14089,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,0.470,...,0.0,0.0,Milwaukee Bucks,42.0,40.0,0.512,9.0,103.6,103.8,-0.45
14090,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,0.459,...,0.0,0.0,Milwaukee Bucks,42.0,40.0,0.512,9.0,103.6,103.8,-0.45


In [67]:
# List the distinct Share values present in the test set.
test_shares = test["Share"]
test_shares.unique()

array([0.])

In [71]:
# Select the rows belonging to the 2021 season.
in_2021 = stats["Year"] == 2021
stats_2021 = stats[in_2021]

In [74]:
# Keep only the columns needed to inspect the 2021 vote shares.
share_columns = ["Player", "Year", "Share"]
stats_2021_subset = stats_2021[share_columns]

In [75]:
# Show player, year and share for the 2021 rows.
stats_2021_subset

Unnamed: 0,Player,Year,Share
630,Aaron Gordon,2021,0.0
631,Austin Rivers,2021,0.0
632,Bol Bol,2021,0.0
633,Facundo Campazzo,2021,0.0
634,Greg Whittington,2021,0.0
...,...,...,...
13897,Patty Mills,2021,0.0
13898,Quinndary Weatherspoon,2021,0.0
13899,Rudy Gay,2021,0.0
13900,Tre Jones,2021,0.0


In [76]:
# Write the 2021 share table to a CSV file so it can be inspected outside the notebook.
shares_csv = "2021 shares.csv"
stats_2021_subset.to_csv(shares_csv)

In [94]:
# Check the Share values for the 2020 season, for comparison with 2021.
# The original filter selected Year == 2021 despite the *_2020 names, so it
# only repeated the 2021 check; the filter now matches the variable names.
stats_2020 = stats[stats["Year"] == 2020]
stats_2020_subset = stats_2020[["Player", "Year", "Share"]]
stats_2020_subset["Share"].unique()

array([0.])