In [2]:
import pandas as pd

In [3]:
stats = pd.read_csv("player_mvp_stats.csv")

In [4]:
stats

Unnamed: 0.1,Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,...,Pts Won,Pts Max,Share,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,A.C. Green,27.0,Los Angeles Lakers,PF,82.0,21.0,26.4,3.1,6.6,...,0.0,0.0,0.000,58.0,24.0,0.707,5.0,106.3,99.6,6.73
1,1,Byron Scott,29.0,Los Angeles Lakers,SG,82.0,82.0,32.1,6.1,12.8,...,0.0,0.0,0.000,58.0,24.0,0.707,5.0,106.3,99.6,6.73
2,2,Elden Campbell,22.0,Los Angeles Lakers,PF,52.0,0.0,7.3,1.1,2.4,...,0.0,0.0,0.000,58.0,24.0,0.707,5.0,106.3,99.6,6.73
3,3,Irving Thomas,25.0,Los Angeles Lakers,PF,26.0,0.0,4.2,0.7,1.9,...,0.0,0.0,0.000,58.0,24.0,0.707,5.0,106.3,99.6,6.73
4,4,James Worthy,29.0,Los Angeles Lakers,SF,78.0,74.0,38.6,9.2,18.7,...,0.0,0.0,0.000,58.0,24.0,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14115,14115,Luka DonÄiÄ,,,,,,,,,...,146.0,1000.0,0.146,,,,,,,
14116,14116,Nikola JokiÄ,,,,,,,,,...,674.0,1000.0,0.674,,,,,,,
14117,14117,Luka DonÄiÄ,,,,,,,,,...,10.0,1000.0,0.010,,,,,,,
14118,14118,Nikola JokiÄ,,,,,,,,,...,926.0,990.0,0.935,,,,,,,


In [5]:
del stats["Unnamed: 0"]

In [7]:
# returning null items
pd.isnull(stats).sum()
# all % areas have higher numbers

Player         0
Age           24
Team          24
Pos           24
G             24
GS            24
MP            24
FG            24
FGA           24
FG%           89
3P            24
3PA           24
3P%         1941
2P            24
2PA           24
2P%          133
eFG%          89
FT            24
FTA           24
FT%          566
ORB           24
DRB           24
TRB           24
AST           24
STL           24
BLK           24
TOV           24
PF            24
PTS           24
Awards     12336
Year           0
Pts Won        0
Pts Max        0
Share          0
W             24
L             24
W/L%          24
GB            24
PS/G          24
PA/G          24
SRS           24
dtype: int64

In [9]:
# gets items where 3P% is null
stats[pd.isnull(stats["3P%"])][["Player", "3PA"]]

Unnamed: 0,Player,3PA
2,Elden Campbell,0.0
3,Irving Thomas,0.0
17,Jack Haley,0.0
19,Keith Owens,0.0
32,James Edwards,0.0
...,...,...
14115,Luka DonÄiÄ,
14116,Nikola JokiÄ,
14117,Luka DonÄiÄ,
14118,Nikola JokiÄ,


In [10]:
# gets items where FT% is null
stats[pd.isnull(stats["FT%"])][["Player", "FTA"]]

Unnamed: 0,Player,FTA
74,John Coker,0.0
82,Adrian Caldwell,0.0
96,Bruno Šundov,0.0
134,Jamal Robinson,0.0
138,A.J. Bramlett,0.0
...,...,...
14115,Luka DonÄiÄ,
14116,Nikola JokiÄ,
14117,Luka DonÄiÄ,
14118,Nikola JokiÄ,


In [11]:
# fills in 0 for null values
stats = stats.fillna(0)

In [12]:
stats.columns

Index(['Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Awards',
       'Year', 'Pts Won', 'Pts Max', 'Share', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [13]:
# explicit list of predictors
# had to remove string columns and Points Won and Shares since we are predicting those
# goal: predict percentage share MVP votes that a player got in a given year
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'Year', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']

In [14]:
# make the training model
train = stats[stats["Year"] < 2024]

In [15]:
test = stats[stats["Year"] == 2024]

In [16]:
# Ridge is a form of linear regression that avoids overfitting by shrinking it
from sklearn.linear_model import Ridge

reg = Ridge(alpha=.1) # controls how much the coefficient is going to be shrunk to avoid overfitting

In [17]:
reg.fit(train[predictors], train["Share"])

In [18]:
predictions = reg.predict(test[predictors])

In [19]:
predictions= pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [20]:
predictions

Unnamed: 0,predictions
199,-0.000203
200,-0.021090
201,0.016795
202,0.008196
203,-0.015660
...,...
13339,0.015494
13340,0.027933
13341,0.067622
14118,0.139516


In [21]:
combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)

In [22]:
combination

Unnamed: 0,Player,Share,predictions
199,A.J. Green,0.000,-0.000203
200,Andre Jackson Jr.,0.000,-0.021090
201,Bobby Portis,0.000,0.016795
202,Brook Lopez,0.000,0.008196
203,Chris Livingston,0.000,-0.015660
...,...,...,...
13339,Terquavion Smith,0.000,0.015494
13340,Tobias Harris,0.000,0.027933
13341,Tyrese Maxey,0.000,0.067622
14118,Nikola JokiÄ,0.935,0.139516


In [24]:
# get the first 10 players with MVP votes
combination.sort_values("Share", ascending=False).head(10)
# the results says that Giannis has high prediction number, but we want to predict where a player will rank in the MVP voting

Unnamed: 0,Player,Share,predictions
14118,Nikola JokiÄ,0.935,0.139516
1149,Shai Gilgeous-Alexander,0.646,0.167554
14119,Luka DonÄiÄ,0.572,0.139516
206,Giannis Antetokounmpo,0.194,0.207161
12593,Jalen Brunson,0.143,0.099247
2461,Jayson Tatum,0.087,0.113114
7474,Anthony Edwards,0.018,0.088774
3623,Domantas Sabonis,0.003,0.086975
10889,Kevin Durant,0.001,0.098904
8937,Isaiah Stewart,0.0,-0.033107


In [None]:
# The next step to evaluate how accurate/good was the Ridge algorithm in predicting the MVP Share