In [1]:
import pandas as pd

In [2]:
stats = pd.read_csv("player_mvp_stats.csv")

In [3]:
del stats["Unnamed: 0"]

In [4]:
pd.isnull(stats).sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          44
3P            0
3PA           0
3P%        1196
2P            0
2PA           0
2P%          86
eFG%         44
FT            0
FTA           0
FT%         421
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [5]:
stats = stats.fillna(0)

In [6]:
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [7]:
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']

In [8]:
train = stats[stats["Year"] < 2024]

In [76]:
train = train[train["G"] >= 60]

In [9]:
test = stats[stats["Year"] == 2024]

In [10]:
test = test[test["G"] >= 65]

In [11]:
from sklearn.linear_model import Ridge

reg = Ridge(alpha=.1)

In [12]:
reg.fit(train[predictors], train["Share"])

In [13]:
predictions = reg.predict(test[predictors])

In [14]:
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [15]:
predictions

Unnamed: 0,predictions
324,0.005909
326,-0.000891
328,0.040330
330,-0.017696
663,0.007484
...,...
10327,0.001898
10328,0.025768
10333,-0.001940
10336,0.028556


In [16]:
combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)

In [17]:
combination

Unnamed: 0,Player,Share,predictions
324,Bogdan Bogdanović,0.0,0.005909
326,Clint Capela,0.0,-0.000891
328,Dejounte Murray,0.0,0.040330
330,Garrison Mathews,0.0,-0.017696
663,Al Horford,0.0,0.007484
...,...,...,...
10327,Corey Kispert,0.0,0.001898
10328,Deni Avdija,0.0,0.025768
10333,Jordan Poole,0.0,-0.001940
10336,Kyle Kuzma,0.0,0.028556


In [18]:
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions
2598,Nikola Jokić,0.935,0.162281
7175,Shai Gilgeous-Alexander,0.646,0.154109
2266,Luka Dončić,0.572,0.189447
5717,Giannis Antetokounmpo,0.194,0.190317
6891,Jalen Brunson,0.143,0.09028
669,Jayson Tatum,0.087,0.110431
6040,Anthony Edwards,0.018,0.084207
8853,Domantas Sabonis,0.003,0.099053
8184,Kevin Durant,0.001,0.094596
7502,Jalen Suggs,0.0,0.010763


In [19]:
from sklearn.metrics import mean_squared_error

mean_squared_error(combination["Share"], combination["predictions"])

np.float64(0.00643757704298824)

In [20]:
combination["Share"].value_counts()

Share
0.000    175
0.087      1
0.572      1
0.935      1
0.194      1
0.018      1
0.143      1
0.646      1
0.001      1
0.003      1
Name: count, dtype: int64

In [21]:
combination = combination.sort_values("Share", ascending=False)
combination["Rk"] = list(range(1, combination.shape[0]+1))

In [22]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk
2598,Nikola Jokić,0.935,0.162281,1
7175,Shai Gilgeous-Alexander,0.646,0.154109,2
2266,Luka Dončić,0.572,0.189447,3
5717,Giannis Antetokounmpo,0.194,0.190317,4
6891,Jalen Brunson,0.143,0.09028,5
669,Jayson Tatum,0.087,0.110431,6
6040,Anthony Edwards,0.018,0.084207,7
8853,Domantas Sabonis,0.003,0.099053,8
8184,Kevin Durant,0.001,0.094596,9
7502,Jalen Suggs,0.0,0.010763,10


In [23]:
combination = combination.sort_values("predictions", ascending=False)
combination["Predicted_Rk"] = list(range(1, combination.shape[0]+1))

In [24]:
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
2598,Nikola Jokić,0.935,0.162281,1,3
7175,Shai Gilgeous-Alexander,0.646,0.154109,2,4
2266,Luka Dončić,0.572,0.189447,3,2
5717,Giannis Antetokounmpo,0.194,0.190317,4,1
6891,Jalen Brunson,0.143,0.09028,5,13
669,Jayson Tatum,0.087,0.110431,6,7
6040,Anthony Edwards,0.018,0.084207,7,14
8853,Domantas Sabonis,0.003,0.099053,8,8
8184,Kevin Durant,0.001,0.094596,9,9
2587,Christian Braun,0.0,-0.003782,109,121


In [25]:
def find_ap(combination):
    actual = combination.sort_values("Share", ascending=False).head(5)
    predicted = combination.sort_values("predictions", ascending=False)
    precision_scores = []
    found = 0
    seen = 1
    for index, row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            found += 1
            precision_scores.append(found/seen)
        seen += 1
    return sum(precision_scores) / len(precision_scores)

In [26]:
find_ap(combination)

0.876923076923077

In [27]:
years = list(range(2004,2025))

In [28]:
average_precision_scores = []
all_predictions = []
for year in years[5:]:
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]
    if year == 2024:
        test = test[test["G"] >= 65]
    reg.fit(train[predictors], train["Share"])
    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
    combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
    all_predictions.append(combination)
    average_precision_scores.append(find_ap(combination))

In [29]:
sum(average_precision_scores) / len(average_precision_scores)

0.7768049138361639

In [30]:
def add_ranks(combination):
    combination = combination.sort_values("Share", ascending=False)
    combination["Rk"] = list(range(1, combination.shape[0]+1))
    combination = combination.sort_values("predictions", ascending=False)
    combination["Predicted_Rk"] = list(range(1, combination.shape[0]+1))
    combination["Diff"] = combination["Rk"] - combination["Predicted_Rk"]
    return combination

In [31]:
ranking = add_ranks(all_predictions[1])
ranking[ranking["Rk"] < 6].sort_values("Diff", ascending=False)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
5130,Dwyane Wade,0.097,0.119979,5,2,3
1664,LeBron James,0.98,0.194826,1,1,0
7268,Dwight Howard,0.389,0.105992,4,5,-1
4411,Kobe Bryant,0.487,0.072113,3,12,-9
6925,Kevin Durant,0.495,0.072041,2,13,-11


In [32]:
def backtest(stats, model, year, predictors):
    average_precision_scores = []
    all_predictions = []
    for year in years[5:]:
        train = stats[stats["Year"] < year]
        test = stats[stats["Year"] == year]
        if year == 2024:
            test = test[test["G"] >= 65]
        model.fit(train[predictors], train["Share"])
        predictions = reg.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
        combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        average_precision_scores.append(find_ap(combination))
    return sum(average_precision_scores)/len(average_precision_scores), average_precision_scores, pd.concat(all_predictions)

In [33]:
mean_ap, aps,  all_predictions = backtest(stats, reg, years[5:], predictors)

In [34]:
mean_ap

0.7768049138361639

In [35]:
all_predictions[all_predictions["Rk"] <= 5].sort_values("Diff").head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
1381,Joakim Noah,0.258,0.040626,4,36,-32
8121,Chris Paul,0.138,0.075978,5,28,-23
4441,Kobe Bryant,0.291,0.058258,4,16,-12
8142,Devin Booker,0.216,0.083494,4,15,-11
6925,Kevin Durant,0.495,0.072041,2,13,-11
6700,Carmelo Anthony,0.393,0.071468,3,12,-9
4411,Kobe Bryant,0.487,0.072113,3,12,-9
6891,Jalen Brunson,0.143,0.09028,5,13,-8
7072,Paul George,0.352,0.118324,3,11,-8
4397,Kobe Bryant,0.577,0.091842,2,10,-8


In [36]:
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.070957,eFG%
29,0.036711,W/L%
18,0.026181,DRB
17,0.015103,ORB
25,0.011937,PTS
21,0.010838,STL
10,0.007246,2P
20,0.006922,AST
8,0.006288,3PA
22,0.005266,BLK


In [37]:
stat_ratios = stats[["PTS", "AST", "STL", "BLK", "3P", "Year"]].apply(lambda x: x/x.mean()) 

In [38]:
stat_ratios

Unnamed: 0,PTS,AST,STL,BLK,3P,Year
0,0.479914,0.162801,0.319312,0.744404,0.000000,0.994789
1,0.899839,1.573740,1.277249,0.496269,0.429431,0.994789
2,0.539903,1.302406,1.277249,1.240674,0.143144,0.994789
3,1.223781,0.434135,1.117593,0.992539,1.145151,0.994789
4,0.239957,0.271335,0.000000,0.744404,0.000000,0.994789
...,...,...,...,...,...,...
10338,1.403749,0.596936,0.638624,1.736943,0.286288,1.004717
10339,0.527905,0.434135,0.798280,0.992539,1.145151,1.004717
10340,0.599893,0.325601,0.478968,1.240674,0.000000,1.004717
10341,1.019817,0.705470,0.798280,1.736943,1.431438,1.004717


In [39]:
stats[["PTS_T", "AST_R", "STL_R", "BLK_R", "3P_R"]] = stat_ratios[["PTS", "AST", "STL", "BLK", "3P"]]

In [40]:
stats.head(5)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_T,AST_R,STL_R,BLK_R,3P_R
0,Alan Henderson,PF,31,ATL,6,0,11.3,1.7,3.5,0.476,...,0.341,33.0,92.8,97.5,-5.0,0.479914,0.162801,0.319312,0.744404,0.0
1,Bob Sura,SG,30,ATL,80,18,20.8,2.5,6.1,0.416,...,0.341,33.0,92.8,97.5,-5.0,0.899839,1.57374,1.277249,0.496269,0.429431
2,Boris Diaw,SG,21,ATL,76,37,25.3,1.8,4.1,0.447,...,0.341,33.0,92.8,97.5,-5.0,0.539903,1.302406,1.277249,1.240674,0.143144
3,Chris Crawford,PF,28,ATL,56,25,21.6,3.8,8.4,0.448,...,0.341,33.0,92.8,97.5,-5.0,1.223781,0.434135,1.117593,0.992539,1.145151
4,Hiram Fuller,PF,22,ATL,4,0,10.8,0.8,2.0,0.375,...,0.341,33.0,92.8,97.5,-5.0,0.239957,0.271335,0.0,0.744404,0.0


In [41]:
predictors += ["PTS_T", "AST_R", "STL_R", "BLK_R", "3P_R"]

In [42]:
mean_ap, aps,  all_predictions = backtest(stats, reg, years[5:], predictors)

In [43]:
mean_ap

0.7768049138361639

In [44]:
stats["NPos"] = stats["Pos"].astype("category").cat.codes

In [45]:
stats["NTm"] = stats["Team"].astype("category").cat.codes

In [46]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,PS/G,PA/G,SRS,PTS_T,AST_R,STL_R,BLK_R,3P_R,NPos,NTm
0,Alan Henderson,PF,31,ATL,6,0,11.3,1.7,3.5,0.476,...,92.8,97.5,-5.0,0.479914,0.162801,0.319312,0.744404,0.0,2,0
1,Bob Sura,SG,30,ATL,80,18,20.8,2.5,6.1,0.416,...,92.8,97.5,-5.0,0.899839,1.57374,1.277249,0.496269,0.429431,12,0
2,Boris Diaw,SG,21,ATL,76,37,25.3,1.8,4.1,0.447,...,92.8,97.5,-5.0,0.539903,1.302406,1.277249,1.240674,0.143144,12,0
3,Chris Crawford,PF,28,ATL,56,25,21.6,3.8,8.4,0.448,...,92.8,97.5,-5.0,1.223781,0.434135,1.117593,0.992539,1.145151,2,0
4,Hiram Fuller,PF,22,ATL,4,0,10.8,0.8,2.0,0.375,...,92.8,97.5,-5.0,0.239957,0.271335,0.0,0.744404,0.0,2,0


In [49]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=500, random_state = 1, min_samples_split=2)

mean_ap, aps,  all_predictions = backtest(stats, rf, years, predictors)

In [50]:
mean_ap

0.812378212065712

In [51]:
mean_ap, aps,  all_predictions = backtest(stats, reg, years, predictors)

In [52]:
mean_ap

0.7768049138361639