In [1]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

In [2]:
#Loading Data

# Read the combined dataset from the CSV file in the working directory.
combined = pd.read_csv('combined.csv')

# Jupyter renders only a cell's last expression automatically, so the top-10
# table is passed to display() explicitly; otherwise it is computed and discarded.
display(
    combined[['season', 'player', 'mvp_share']]
    .sort_values('mvp_share', ascending=False)
    .head(10)
)
combined.dtypes

seas_id            int64
season             int64
player_id          int64
player            object
pos               object
age              float64
tm_x              object
g                  int64
mp_per_game      float64
pts_per_game     float64
fg_per_game      float64
fga_per_game     float64
fg_percent       float64
x3p_per_game     float64
x3pa_per_game    float64
x3p_percent      float64
e_fg_percent     float64
ft_per_game      float64
fta_per_game     float64
orb_per_game     float64
drb_per_game     float64
trb_per_game     float64
ast_per_game     float64
stl_per_game     float64
blk_per_game     float64
pf_per_game      float64
pts_won          float64
pts_max          float64
mvp_share        float64
mvp              float64
team              object
w                float64
l                float64
lg                object
playoffs           int64
tm_y              object
tov_percent      float64
usg_percent      float64
ows              float64
dws              float64


In [3]:
#Setting Predictor Variables

# Feature columns handed to every model in this notebook. The order is kept
# exactly as first written because it fixes the column order of the matrices
# passed to fit() and predict().
predictors = [
    'age', 'g',
    'mp_per_game', 'pts_per_game',
    'fg_per_game', 'fga_per_game', 'fg_percent',
    'x3p_per_game', 'x3pa_per_game', 'x3p_percent',
    'e_fg_percent',
    'ft_per_game', 'fta_per_game',
    'orb_per_game', 'drb_per_game', 'trb_per_game',
    'ast_per_game', 'stl_per_game', 'blk_per_game', 'pf_per_game',
    'w', 'l', 'playoffs',
    'tov_percent', 'usg_percent',
    'ows', 'dws', 'ws',
    'obpm', 'dbpm', 'bpm', 'vorp',
]

In [4]:
#Train test splits, using old season data to predict current season results

# Rows from every season before 2023 are used for fitting; the 2023 rows alone are held out.
train = combined.loc[combined['season'].lt(2023)]
test = combined.loc[combined['season'].eq(2023)]

In [None]:
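#Scaling is not applied here because we are working with season-by-season (time-ordered) individual player statistics, which are kept on their original scales.
#Note: the Ridge penalty is sensitive to feature scale, so standardizing the predictors is worth testing.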

#Dummy variables are also not required as all the data used for prediction is numerical

In [5]:
#Using a Ridge Regression model to avoid overfitting

# alpha=0.1 is the regularisation strength given to Ridge. The model is fitted on the
# predictor columns of the training rows, with mvp_share as the target.
reg = Ridge(alpha=0.1)
reg.fit(X=train[predictors], y=train['mvp_share'])

In [6]:
# Predicted mvp_share for every row of the held-out season, in the row order of `test`.
predictions = reg.predict(X=test[predictors])

In [7]:
# Wrap the predictions in a one-column DataFrame that carries the index of the test rows,
# so they can be joined column-wise with the test data in the next step.
# Note: this rebinds `predictions` from the predict() output to a DataFrame.
predictions = pd.DataFrame(predictions, columns=['predictions'], index=test.index)
predictions

Unnamed: 0,predictions
15472,0.117366
15473,0.105078
15474,-0.033703
15475,0.012667
15476,-0.019851
...,...
19375,0.000183
19376,-0.018290
19377,-0.003213
19378,-0.000473


In [8]:
#Initial results of the Ridge Regression model. Creating columns to see the predicted rank vs the actual rank based on MVP share

# Steps, in order:
#   1. keep rows with a positive prediction and number them by descending prediction;
#   2. of those, keep rows with a positive mvp_share and number them by descending mvp_share;
#   3. record the absolute gap between the two ranks.
# NOTE(review): because step 2 runs after step 1, actual_rank only counts players whose
# prediction was positive, and players without votes are dropped after being ranked.
combined_test = (
    pd.concat([test[['player', 'mvp_share']], predictions], axis=1)
    .loc[lambda frame: frame['predictions'] > 0]
    .sort_values('predictions', ascending=False)
    .assign(predicted_rank=lambda frame: list(range(1, len(frame) + 1)))
    .loc[lambda frame: frame['mvp_share'] > 0]
    .sort_values('mvp_share', ascending=False)
    .assign(actual_rank=lambda frame: list(range(1, len(frame) + 1)))
    .loc[:, ['player', 'mvp_share', 'predictions', 'actual_rank', 'predicted_rank']]
    .assign(difference=lambda frame: (frame['predicted_rank'] - frame['actual_rank']).abs())
)

# Show the table ordered by the model's ranking.
combined_test.sort_values('predicted_rank')

Unnamed: 0,player,mvp_share,predictions,actual_rank,predicted_rank,difference
17416,Nikola Jokić,0.674,0.222874,2,1,1
17688,Giannis Antetokounmpo,0.606,0.212725,3,2,1
17967,Luka Dončić,0.01,0.199196,8,3,5
16878,Joel Embiid,0.915,0.185514,1,4,3
19366,Shai Gilgeous-Alexander,0.046,0.142476,5,5,0
17225,Jayson Tatum,0.28,0.135256,4,6,2
19019,Domantas Sabonis,0.027,0.132395,7,7,0
19228,Ja Morant,0.001,0.11791,12,8,4
15643,Jimmy Butler,0.003,0.114275,10,11,1
18144,Donovan Mitchell,0.03,0.093585,6,14,8


In [9]:
#Creating metrics to evaluate if the model correctly predicted the rank of the players receiving MVP votes

def precision(combined_test, top_n=5):
    """Score one season's ranked predictions against the actual MVP voting.

    Parameters
    ----------
    combined_test : pandas.DataFrame
        Must contain the columns 'player', 'mvp_share', 'predictions' and
        'difference' (the absolute gap between predicted and actual rank).
    top_n : int, default 5
        Number of highest-ranked players to evaluate. The default reproduces
        the original top-five metrics.

    Returns
    -------
    list
        ``[correct, total, share]`` where, over the ``top_n`` rows with the
        highest 'predictions':
        * ``correct`` is the number of rows whose 'difference' is 0,
        * ``total`` is the sum of their 'difference' values,
        * ``share`` is the number of those players who are also among the
          ``top_n`` rows with the highest 'mvp_share', divided by ``top_n``.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be a positive integer")

    actual_top = combined_test.sort_values('mvp_share', ascending=False).head(top_n)
    predicted_top = combined_test.sort_values('predictions', ascending=False).head(top_n)

    # Plain Python scalars keep the returned list free of numpy types.
    rank_gaps = predicted_top['difference'].tolist()
    correct = sum(gap == 0 for gap in rank_gaps)
    total = sum(rank_gaps)

    # The divisor is always top_n, even when fewer than top_n rows are available.
    overlap = int(predicted_top['player'].isin(actual_top['player']).sum())

    return [correct, total, overlap / top_n]
    
#     return print("Correctly Predicted Rank:",correct,
#                  "\nTotal Difference For Top Five Predicted Vote Getters:", total, 
#                  "\nPercentage correct in Top Five:", format(p_correct/5, ".0%"))


In [10]:
# Result is [exact-rank matches, summed rank difference, fraction of the predicted top five
# who are also in the actual top five] for the 2023 table built above.
precision(combined_test)

[1, 10, 0.8]

In [15]:
# Season years 1980 through 2023 inclusive (the upper bound of range() is exclusive).
seasons = [*range(1980, 2024)]

In [22]:
#Looping over all years

def history(stats, model, seasons, predictors):
    """Evaluate `model` season by season, training only on earlier seasons.

    For each evaluated season the model is refitted on every row of `stats`
    from an earlier season, used to predict that season's 'mvp_share', and
    scored with `precision` (defined elsewhere in this notebook).

    Parameters
    ----------
    stats : pandas.DataFrame
        Data containing 'season', 'player', 'mvp_share' and all `predictors`.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place, so after the call it is left fitted on the data for the last
        evaluated season.
    seasons : sequence of int
        Candidate seasons. The first five entries are skipped and are never
        used as a test season.
    predictors : list of str
        Feature column names.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated season with the columns 'correct',
        'total_diff_top_five' and 'p_correct_top_5'. The frame is empty when
        `seasons` has five entries or fewer.
    """
    metric_columns = ['correct', 'total_diff_top_five', 'p_correct_top_5']
    season_metrics = []

    for season in seasons[5:]:
        # Use the frame passed in as `stats`, not the notebook-level `combined`.
        train = stats[stats['season'] < season]
        test = stats[stats['season'] == season]

        model.fit(train[predictors], train['mvp_share'])
        predictions = pd.DataFrame(
            model.predict(test[predictors]),
            columns=['predictions'],
            index=test.index,
        )

        combined_test = pd.concat([test[['player', 'mvp_share']], predictions], axis=1)

        # Rank by prediction among rows with a positive prediction.
        combined_test = combined_test[combined_test['predictions'] > 0].sort_values('predictions', ascending=False)
        combined_test['predicted_rank'] = list(range(1, combined_test.shape[0] + 1))

        # NOTE(review): actual_rank is assigned after the prediction filter above, so it
        # only counts vote getters whose prediction was positive.
        combined_test = combined_test[combined_test['mvp_share'] > 0].sort_values('mvp_share', ascending=False)
        combined_test['actual_rank'] = list(range(1, combined_test.shape[0] + 1))

        combined_test = combined_test[['player', 'mvp_share', 'predictions', 'actual_rank', 'predicted_rank']]
        combined_test['difference'] = abs(combined_test['predicted_rank'] - combined_test['actual_rank'])

        season_metrics.append(precision(combined_test))

    # Build the result once, after the loop, so it is defined even with no seasons to evaluate.
    return pd.DataFrame(season_metrics, columns=metric_columns)

In [31]:
#Top 5 metrics per season for the Ridge model
# seasons[30:] starts at 2010 and history() skips the first five seasons it is given,
# so the seasons evaluated are 2015-2023 (one output row per season).

reg_pred = history(combined, reg, seasons[30:], predictors)
reg_pred

Unnamed: 0,correct,total_diff_top_five,p_correct_top_5
0,0,9,0.8
1,1,11,0.8
2,2,7,0.6
3,0,8,0.8
4,0,11,0.8
5,2,6,0.8
6,1,12,0.6
7,1,8,0.8
8,1,10,0.8


In [32]:
#Average of each metric across the evaluated seasons (a mean, not a running total)

reg_pred[['correct', 'total_diff_top_five', 'p_correct_top_5']].mean()

correct                0.888889
total_diff_top_five    9.111111
p_correct_top_5        0.755556
dtype: float64

In [25]:
#Testing against random forest model

# random_state is fixed so the fitted forest is repeatable between runs.
rf = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=5,
    random_state=1,
)


In [33]:
# Same season-by-season evaluation as for the Ridge model (identical seasons and predictors),
# now with the random forest.
rf_pred = history(combined, rf, seasons[30:], predictors)

In [36]:
# Per-season metrics for the random forest, one row per evaluated season.
rf_pred

Unnamed: 0,correct,total_diff_top_five,p_correct_top_5
0,2,5,0.8
1,1,6,1.0
2,0,14,0.6
3,3,5,0.8
4,2,8,0.8
5,0,7,0.8
6,2,7,0.8
7,1,8,0.8
8,3,5,0.8


In [37]:
# Average of each metric across the evaluated seasons for the random forest.
rf_pred[['correct', 'total_diff_top_five', 'p_correct_top_5']].mean()

correct                1.555556
total_diff_top_five    7.222222
p_correct_top_5        0.800000
dtype: float64

In [None]:
#Across the evaluated seasons, the random forest regressor was more accurate than the Ridge regressor on all three metrics: on average it predicted more ranks exactly,
# had a smaller total rank difference among its top five predicted vote getters, and placed more of the actual top five vote getters in its predicted top five.