**Using only players considered in MVP voting**

In [1]:
import pandas as pd

In [2]:
# Load the per-player season table (box-score stats, MVP vote share and
# team-record columns, as listed by `mvp_players.columns` further down).
stats = pd.read_csv("player_mvp_stats.csv")

# Season labels 1991 through 2023; the upper bound of range() is exclusive.
years = [*range(1991, 2024)]

In [3]:
# Remove the "Unnamed: 0" column (presumably a row index written out when the
# CSV was saved -- TODO confirm). Rebinding through drop(errors="ignore") keeps
# the cell idempotent: a plain `del` raises KeyError if the cell is run twice.
stats = stats.drop(columns=["Unnamed: 0"], errors="ignore")

In [4]:
# Count missing values per column. This has to be printed explicitly: a bare
# expression that is not the last line of a cell is evaluated and thrown away.
# Only columns that actually contain gaps are listed.
null_counts = stats.isnull().sum()
print(null_counts[null_counts > 0])

# Replace every missing value with 0. Note that text columns (Pos, Tm, Team)
# are filled with 0 as well, as the last row of the output shows.
stats_all_players = stats.fillna(0)
stats_all_players

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,PF,27.0,LAL,82.0,21.0,26.4,3.1,6.6,0.476,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
1,Byron Scott,SG,29.0,LAL,82.0,82.0,32.1,6.1,12.8,0.477,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
2,Elden Campbell,PF,22.0,LAL,52.0,0.0,7.3,1.1,2.4,0.455,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
3,Irving Thomas,PF,25.0,LAL,26.0,0.0,4.2,0.7,1.9,0.340,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
4,James Worthy,SF,29.0,LAL,78.0,74.0,38.6,9.2,18.7,0.492,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15233,Terrence Jones,PF,25.0,MIL,54.0,12.0,23.5,4.3,9.1,0.470,...,0.0,0.0,Milwaukee Bucks,42.0,40.0,0.512,9.0,103.6,103.8,-0.45
15234,Thon Maker,C,19.0,MIL,57.0,34.0,9.9,1.5,3.2,0.459,...,0.0,0.0,Milwaukee Bucks,42.0,40.0,0.512,9.0,103.6,103.8,-0.45
15235,Tony Snell,SG,25.0,MIL,80.0,80.0,29.2,3.1,6.8,0.455,...,0.0,0.0,Milwaukee Bucks,42.0,40.0,0.512,9.0,103.6,103.8,-0.45
15236,Alperen Sengun,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.000,...,0.0,0.0,0,0.0,0.0,0.000,0.0,0.0,0.0,0.00


In [5]:
# Restrict the data to rows with a strictly positive MVP vote share.
mvp_players = stats_all_players.loc[stats_all_players["Share"].gt(0)]
mvp_players.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP', 'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB',
       'PS/G', 'PA/G', 'SRS'],
      dtype='object')

**Modeling**

**Train/Test Split**

In [6]:
def train_test_split(year, df):
    """Split ``df`` chronologically around a single season.

    Rows whose ``Year`` is strictly earlier than ``year`` become the training
    set; rows whose ``Year`` equals ``year`` become the held-out set. Rows
    from later seasons end up in neither.

    Returns a 4-tuple, in this order:
        1. predictor columns of the training rows
        2. the ``Share`` column of the training rows (the target to fit on)
        3. predictor columns of the held-out rows
        4. the complete held-out rows (every column)
    """
    feature_cols = [
        'Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
        '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%',
        'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
        'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
        'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS',
        'PER', 'TS%', '3PAr', 'FTr',
        'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
        'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
    ]

    season = df['Year']
    earlier_rows = df[season < year]     # training seasons: strictly before `year`
    held_out_rows = df[season == year]   # the season being predicted

    return (
        earlier_rows[feature_cols],
        earlier_rows["Share"],
        held_out_rows[feature_cols],
        held_out_rows,
    )

**Function to run multiple models**

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np

In [8]:
def run_model(regressor, X_train, X_test, Y, test):
    """Fit ``regressor`` and score its predictions for one held-out season.

    The parameter names are kept for existing callers, but they are easy to
    misread, so their actual roles are:

    regressor : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
                fitted in place.
    X_train   : predictor columns of the training rows.
    X_test    : the training *target* (passed as ``y`` to ``fit``), not test
                features.
    Y         : predictor columns of the held-out rows (passed to ``predict``).
    test      : the held-out rows; must contain "Player" and "Share". Its
                index is reused as the index of the predictions.

    Returns ``(combinations, mse)``: ``combinations`` has the columns Player,
    Share, Predictions, Rank (1 = highest actual share) and Predicted_Rank
    (1 = highest prediction), ordered by descending prediction; ``mse`` is the
    mean squared error between Share and Predictions.
    """
    regressor.fit(X_train, X_test)

    predictions = pd.DataFrame(
        regressor.predict(Y), columns=["Predictions"], index=test.index
    )
    combinations = pd.concat([test[["Player", "Share"]], predictions], axis=1)

    # Number the rows by actual share first, then re-sort by prediction so the
    # returned frame is ordered by predicted share with both ranks attached.
    combinations = combinations.sort_values("Share", ascending=False)
    combinations["Rank"] = list(range(1, combinations.shape[0] + 1))
    combinations = combinations.sort_values("Predictions", ascending=False)
    combinations["Predicted_Rank"] = list(range(1, combinations.shape[0] + 1))

    mse = mean_squared_error(combinations["Share"], combinations["Predictions"])
    return combinations, mse

**Implementing backtesting for model selection**

In [9]:
def backtest(model, years, df=None):
    """Return the mean per-season MSE of ``model`` over ``years``.

    For each entry of ``years`` the data is split with ``train_test_split``
    (earlier seasons train, that season is held out), ``model`` is refitted
    through ``run_model`` and the resulting MSE is recorded.

    model : estimator passed to ``run_model``; the same instance is refitted
            for every season.
    years : iterable of season values to hold out, one at a time.
    df    : table to split. Defaults to the notebook-level ``mvp_players``
            frame, so existing ``backtest(model, years)`` calls are unchanged.

    Raises ZeroDivisionError when ``years`` is empty (nothing to average).
    """
    frame = mvp_players if df is None else df

    # Only the scalar errors are kept; the per-season result frames are not
    # part of the return value, so they are not accumulated.
    mse_lst = []
    for year in years:
        X_train, X_test, Y, test = train_test_split(year, frame)
        _, mse = run_model(model, X_train=X_train, X_test=X_test, Y=Y, test=test)
        mse_lst.append(mse)

    return sum(mse_lst) / len(mse_lst)

**Find average metric score across all seasons**

In [10]:
# years = [year for year in range(1992, 2024)]

# def run_model_average(df, regressor, print_metric = False):
#     mae_lst = []
#     r2_lst = []
    
#     for year in (years):
#         X_train, X_test, Y, test = train_test_split(year=year, df=df)
#         mvp, mae, r2, = run_model(regressor,
#                                  X_train, 
#                                  X_test, 
#                                  Y, 
#                                  test)
                                        
#         mae_lst.append(mae)
#         r2_lst.append(r2)
        
#     d = {
#     'year': years,
#     'MAE': mae_lst,
#     'R squared': r2_lst,
#     }

#     summary = pd.DataFrame(d)
#     avg_mae = summary['MAE'].mean()
#     avg_r2  = summary['R squared'].mean()
    
#     if print_metric == True:
#         print(f"Average MAE: {avg_mae}")
#         print(f"Average R squared: {avg_r2}")

#     return summary

**Defining a rank based metric**

In [11]:
# def find_ap(combination):
#     actual = combination.sort_values("Share", ascending = False).head(5)
#     predicted = combination.sort_values("Predictions", ascending = False)
#     ps = []
#     found = 0
#     seen = 1
#     for index,row in predicted.iterrows():
#         if row["Player"] in actual["Player"].values:
#             found +=1
#             ps.append(found / seen)
#         seen += 1

#     return sum(ps)/len(ps)        

**Setting year for train/test split**

In [12]:
# Hold out the 2015 season; all earlier seasons form the training data.
# Naming caveat (follows train_test_split's return order): X_test receives the
# training target (the "Share" column of the training rows), and Y receives the
# predictor columns of the 2015 rows.
X_train, X_test, Y, test = train_test_split(2015, mvp_players)

**Linear Regression Model**

In [13]:
# Single-season check: fit on the split prepared above and rank the held-out season.
mvp_linear_race, mse = run_model(LinearRegression(), X_train, X_test, Y, test)

# Average MSE over every held-out season from years[5] onward.
mean_linear_mse = backtest(model=LinearRegression(), years=years[5:])

mvp_linear_race

Unnamed: 0,Player,Share,Predictions,Rank,Predicted_Rank
6601,Stephen Curry,0.922,0.544157,1,1
14472,James Harden,0.72,0.456468,2,2
10374,Chris Paul,0.095,0.380078,6,3
4192,LeBron James,0.425,0.249202,3,4
6907,Russell Westbrook,0.271,0.224514,4,5
4473,Anthony Davis,0.156,0.151939,5,6
6596,Klay Thompson,0.001,0.127361,10,7
4833,LaMarcus Aldridge,0.005,0.073114,7,8
10371,Blake Griffin,0.002,0.054296,8,9
11177,Marc Gasol,0.002,0.029962,9,10


In [14]:
# Mean backtest MSE of the linear regression model (seasons years[5:]).
print(mean_linear_mse)

0.055889220717971276


**Ridge Regression Model**

In [15]:
# Single-season check: fit on the split prepared above and rank the held-out season.
mvp_ridge_race, mse = run_model(Ridge(), X_train, X_test, Y, test)

# Average MSE over every held-out season from years[5] onward.
mean_ridge_mse = backtest(model=Ridge(), years=years[5:])

mvp_ridge_race

Unnamed: 0,Player,Share,Predictions,Rank,Predicted_Rank
6601,Stephen Curry,0.922,0.504404,1,1
14472,James Harden,0.72,0.483812,2,2
10374,Chris Paul,0.095,0.344885,6,3
4192,LeBron James,0.425,0.277348,3,4
4473,Anthony Davis,0.156,0.216546,5,5
6907,Russell Westbrook,0.271,0.212088,4,6
6596,Klay Thompson,0.001,0.150143,10,7
4833,LaMarcus Aldridge,0.005,0.107499,7,8
10202,Tim Duncan,0.001,0.066208,12,9
11177,Marc Gasol,0.002,0.064845,9,10


In [16]:
# Mean backtest MSE of the ridge regression model (seasons years[5:]).
print(mean_ridge_mse)

0.039647664949490034


**Random Forest Regressor Model**

In [27]:
# A random forest is stochastic: without a fixed seed the table and the MSE
# below change on every run. random_state pins them so re-runs are comparable.
mvp_race_rf, mse = run_model(RandomForestRegressor(random_state=42), X_train, X_test, Y, test)
mean_race_rf_mse = backtest(RandomForestRegressor(random_state=42), years[5:])
mvp_race_rf

Unnamed: 0,Player,Share,Predictions,Rank,Predicted_Rank
6601,Stephen Curry,0.922,0.60541,1,1
14472,James Harden,0.72,0.48415,2,2
10374,Chris Paul,0.095,0.38868,6,3
6907,Russell Westbrook,0.271,0.27083,4,4
4473,Anthony Davis,0.156,0.18084,5,5
4192,LeBron James,0.425,0.12563,3,6
6596,Klay Thompson,0.001,0.09435,10,7
10194,Kawhi Leonard,0.001,0.03569,11,8
4833,LaMarcus Aldridge,0.005,0.02483,7,9
10371,Blake Griffin,0.002,0.01881,8,10


In [29]:
# Mean backtest MSE of the default random forest (seasons years[5:]).
mean_race_rf_mse

0.03614998556714165

**Building an XGBoost Model**

In [20]:
# Single-season check: fit on the split prepared above and rank the held-out season.
mvp_race_XGBoost, mse = run_model(
    regressor=XGBRegressor(),
    X_train=X_train,
    X_test=X_test,
    Y=Y,
    test=test,
)

# Average MSE over every held-out season from years[5] onward.
mean_race_XGBoost_mse = backtest(model=XGBRegressor(), years=years[5:])

mvp_race_XGBoost

Unnamed: 0,Player,Share,Predictions,Rank,Predicted_Rank
6601,Stephen Curry,0.922,0.774199,1,1
10374,Chris Paul,0.095,0.562043,6,2
14472,James Harden,0.72,0.302153,2,3
6907,Russell Westbrook,0.271,0.243687,4,4
4473,Anthony Davis,0.156,0.180141,5,5
6596,Klay Thompson,0.001,0.156092,10,6
4192,LeBron James,0.425,0.050245,3,7
10371,Blake Griffin,0.002,0.036718,8,8
11177,Marc Gasol,0.002,0.03472,9,9
4833,LaMarcus Aldridge,0.005,0.026104,7,10


In [21]:
# Mean backtest MSE of the default XGBoost regressor (seasons years[5:]).
mean_race_XGBoost_mse

0.04087379905890938

**Hyperparameter Tuning**

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
# Search space for tuning RandomForestRegressor with GridSearchCV.
param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    # 'auto' is no longer accepted by RandomForestRegressor in current
    # scikit-learn releases. For a regressor it meant "consider all features",
    # which is written as 1.0 and is also valid on older versions.
    'max_features': [1.0, 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
}

In [24]:
# model = RandomForestRegressor()
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv = 3, n_jobs = -1, verbose = 2)
# grid_search.fit(X_train, X_test) 

# grid_search.best_params_

In [25]:
# Tuned random-forest settings, defined once so the single-season run and the
# backtest cannot drift apart. (Presumably the winners of the grid search
# above -- TODO confirm.) random_state pins the otherwise stochastic fit so the
# reported MSE is the same on every run.
rf_tuned_params = dict(
    bootstrap=True,
    max_depth=30,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=400,
    random_state=42,
)

mvp_race_rf_optimized, mse = run_model(RandomForestRegressor(**rf_tuned_params), X_train, X_test, Y, test)
mean_race_rf_mse_optimized = backtest(RandomForestRegressor(**rf_tuned_params), years[5:])
mean_race_rf_mse_optimized

0.03333612377971234

In [30]:
#How to use backtest as scoring function: https://stackoverflow.com/questions/32401493/how-to-create-customize-your-own-scorer-function-in-scikit-learn