In [166]:
import pandas as pd

from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor


In [167]:
# Load the combined stats table (player season stats, MVP voting and team record
# columns -- see the column listing a few cells below)
stats = pd.read_csv("../data cleaning/combined_stats_master.csv")

In [168]:
# Drop the unnamed first CSV column (presumably a saved row index -- confirm against
# the export step). errors="ignore" keeps this cell idempotent: re-running it after
# the column is already gone no longer raises KeyError.
stats = stats.drop(columns=["Unnamed: 0"], errors="ignore")
stats

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,Pts Won,Pts Max,Share,W,L,W/L%,GB,PS/G,PA/G,SRS
0,Doc Rivers,29,Atlanta Hawks,PG,79,79,32.7,5.6,12.9,0.435,...,0.0,0.0,0.00,43,39,0.524,18.0,109.8,109.0,0.72
1,Dominique Wilkins,31,Atlanta Hawks,SF,81,81,38.0,9.5,20.2,0.470,...,29.0,960.0,0.03,43,39,0.524,18.0,109.8,109.0,0.72
2,Duane Ferrell,25,Atlanta Hawks,SF,78,2,14.9,2.2,4.6,0.489,...,0.0,0.0,0.00,43,39,0.524,18.0,109.8,109.0,0.72
3,Gary Leonard,23,Atlanta Hawks,C,4,0,2.3,0.0,0.0,0.000,...,0.0,0.0,0.00,43,39,0.524,18.0,109.8,109.0,0.72
4,John Battle,28,Atlanta Hawks,SG,79,2,23.6,5.0,10.9,0.461,...,0.0,0.0,0.00,43,39,0.524,18.0,109.8,109.0,0.72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15803,Marvin Bagley III,24,Washington Wizards,C,50,25,21.1,4.8,8.2,0.586,...,0.0,0.0,0.00,15,67,0.183,32.0,113.7,123.0,-9.29
15804,Patrick Baldwin Jr.,21,Washington Wizards,SF,38,7,13.0,1.6,4.1,0.381,...,0.0,0.0,0.00,15,67,0.183,32.0,113.7,123.0,-9.29
15805,Richaun Holmes,30,Washington Wizards,C,40,10,13.9,2.1,3.7,0.558,...,0.0,0.0,0.00,15,67,0.183,32.0,113.7,123.0,-9.29
15806,Tristan Vukcevic,20,Washington Wizards,C,10,4,15.3,2.9,6.7,0.433,...,0.0,0.0,0.00,15,67,0.183,32.0,113.7,123.0,-9.29


In [169]:
# List the available columns to pick predictor features from
stats.columns

Index(['Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G',
       'SRS'],
      dtype='object')

In [170]:
# Predictor columns used to model MVP share: every numeric column except the
# voting outcomes themselves ('Pts Won', 'Pts Max', 'Share').
# The order of this list is the order of the model's coefficients.
predictor_features = [
    # player age and playing time
    "Age", "G", "GS", "MP",
    # shooting
    "FG", "FGA", "FG%",
    "3P", "3PA", "3P%",
    "2P", "2PA", "2P%",
    "eFG%",
    "FT", "FTA", "FT%",
    # rebounds, playmaking, defense, fouls, scoring
    "ORB", "DRB", "TRB",
    "AST", "STL", "BLK", "TOV", "PF", "PTS",
    # season
    "Year",
    # team record
    "W", "L", "W/L%", "GB", "PS/G", "PA/G", "SRS",
]


In [171]:
# Training rows: every season strictly before 2024.
# The model is fit on these rows only.
train_data = stats.loc[stats["Year"] < 2024]


In [172]:
# Hold-out rows: the 2024 season only.
# Predictions on these rows are compared with the actual MVP shares.
test_data = stats.loc[stats["Year"] == 2024]


In [173]:
# Define Ridge regression model; alpha=0.1 is the regularization strength
# (larger alpha shrinks the coefficients more)
ridge_model = Ridge(alpha=0.1)


In [174]:
# Train the model using training data (seasons before 2024)
# X (features): predictor_features, y (target): Share (MVP voting share)
# Features are passed as-is from the table; no scaling is applied before fitting
ridge_model.fit(train_data[predictor_features], train_data["Share"])


In [175]:
# Make predictions on the test set (2024 season), one value per row of test_data
predictions_2024 = ridge_model.predict(test_data[predictor_features])


In [176]:
# Wrap the raw prediction array in a one-column frame that reuses test_data's
# index, so each prediction lines up with the row it was computed from
predictions_df = pd.Series(
    predictions_2024,
    index=test_data.index,
    name="Predicted",
).to_frame()
predictions_df


Unnamed: 0,Predicted
524,-0.025309
525,0.004664
526,-0.003725
527,0.008457
528,-0.002275
...,...
15803,-0.001254
15804,-0.008072
15805,-0.027929
15806,0.009920


In [177]:
# Create comparison DataFrame with actual and predicted values.
# The column-wise concat aligns on the row index, which both frames take from test_data.
comparison_df = pd.concat([test_data[["Player","Share"]], predictions_df], axis=1)
comparison_df


Unnamed: 0,Player,Share,Predicted
524,AJ Griffin,0.0,-0.025309
525,Bogdan Bogdanovic,0.0,0.004664
526,Bruno Fernando,0.0,-0.003725
527,Clint Capela,0.0,0.008457
528,De'Andre Hunter,0.0,-0.002275
...,...,...,...
15803,Marvin Bagley III,0.0,-0.001254
15804,Patrick Baldwin Jr.,0.0,-0.008072
15805,Richaun Holmes,0.0,-0.027929
15806,Tristan Vukcevic,0.0,0.009920


In [178]:
# Display the 20 rows with the highest actual MVP share
# so the real 2024 results can be read next to the model's predictions
comparison_df.sort_values(by="Share", ascending=False).head(20)


Unnamed: 0,Player,Share,Predicted
3970,Nikola Jokic,0.935,0.172834
10697,Shai Gilgeous-Alexander,0.646,0.168547
3434,Luka Doncic,0.572,0.188416
8656,Giannis Antetokounmpo,0.194,0.212369
10413,Jalen Brunson,0.143,0.09901
1067,Jayson Tatum,0.087,0.113581
9169,Anthony Edwards,0.018,0.088173
13338,Domantas Sabonis,0.003,0.09725
12295,Kevin Durant,0.001,0.102027
10680,Aaron Wiggins,0.0,-0.002419


In [179]:
# Mean squared error between actual and predicted MVP share on the 2024 hold-out
mean_squared_error(
    y_true=comparison_df["Share"],
    y_pred=comparison_df["Predicted"],
)


0.00247370958842746

In [180]:
# Note: MSE is not the best metric for this case
# What matters is whether the model ranks the actual top MVP candidates first
# (scored further down with an average precision over the actual top 7)
# Let's see the distribution of actual MVP shares
comparison_df["Share"].value_counts()


Share
0.000    563
0.087      1
0.572      1
0.935      1
0.194      1
0.018      1
0.143      1
0.646      1
0.001      1
0.003      1
Name: count, dtype: int64

In [181]:
# Order the 2024 rows by actual MVP share (best first) and number them 1..n
ranked_by_actual = comparison_df.sort_values(by="Share", ascending=False).assign(
    Rank=list(range(1, len(comparison_df) + 1))
)
ranked_by_actual


Unnamed: 0,Player,Share,Predicted,Rank
3970,Nikola Jokic,0.935,0.172834,1
10697,Shai Gilgeous-Alexander,0.646,0.168547,2
3434,Luka Doncic,0.572,0.188416,3
8656,Giannis Antetokounmpo,0.194,0.212369,4
10413,Jalen Brunson,0.143,0.099010,5
...,...,...,...,...
5571,Dillon Brooks,0.000,-0.030686,568
5572,Fred VanVleet,0.000,0.032116,569
5573,Jabari Smith Jr.,0.000,0.000765,570
5574,Jae'Sean Tate,0.000,-0.022537,571


In [182]:
# Re-order by the model's prediction (best first) and number the rows 1..n,
# keeping the actual "Rank" column computed in the previous cell
ranked_by_actual = ranked_by_actual.sort_values(by="Predicted", ascending=False).assign(
    **{"Predicted Rank": list(range(1, len(ranked_by_actual) + 1))}
)
ranked_by_actual


Unnamed: 0,Player,Share,Predicted,Rank,Predicted Rank
8656,Giannis Antetokounmpo,0.194,0.212369,4,1
11770,Joel Embiid,0.000,0.206445,55,2
3434,Luka Doncic,0.572,0.188416,3,3
3970,Nikola Jokic,0.935,0.172834,1,4
10697,Shai Gilgeous-Alexander,0.646,0.168547,2,5
...,...,...,...,...,...
7582,Trey Jemison,0.000,-0.044483,476,568
1795,Nathan Mensah,0.000,-0.044536,409,569
4484,Isaiah Livers,0.000,-0.045315,526,570
14626,Malik Williams,0.000,-0.045343,155,571


In [183]:
# Calculate Average Precision for top 7 MVP candidates
# This metric measures how well the model ranks the top 7 players
def calculate_top7_average_precision(comparison):
    actual_top_7 = comparison.sort_values("Share", ascending=False).head(7)
    predicted_ranking = comparison.sort_values("Predicted", ascending=False)
    
    precision_scores = []
    correct_found = 0 
    players_seen = 1
    
    for index, row in predicted_ranking.iterrows():
        if row["Player"] in actual_top_7["Player"].values:
            correct_found += 1
            precision_scores.append(correct_found / players_seen)
        players_seen += 1
    
    return sum(precision_scores) / len(precision_scores)


In [184]:
# Top-7 average precision of the Ridge predictions for the 2024 hold-out
calculate_top7_average_precision(comparison_df)


0.67906162464986

In [185]:
# Seasons covered by the dataset: 1991 through 2024 inclusive
years_range = [*range(1991, 2025)]


In [186]:
# Backtesting: test model on multiple years
# Skip first 5 years to have enough training data
# (years_range starts at 1991, so the test years are 1996 through 2024)
# NOTE: this loop rebinds the notebook-level names train_data, test_data,
# predictions_df and comparison_df, and refits ridge_model on every iteration;
# after it finishes they all refer to the last test year
average_precisions = []
all_year_predictions = []

for year in years_range[5:]:
    # Train on every season strictly before the test year, test on that year only
    train_data = stats[stats["Year"] < year]
    test_data = stats[stats["Year"] == year]
    
    ridge_model.fit(train_data[predictor_features], train_data["Share"])
    year_predictions = ridge_model.predict(test_data[predictor_features])
    
    # Re-attach the test rows' index so predictions align with Player/Share
    predictions_df = pd.DataFrame(year_predictions, columns=["Predicted"], index=test_data.index)
    comparison_df = pd.concat([test_data[["Player","Share"]], predictions_df], axis=1)
    
    # One comparison frame and one average-precision score per test year
    all_year_predictions.append(comparison_df)
    average_precisions.append(calculate_top7_average_precision(comparison_df))


In [187]:
# Display average precision scores for each test year
# (one entry per year in years_range[5:], in chronological order)
average_precisions

[0.5359307359307359,
 0.5022012578616352,
 0.7980769230769231,
 0.624829931972789,
 0.7780612244897959,
 1.0,
 0.7641287284144428,
 0.9480519480519481,
 0.7406015037593985,
 0.6257936507936508,
 0.6025910364145659,
 0.6269841269841269,
 0.9821428571428571,
 0.7139455782312926,
 0.7931122448979592,
 0.9093537414965985,
 0.6577380952380951,
 0.7806122448979591,
 0.6061086596800883,
 0.831547619047619,
 0.7555555555555555,
 0.7947278911564626,
 0.9123376623376623,
 0.6826229326229326,
 0.8784013605442177,
 0.7922077922077921,
 0.7696555545295042,
 0.6875258799171843,
 0.67906162464986]

In [188]:
# Calculate overall mean average precision across all test years
# (unweighted mean of the per-year scores)
sum(average_precisions) / len(average_precisions)


0.7508244262725399

In [189]:
def add_ranking_columns(comparison):
    """
    Number the rows by actual and by predicted MVP share and compare the two.

    Args:
        comparison: DataFrame with Player, Share, and Predicted columns

    Returns:
        New DataFrame, ordered by descending "Predicted", with three extra
        columns: "Rank" (1 = highest Share), "Predicted Rank"
        (1 = highest Predicted) and "Difference" (Rank - Predicted Rank).
        The input frame is not modified.
    """
    ordinal_positions = list(range(1, len(comparison) + 1))

    ranked = (
        comparison
        .sort_values("Share", ascending=False)
        .assign(Rank=ordinal_positions)
        .sort_values("Predicted", ascending=False)
        .assign(**{"Predicted Rank": ordinal_positions})
    )

    # Positive: the model placed the player higher than the vote did (overrated);
    # negative: the model placed the player lower (underrated)
    ranked["Difference"] = ranked["Rank"] - ranked["Predicted Rank"]
    return ranked


In [190]:
# Analyze predictions for a specific year (index 1 = 1997, since the backtest starts at 1996)
# Keep only the actual top 7 and list them from most overrated to most underrated by the model
year_ranking = add_ranking_columns(all_year_predictions[1])
year_ranking[year_ranking["Rank"] <= 7].sort_values("Difference", ascending=False)


Unnamed: 0,Player,Share,Predicted,Rank,Predicted Rank,Difference
5140,Hakeem Olajuwon,0.083,0.136375,7,4,3
14730,Karl Malone,0.857,0.19232,1,2,-1
1895,Michael Jordan,0.832,0.167613,2,3,-1
4063,Grant Hill,0.327,0.128629,3,6,-3
13975,Gary Payton,0.091,0.093414,6,10,-4
7684,Tim Hardaway,0.207,0.05996,4,20,-16
1546,Glen Rice,0.117,0.033096,5,53,-48


In [191]:
def backtest_model(stats, model, years, features, min_train_years=5):
    """
    Perform backtesting on the model across multiple years

    For each test year the model is refit on every row whose "Year" is strictly
    smaller and evaluated on the rows of that year. The model object passed in
    is refit in place, so after the call it holds the fit for the last test year.

    Args:
        stats: Complete dataset with all years
        model: Machine learning model to test (must provide fit and predict)
        years: List of years, in chronological order
        features: List of feature column names
        min_train_years: Number of leading entries of `years` that are never
            used as a test year, so the first fit has data to train on.
            Defaults to 5. NOTE: the skip applies to the list as given, so
            pass 0 when `years` is already a trimmed subset that should be
            tested in full.

    Returns:
        tuple: (mean_average_precision, list_of_aps, concatenated_predictions)

    Raises:
        ValueError: if no year is left to test after skipping `min_train_years`
    """
    test_years = years[min_train_years:]
    if len(test_years) == 0:
        raise ValueError(
            "No test years left: got {} year(s) with min_train_years={}".format(
                len(years), min_train_years
            )
        )

    average_precisions = []
    all_predictions = []

    for year in test_years:
        train_data = stats[stats["Year"] < year]
        test_data = stats[stats["Year"] == year]

        model.fit(train_data[features], train_data["Share"])
        year_predictions = model.predict(test_data[features])

        # Reuse the test rows' index so predictions align with Player/Share
        predictions_df = pd.DataFrame(year_predictions, columns=["Predicted"], index=test_data.index)
        comparison = pd.concat([test_data[["Player", "Share"]], predictions_df], axis=1)
        comparison = add_ranking_columns(comparison)

        all_predictions.append(comparison)
        average_precisions.append(calculate_top7_average_precision(comparison))

    mean_ap = sum(average_precisions) / len(average_precisions)
    return mean_ap, average_precisions, pd.concat(all_predictions)


In [192]:
# Run backtesting with Ridge regression model
# Same procedure as the manual loop above, so the mean average precision should match it
mean_avg_precision, avg_precisions_list, all_predictions = backtest_model(stats, ridge_model, years_range, predictor_features)
mean_avg_precision

0.7508244262725399

In [193]:
# Show the actual top 7 players, ordered from most overrated to most underrated by the model
# NOTE: year_ranking is still the 1997 frame built earlier from all_year_predictions[1];
# it is not derived from the backtest_model results computed in the previous cell
year_ranking[year_ranking["Rank"] <= 7].sort_values("Difference", ascending=False)


Unnamed: 0,Player,Share,Predicted,Rank,Predicted Rank,Difference
5140,Hakeem Olajuwon,0.083,0.136375,7,4,3
14730,Karl Malone,0.857,0.19232,1,2,-1
1895,Michael Jordan,0.832,0.167613,2,3,-1
4063,Grant Hill,0.327,0.128629,3,6,-3
13975,Gary Payton,0.091,0.093414,6,10,-4
7684,Tim Hardaway,0.207,0.05996,4,20,-16
1546,Glen Rice,0.117,0.033096,5,53,-48


In [194]:
# Display the Ridge coefficients next to their feature names, largest first
# (column 0 = coefficient, column 1 = feature name).
# ridge_model was last refit inside backtest_model, so these are the coefficients
# of the final backtest fit (trained on the seasons before 2024).
# NOTE(review): the features were not standardized before fitting, so the size of a
# coefficient depends on the feature's scale and is not a direct measure of importance;
# the sort is on the signed value, so large negative coefficients end up at the bottom.
feature_importance = pd.concat([pd.Series(ridge_model.coef_), pd.Series(predictor_features)], axis=1)
feature_importance.sort_values(0, ascending=False)


Unnamed: 0,0,1
13,0.123132,eFG%
29,0.031146,W/L%
18,0.029929,DRB
17,0.016409,ORB
10,0.015186,2P
21,0.012398,STL
12,0.010999,2P%
15,0.010338,FTA
22,0.010228,BLK
25,0.007551,PTS


In [195]:
# Calculate year-normalized ratios for key statistics
# Normalize each stat by dividing by the year's average to account for era differences
# (a value of 1.0 means "equal to that season's mean"; only rows of the same
# season enter each mean)
normalized_stats = stats.groupby("Year")[["PTS", "AST", "STL", "BLK", "3P"]].apply(
    lambda x: x / x.mean(), include_groups=False
)

In [196]:
# Add normalized ratio columns to the stats DataFrame.
# The per-year apply result carries the group key ("Year") as index level 0; drop it
# so the ratios align with stats on its own row labels.
# The nlevels guard makes the cell safe to re-run: resetting an already-flat index
# would replace the row labels with 0..n-1 in grouped order and silently misalign
# the assignment below.
if normalized_stats.index.nlevels > 1:
    normalized_stats = normalized_stats.reset_index(level=0, drop=True)
stats[["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]] = normalized_stats[["PTS", "AST", "STL", "BLK", "3P"]]


In [197]:
# Check that the new *_R ratio columns were appended to the table
stats.head()

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_R,AST_R,STL_R,BLK_R,3P_R
0,Doc Rivers,29,Atlanta Hawks,PG,79,79,32.7,5.6,12.9,0.435,...,0.524,18.0,109.8,109.0,0.72,1.692601,2.010078,2.608773,1.346939,5.594452
1,Dominique Wilkins,31,Atlanta Hawks,SF,81,81,38.0,9.5,20.2,0.47,...,0.524,18.0,109.8,109.0,0.72,2.884104,1.542618,2.059558,1.795918,5.085865
2,Duane Ferrell,25,Atlanta Hawks,SF,78,2,14.9,2.2,4.6,0.489,...,0.524,18.0,109.8,109.0,0.72,0.679268,0.327222,0.549215,0.673469,0.0
3,Gary Leonard,23,Atlanta Hawks,C,4,0,2.3,0.0,0.0,0.0,...,0.524,18.0,109.8,109.0,0.72,0.055678,0.0,0.0,0.673469,0.0
4,John Battle,28,Atlanta Hawks,SG,79,2,23.6,5.0,10.9,0.461,...,0.524,18.0,109.8,109.0,0.72,1.514433,1.262142,0.823823,0.22449,1.017173


In [198]:
# Add normalized ratio features to predictor features list.
# Only names not already present are appended, so re-running this cell does not
# duplicate features (the previous in-place "+=" added them again on every run).
predictor_features = predictor_features + [
    ratio_feature
    for ratio_feature in ["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]
    if ratio_feature not in predictor_features
]


In [199]:
# Test model with normalized features added
# (overwrites the results of the earlier Ridge backtest stored under the same names)
mean_avg_precision, avg_precisions_list, all_predictions = backtest_model(stats, ridge_model, years_range, predictor_features)
mean_avg_precision


0.7575530849588599

In [200]:
# Encode categorical variables (Position and Team) as numeric codes
# NOTE: these two columns are created here but are never added to predictor_features,
# so none of the models in this notebook use them
stats["Position_Encoded"] = stats["Pos"].astype("category").cat.codes
stats["Team_Encoded"] = stats["Team"].astype("category").cat.codes


In [201]:
# Initialize StandardScaler for feature scaling (not used in current implementation:
# the scaler is never fitted or applied, so every model sees unscaled features)
scaler = StandardScaler()


In [202]:
# Test Random Forest model (alternative to Ridge regression)
# Using 400 trees, fixed random state, and minimum 5 samples per split
random_forest_model = RandomForestRegressor(n_estimators=400, random_state=1, min_samples_split=5)


In [203]:
# Backtest the Random Forest on a subset of years.
# NOTE: years_range[28:] is 2019-2024 (6 entries) and backtest_model skips the first 5
# entries of the list it receives, so only 2024 is evaluated here; the score below
# is a single-year result, not an average over several seasons
mean_avg_precision_rf, avg_precisions_rf, all_predictions_rf = backtest_model(stats, random_forest_model, years_range[28:], predictor_features)
mean_avg_precision_rf

0.9379251700680271

In [None]:
# Compare Ridge regression performance on the same subset of years
# NOTE: as with the Random Forest run, backtest_model skips the first 5 of the
# 6 years passed in, so this compares the two models on the 2024 season only
mean_avg_precision_ridge, avg_precisions_ridge, all_predictions_ridge = backtest_model(stats, ridge_model, years_range[28:], predictor_features)
mean_avg_precision_ridge

0.6900793650793651