## Machine Learning

In [1]:
# Import pandas
import pandas as pd

In [2]:
# Load csv data into a pandas DataFrame
stats = pd.read_csv("./data/cleaned_data_for_ml.csv", low_memory=False)

In [3]:
# View DataFrame
stats

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,Pts Won,Pts Max,Share,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,30.0,Phoenix Suns,PF,82.0,55.0,34.5,5.7,11.3,0.502,...,0.0,0.0,0.000,56.0,26.0,0.683,7.0,108.2,103.4,4.68
1,Cedric Ceballos,24.0,Phoenix Suns,SF,53.0,43.0,30.2,8.0,15.0,0.535,...,0.0,0.0,0.000,56.0,26.0,0.683,7.0,108.2,103.4,4.68
2,Charles Barkley,30.0,Phoenix Suns,PF,65.0,65.0,35.4,8.0,16.1,0.495,...,5.0,1010.0,0.005,56.0,26.0,0.683,7.0,108.2,103.4,4.68
3,Dan Majerle,28.0,Phoenix Suns,SG,80.0,76.0,40.1,6.0,14.2,0.418,...,0.0,0.0,0.000,56.0,26.0,0.683,7.0,108.2,103.4,4.68
4,Danny Ainge,34.0,Phoenix Suns,SG,68.0,1.0,22.9,3.3,7.9,0.417,...,0.0,0.0,0.000,56.0,26.0,0.683,7.0,108.2,103.4,4.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12405,Josh Okogie,21.0,Minnesota Timbelwolves,SG,62.0,28.0,25.0,2.7,6.4,0.427,...,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.00
12406,Karl-Anthony Towns,24.0,Minnesota Timbelwolves,C,35.0,35.0,33.9,9.0,17.8,0.508,...,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.00
12407,Kelan Martin,24.0,Minnesota Timbelwolves,SF,31.0,4.0,16.0,2.3,5.8,0.392,...,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.00
12408,Naz Reid,20.0,Minnesota Timbelwolves,C,30.0,11.0,16.5,3.3,8.1,0.412,...,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.00


In [4]:
# View columns in the dataframe
stats.columns

Index(['Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Awards',
       'Year', 'Pts Won', 'Pts Max', 'Share', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [5]:
# Set a list of predictore columns using the list of columns from df
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA',
    '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 
    'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year', 'W', 'L', 
    'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']

In [6]:
# Check the available years in the data
years = stats["Year"].unique()
years

array([1994., 1995., 1996.,    0., 1998., 1999., 2000., 2001., 2023.,
       2002., 2003., 2017., 2010., 2011., 2012., 2013., 2014., 2008.,
       2009., 2015., 2016., 2018., 2019., 2020., 2022., 2021., 2004.,
       2005., 2006., 2007., 1997.])

In [7]:
# Remove all rows where year is 0
stats = stats[stats["Year"] != 0]

In [8]:
# Check the available years in the data
years = stats["Year"].unique()
years

array([1994., 1995., 1996., 1998., 1999., 2000., 2001., 2023., 2002.,
       2003., 2017., 2010., 2011., 2012., 2013., 2014., 2008., 2009.,
       2015., 2016., 2018., 2019., 2020., 2022., 2021., 2004., 2005.,
       2006., 2007., 1997.])

In [9]:
# Create a train set of data that does not include the 2023 season
train = stats[stats["Year"] < 2023]

In [10]:
# Create a test set of data that only includes the 2023 season
test = stats[stats["Year"] == 2023]

In [11]:
# Import the Ridge model from sklearn
from sklearn.linear_model import Ridge

# Initialise model with a regulirization strength of 1.0
reg = Ridge(alpha=1.0)

In [12]:
# Train the Ridge (ML) model using the train data and passing in X
# (predictors) and Y(target)
reg.fit(train[predictors], train["Share"])

In [13]:
# Predict the MVP share for each player in the 2023 season
predictions = reg.predict(test[predictors])

In [14]:
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
predictions

Unnamed: 0,predictions
116,0.004249
117,0.031171
118,0.043303
119,0.226760
120,-0.006157
...,...
12214,-0.016496
12215,0.001101
12216,-0.017173
12217,0.020887


In [15]:
# Combine the Player and Share columns in teh test data with the predictions
combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
combination

Unnamed: 0,Player,Share,predictions
116,A.J. Green,0.000,0.004249
117,Bobby Portis,0.000,0.031171
118,Brook Lopez,0.000,0.043303
119,Giannis Antetokounmpo,0.606,0.226760
120,Grayson Allen,0.000,-0.006157
...,...,...,...
12214,Mitchell Robinson,0.000,-0.016496
12215,Obi Toppin,0.000,0.001101
12216,Quentin Grimes,0.000,-0.017173
12217,RJ Barrett,0.000,0.020887


In [16]:
# Sort the Share values and view the first 15 rows
combination.sort_values("Share", ascending=False).head(15)

Unnamed: 0,Player,Share,predictions
11938,Joel Embiid,0.915,0.194514
119,Giannis Antetokounmpo,0.606,0.22676
2132,Jayson Tatum,0.28,0.133079
928,Shai Gilgeous-Alexander,0.046,0.14604
10843,Donovan Mitchell,0.03,0.081089
3187,Domantas Sabonis,0.027,0.08366
5113,Stephen Curry,0.005,0.103291
8313,Jimmy Butler,0.003,0.107836
3185,De'Aaron Fox,0.002,0.089983
9956,Ja Morant,0.001,0.124287


In [17]:
# Import the mean_squared_error function from sklearn
from sklearn.metrics import mean_squared_error

# Calculate the mean squared error of the predictions
mean_squared_error(combination["Share"], combination["predictions"])

0.0024775302030947926

In [18]:
# Get unique values and counts
combination["Share"].value_counts()

Share
0.000    458
0.001      2
0.606      1
0.046      1
0.280      1
0.002      1
0.027      1
0.005      1
0.003      1
0.030      1
0.915      1
Name: count, dtype: int64

In [19]:
# Sort rows in combination based on the Share column from highest to lowest
combination = combination.sort_values("Share", ascending=False)

# Create a new column called Rk
combination["Rk"] = list(range(1, combination.shape[0] + 1))

In [20]:
# View first 15 rows
combination.head(15)

Unnamed: 0,Player,Share,predictions,Rk
11938,Joel Embiid,0.915,0.194514,1
119,Giannis Antetokounmpo,0.606,0.22676,2
2132,Jayson Tatum,0.28,0.133079,3
928,Shai Gilgeous-Alexander,0.046,0.14604,4
10843,Donovan Mitchell,0.03,0.081089,5
3187,Domantas Sabonis,0.027,0.08366,6
5113,Stephen Curry,0.005,0.103291,7
8313,Jimmy Butler,0.003,0.107836,8
3185,De'Aaron Fox,0.002,0.089983,9
9956,Ja Morant,0.001,0.124287,10


In [21]:
# Sort rows in commbination based on predictions from highest to smallest
combination = combination.sort_values("predictions", ascending=False)

# Create a column Predicted_Rk with the rank of the predictions
combination["Predicted_Rk"] = list(range(1, combination.shape[0] + 1))

# View the first 15 rows
combination.head(15)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
119,Giannis Antetokounmpo,0.606,0.22676,2,1
11938,Joel Embiid,0.915,0.194514,1,2
11470,Luka Dončić,0.0,0.191753,145,3
491,Nikola Jokić,0.0,0.15859,342,4
928,Shai Gilgeous-Alexander,0.046,0.14604,4,5
6231,Damian Lillard,0.0,0.137166,355,6
2132,Jayson Tatum,0.28,0.133079,3,7
6577,Anthony Davis,0.0,0.131956,78,8
6581,LeBron James,0.0,0.127415,75,9
9956,Ja Morant,0.001,0.124287,10,10


In [22]:
# Sort rows based on Share from highest to lowest and view the first 10 rows
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
11938,Joel Embiid,0.915,0.194514,1,2
119,Giannis Antetokounmpo,0.606,0.22676,2,1
2132,Jayson Tatum,0.28,0.133079,3,7
928,Shai Gilgeous-Alexander,0.046,0.14604,4,5
10843,Donovan Mitchell,0.03,0.081089,5,24
3187,Domantas Sabonis,0.027,0.08366,6,22
5113,Stephen Curry,0.005,0.103291,7,13
8313,Jimmy Butler,0.003,0.107836,8,12
3185,De'Aaron Fox,0.002,0.089983,9,15
9956,Ja Morant,0.001,0.124287,10,10


In [23]:
def find_ap(combination: pd.DataFrame):
    """
    Find prediction precision
    """
    # Get the actual Share values
    actual = combination.sort_values("Share", ascending=False).head(5)

    # Get the predicted share values
    predicted = combination.sort_values("predictions", ascending=False).head(5)

    # Initialize variables
    ps = []
    found = 0
    seen = 1

    # Loop through each row in predicted
    for idx, row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            # If the player in predicted is in actual

            # Increment found by 1 (credit for a correct prediction)
            found += 1

            # Append the found/seen ratio to ps
            ps.append(found/seen)
        
        # Increment seen by 1 
        seen += 1
    
    # Return ps
    return ps

In [24]:
# Find the ap
find_ap(combination)

[1.0, 1.0, 0.6]

These are Joel, Giannis and Shai from our combination table. Joel and Giannis are perfect, but it took us longer to predict Shai (hence the .6)

In [25]:
# Aggregate the aps
sum(find_ap(combination)) / len(find_ap(combination))

0.8666666666666667

In [26]:
# Create a list of years
years = list(range(1994, 2024))

In [27]:
# initialize variables
aps = []
all_predictions = []

# Loop through each year
for year in years[5:]:

    # Create train split
    train = stats[stats["Year"] < year]

    # Create test split
    test = stats[stats["Year"] == year]

    # Back testing predictions
    reg.fit(train[predictors], train["Share"])
    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
    combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)

    # Return preditions for all the years
    all_predictions.append(combination)
    
    # Find the average precision
    aps.append(find_ap(combination))

In [28]:
# View precisions
aps

[[0.5],
 [1.0, 1.0, 1.0],
 [1.0, 1.0, 1.0],
 [1.0, 1.0, 0.75],
 [1.0, 1.0, 1.0, 1.0],
 [1.0, 1.0, 1.0, 1.0],
 [1.0, 0.6666666666666666],
 [1.0, 0.4],
 [1.0, 1.0],
 [1.0, 1.0, 0.75, 0.8],
 [1.0, 1.0, 1.0, 1.0],
 [1.0, 1.0, 1.0],
 [1.0, 1.0],
 [0.5, 0.5],
 [1.0, 1.0],
 [1.0, 1.0, 0.75],
 [1.0, 1.0, 1.0, 0.8],
 [1.0, 1.0, 1.0],
 [1.0, 1.0, 1.0],
 [1.0, 1.0, 1.0],
 [1.0, 1.0],
 [1.0, 1.0, 0.75, 0.8],
 [1.0, 1.0],
 [1.0, 1.0],
 [1.0, 1.0, 0.6]]

In [29]:
# Flatten list of lists
aps_flat = [item for sublist in aps for item in sublist]

# Calculate overall precision of the predictions
result = sum(aps_flat) / len(aps_flat)

result


0.9366666666666666

The high average precision means we are generally finding our top 5 MVPs accross multiple NBA seasons pretty quickly, and the the model performs better across seasons

In [30]:
# Machine learning model diagnosis
def add_ranks(combination):
    """
    Add ranks
    """
    # Sort rows in commbination based on Share from highest to smallest
    combination = combination.sort_values("Share", ascending=False)
    
    # Create a column Rk with the rank of the players (mvps)
    combination["Rk"] = list(range(1, combination.shape[0] + 1))

    # Sort rows in commbination based on predictions from highest to smallest
    combination = combination.sort_values("predictions", ascending=False)
    
    # Create a column Predicted_Rk with the rank of the predictions
    combination["Predicted_Rk"] = list(range(1, combination.shape[0] + 1))

    # Difference between ranks
    combination["Diff"] = combination["Rk"] - combination["Predicted_Rk"]
    
    return combination


In [31]:
add_ranks(all_predictions[1]).sort_values("Diff", ascending=False)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
2938,Tracy McGrady,0.0,0.058905,409,21,388
12352,Stephon Marbury,0.0,0.054883,411,26,385
3518,Patrick Ewing,0.0,0.049210,396,30,366
3517,Marcus Camby,0.0,0.037225,397,41,356
2584,Dikembe Mutombo,0.0,0.048790,379,32,347
...,...,...,...,...,...,...
7620,Chris Herren,0.0,-0.022926,42,376,-334
7215,Mike Bibby,0.0,-0.019152,19,354,-335
7216,Milt Palacio,0.0,-0.019486,20,358,-338
7218,Othella Harrington,0.0,-0.032606,17,393,-376


In [32]:
# Only look at players with a ranking of 5 or less
ranking = add_ranks(all_predictions[1])
ranking[ranking["Rk"] <= 5].sort_values("Diff", ascending=False)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
1329,Karl Malone,0.258,0.168554,4,2,2
83,Shaquille O'Neal,0.998,0.245859,1,1,0
3863,Alonzo Mourning,0.303,0.146838,3,3,0
7380,Tim Duncan,0.205,0.118589,5,7,-2
4632,Kevin Garnett,0.337,0.116019,2,8,-6


If the Diff is > 0 it means the player was predicted higher that they actually rank, if < 0 it means they were predicted lower than they actually rank. If 0 it means that they were predicted correctly by the model

In [33]:
# Automate backtest function
def backtest(stats, model, years, predictors):
    """
    Backtest
    """
    # Initialize variables
    aps = []
    all_predictions = []
    
    # Loop through each year
    for year in years[5:]:

        # Create train split
        train = stats[stats["Year"] < year]

        # Create test split
        test = stats[stats["Year"] == year]

        # Back testing predictions
        reg.fit(train[predictors], train["Share"])
        predictions = reg.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
        combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)

        # Return preditions for all the years
        all_predictions.append(combination)

        # Use the add_ranks function
        all_predictions.append(add_ranks(combination))
        
        # Find the average precision
        aps.append(find_ap(combination))

    # Flatten list of lists
    aps_flat = [item for sublist in aps for item in sublist]

    # Calculate overall precision of the predictions
    result = sum(aps_flat) / len(aps_flat)
    
    # Return mean average precision
    return result, aps, pd.concat(all_predictions)

In [34]:
mean_aps, aps, predictions = backtest(stats, reg, years[5:], predictors)

In [35]:
mean_aps

0.9342261904761905

In [36]:
aps

[[1.0, 1.0, 1.0, 1.0],
 [1.0, 0.6666666666666666],
 [1.0, 0.4],
 [1.0, 1.0],
 [1.0, 1.0, 0.75, 0.8],
 [1.0, 1.0, 1.0, 1.0],
 [1.0, 1.0, 1.0],
 [1.0, 1.0],
 [0.5, 0.5],
 [1.0, 1.0],
 [1.0, 1.0, 0.75],
 [1.0, 1.0, 1.0, 0.8],
 [1.0, 1.0, 1.0],
 [1.0, 1.0, 1.0],
 [1.0, 1.0, 1.0],
 [1.0, 1.0],
 [1.0, 1.0, 0.75, 0.8],
 [1.0, 1.0],
 [1.0, 1.0],
 [1.0, 1.0, 0.6]]

In [37]:
predictions

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
775,Aaron McKie,0.0,-0.011246,,,
776,Allen Iverson,0.0,0.068482,,,
777,Amal McCaskill,0.0,-0.013487,,,
778,Derrick Coleman,0.0,-0.007196,,,
779,Eric Snow,0.0,-0.015714,,,
...,...,...,...,...,...,...
11460,Chris Silva,0.0,-0.042287,213.0,465.0,-252.0
6238,Justin Minaya,0.0,-0.042771,88.0,466.0,-378.0
6239,Justise Winslow,0.0,-0.049032,87.0,467.0,-380.0
9957,Jacob Gilyard,0.0,-0.053180,57.0,468.0,-411.0
