In [1]:
import pandas as pd
import numpy as np

## 1. Get the Data

In [2]:
# Load the dataset from the CSV file; the file's first column becomes the row index.
stats = pd.read_csv("mvps_player_team.csv" , index_col = 0)

In [3]:
# Preview the first five rows of the loaded table.
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,0.0,0.0,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73


In [4]:
# (number of rows, number of columns) of the loaded table.
stats.shape

(14697, 41)

## 2. Checking Null values in each column

In [5]:
# Count the missing values in each column.
pd.isnull(stats).sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          59
3P            0
3PA           0
3P%        2086
2P            0
2PA           0
2P%         100
eFG%         59
FT            0
FTA           0
FT%         521
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W           401
L           401
W/L%        401
GB          401
PS/G        401
PA/G        401
SRS         401
dtype: int64

In [6]:
# Replace every missing value, in every column, with 0.
# NOTE(review): besides the shooting-percentage columns, this also zero-fills the
# team columns (W, L, W/L%, GB, PS/G, PA/G, SRS) for the 401 rows that have no
# team data -- confirm that 0 is a sensible stand-in for those.
stats = stats.fillna(0)

In [7]:
# Re-count missing values per column; every column should now report 0.
pd.isnull(stats).sum()

Player     0
Pos        0
Age        0
Tm         0
G          0
GS         0
MP         0
FG         0
FGA        0
FG%        0
3P         0
3PA        0
3P%        0
2P         0
2PA        0
2P%        0
eFG%       0
FT         0
FTA        0
FT%        0
ORB        0
DRB        0
TRB        0
AST        0
STL        0
BLK        0
TOV        0
PF         0
PTS        0
Year       0
Pts Won    0
Pts Max    0
Share      0
Team       0
W          0
L          0
W/L%       0
GB         0
PS/G       0
PA/G       0
SRS        0
dtype: int64

## 3. Predictor and Target columns

In [8]:
# List the available column names before choosing the predictor columns.
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [9]:
# Numeric input columns for the models. The target ("Share"), the other voting
# columns ("Pts Won", "Pts Max") and the text columns ("Player", "Pos", "Tm",
# "Team") are intentionally left out.
predictors = [
    # player age, games and minutes
    'Age', 'G', 'GS', 'MP',
    # shooting
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%',
    'eFG%',
    'FT', 'FTA', 'FT%',
    # rebounds and the remaining box-score columns
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    # season
    'Year',
    # team columns
    'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS',
]
       

## 4. Sorting the data by Year

In [10]:
# Distinct seasons present, listed in the order they first appear in the table.
stats["Year"].unique()

array([1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2017, 2010, 2011, 2012, 2013, 2014, 2015, 2008, 2009,
       2016, 2018, 2019, 2020, 2021, 2022, 2004, 2005, 2006, 2007],
      dtype=int64)

In [11]:
# Order the rows chronologically by season (earliest year first).
stats = stats.sort_values(by = ["Year" ]  , ascending = True)

#### Resetting the indices

In [12]:
# Discard the old index labels and renumber the rows from 0 in their sorted order.
stats = stats.reset_index(drop = True)

In [13]:
# Confirm that the index now runs consecutively from 0.
stats.index.values

array([    0,     1,     2, ..., 14694, 14695, 14696], dtype=int64)

In [14]:
# Show the first ten rows (identifying columns only) of the sorted table.
stats[['Player','Team','Share','Year']].head(10)

Unnamed: 0,Player,Team,Share,Year
0,A.C. Green,Los Angeles Lakers,0.0,1991
1,Avery Johnson,San Antonio Spurs,0.0,1991
2,Stojko Vranković,Boston Celtics,0.0,1991
3,Robert Parish,Boston Celtics,0.01,1991
4,Reggie Lewis,Boston Celtics,0.0,1991
5,Michael Smith,Boston Celtics,0.0,1991
6,Larry Bird,Boston Celtics,0.026,1991
7,Kevin McHale,Boston Celtics,0.001,1991
8,Kevin Gamble,Boston Celtics,0.0,1991
9,Joe Kleine,Boston Celtics,0.0,1991


## 5. Training and Test dataset

In [15]:
# Hold out the 2022 season as the test set and train on every earlier season.
d_train = stats[stats["Year"] <2022]
d_test = stats[stats["Year"]  == 2022]

In [16]:
# X_train = d_train[predictor]
# y_train = d_train["Share"]
# X_test =  d_test[predictor]
# y_test =  d_test["Share"]

In [17]:
#X_train.shape

In [18]:
#y_train.shape

In [19]:
#X_test.shape

In [20]:
#y_test.shape

## 6. Model Training and Prediction

In [21]:
import sklearn

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [23]:
# Fixed seed for the estimators that use randomness while fitting, so that the
# scores reported below are the same on every run.
RANDOM_STATE = 42

# Candidate regressors that are compared against each other.
models = [LinearRegression(), Ridge(alpha = 0.1), KNeighborsRegressor(), SVR(),
          DecisionTreeRegressor(random_state = RANDOM_STATE),
          RandomForestRegressor(random_state = RANDOM_STATE)]

In [24]:
import warnings
# Suppress all warnings for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides any warning the
# model-fitting code below might emit -- consider narrowing it to a category.
warnings.filterwarnings('ignore')

In [25]:
def acc_metric(DF, top_n = 5):
    """Score how well the predicted top-N players match the actual top-N.

    Every player in the actual top-N (by "Share") earns:

    * 1 if the player is also in the predicted top-N (by "Prediction");
    * otherwise ``top_n / RK_predict`` rounded to 3 decimals, so a near miss
      (e.g. predicted rank 6 with ``top_n = 5`` -> 0.833) costs less than a
      distant one.

    Parameters
    ----------
    DF : pandas.DataFrame
        Needs the columns "Player", "Share", "Prediction" and "RK_predict"
        (the 1-based position of each row when ordered by "Prediction").
    top_n : int, default 5
        How many leading players are compared. The default reproduces the
        original top-5 metric.

    Returns
    -------
    tuple
        ``(scores, accuracy)``: ``scores`` is the list of per-player scores in
        actual-rank order, ``accuracy`` is their mean rounded to 2 decimals.
        For a frame with no rows the result is ``([], nan)``.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    actual = DF.sort_values(by = ["Share"] , ascending = False).head(top_n)
    predicted = DF.sort_values(by = ["Prediction"] , ascending = False).head(top_n)

    act_play_list = actual["Player"].values
    pred_play_list = predicted["Player"].values

    # Nothing to score (e.g. a season without any vote-getters): report NaN
    # instead of failing on a division by zero.
    if len(act_play_list) == 0:
        return [], np.nan

    scores = []
    for player_name in act_play_list:

        if player_name in pred_play_list:
            scores.append(1)

        else:
            # If several rows share the name, the first one in DF's row order is used.
            rank_predicted = DF.loc[DF["Player"] == player_name, "RK_predict"].iloc[0]
            scores.append(np.round(top_n / rank_predicted, 3))

    accuracy_res = np.round(sum(scores) / len(scores), 2)

    return scores, accuracy_res

In [26]:
def ModelTraining(ML_models , DF_train , DF_test , predictor_col ):
    """Fit each model on the training frame and score its ranking on the test frame.

    For every model the function fits on ``DF_train`` (predictor columns ->
    "Share"), predicts "Share" for ``DF_test``, keeps only the test rows whose
    actual "Share" is above 0, ranks those rows by actual and by predicted
    share, and passes the ranked frame to ``acc_metric``.

    Parameters
    ----------
    ML_models : iterable
        Estimators exposing ``fit`` and ``predict``; they are fitted in place.
    DF_train, DF_test : pandas.DataFrame
        Frames containing the predictor columns and "Share"; ``DF_test`` must
        also contain "Player".
    predictor_col : list
        Names of the feature columns.

    Returns
    -------
    pandas.DataFrame
        One row per model, labelled with the text before the first "(" in
        ``str(model)``, with the columns "Score" (list of per-player scores)
        and "Accuracy".
    """
    train_features = DF_train[predictor_col]
    train_target = DF_train["Share"]
    test_features = DF_test[predictor_col]
    test_target = DF_test["Share"]

    per_model_scores = []
    per_model_accuracy = []
    model_names = []

    for estimator in ML_models:

        estimator.fit(train_features.values, train_target)
        predicted_share = pd.DataFrame(data = estimator.predict(test_features.values),
                                       columns = ["Prediction"],
                                       index = test_features.index)

        # Player name, actual share and predicted share side by side.
        combined = pd.concat([DF_test["Player"], test_target, predicted_share], axis = 1)

        # Only players who actually received votes take part in the ranking.
        voted_mask = combined["Share"] > 0
        positions = list(range(1, int(voted_mask.sum()) + 1))

        ranked = (
            combined[voted_mask]
            .sort_values(by = ["Share"], ascending = False)
            .assign(RK_actual = positions)        # rank by actual share
            .sort_values(by = ["Prediction"], ascending = False)
            .assign(RK_predict = positions)       # rank by predicted share
        )

        player_scores, accuracy = acc_metric(DF = ranked)

        per_model_scores.append(player_scores)
        per_model_accuracy.append(accuracy)
        # Label the row with the estimator's name, without its parameter list.
        model_names.append(str(estimator).partition('(')[0])

    score_frame = pd.DataFrame(data = {"Score": per_model_scores},
                               columns = ["Score"],
                               index = model_names)
    accuracy_frame = pd.DataFrame(data = per_model_accuracy,
                                  columns = ["Accuracy"],
                                  index = model_names)

    return pd.concat([score_frame, accuracy_frame], axis = 1)

In [27]:
# Fit every candidate model on the pre-2022 seasons and score each one on the
# 2022 hold-out season.
Model_perf_DF = ModelTraining(ML_models = models , DF_train = d_train , 
                              DF_test = d_test, predictor_col = predictors )

In [28]:
# Per-model score list and accuracy for the 2022 hold-out season.
Model_perf_DF

Unnamed: 0,Score,Accuracy
LinearRegression,"[1, 1, 1, 0.455, 1]",0.89
Ridge,"[1, 1, 1, 0.455, 1]",0.89
KNeighborsRegressor,"[1, 1, 1, 1, 0.625]",0.92
SVR,"[1, 1, 1, 1, 1]",1.0
DecisionTreeRegressor,"[0.833, 1, 1, 1, 0.714]",0.91
RandomForestRegressor,"[1, 1, 1, 1, 0.833]",0.97


In [29]:
# The same results table, rendered as plain text.
print(Model_perf_DF)

                                         Score  Accuracy
LinearRegression           [1, 1, 1, 0.455, 1]      0.89
Ridge                      [1, 1, 1, 0.455, 1]      0.89
KNeighborsRegressor        [1, 1, 1, 1, 0.625]      0.92
SVR                            [1, 1, 1, 1, 1]      1.00
DecisionTreeRegressor  [0.833, 1, 1, 1, 0.714]      0.91
RandomForestRegressor      [1, 1, 1, 1, 0.833]      0.97


## 7. Back Testing

In [30]:
from colored import fg

In [31]:
# Blue foreground escape code; back_testing prints it in each season banner.
color = fg('blue')

In [32]:
# Black foreground escape code; back_testing prints it after each banner, before the results table.
color2 = fg('black')

In [33]:
def back_testing(DF,ML_Models_list ,Predictors_list, years_list):
    """Evaluate the models season by season and print one results table per season.

    For each year in ``years_list`` the rows of ``DF`` with an earlier "Year"
    form the training set and the rows of that year form the test set; both
    are handed to ``ModelTraining``. All seasons are evaluated first, then the
    tables are printed, each under a coloured banner naming the season.

    Parameters
    ----------
    DF : pandas.DataFrame
        Full dataset with a "Year" column.
    ML_Models_list : list
        Models forwarded to ``ModelTraining``.
    Predictors_list : list
        Feature column names forwarded to ``ModelTraining``.
    years_list : list
        Seasons to hold out, one at a time.

    Returns
    -------
    None
        The results are only printed.
    """
    # Pass 1: evaluate every requested season.
    season_tables = [
        ModelTraining(ML_models = ML_Models_list,
                      DF_train = DF[DF["Year"] < season],
                      DF_test = DF[DF["Year"] == season],
                      predictor_col = Predictors_list)
        for season in years_list
    ]

    # Pass 2: print a banner followed by the table for each season.
    for position, season_table in enumerate(season_tables):
        print(color+f"******************\33[1m NBA SEASON : {years_list[position]} \33[0m"+color+"*****************\n\n")
        print(color2+" ")
        print(season_table)
        print("\n\n")

In [34]:
# Seasons to back-test: each one is held out in turn and predicted from all earlier seasons.
years = [2019,2020,2021,2022]

In [35]:
# Run the back-test for the chosen seasons; one results table is printed per season.
back_testing(DF = stats,ML_Models_list = models ,
             Predictors_list = predictors, years_list = years)

[38;5;4m******************[1m NBA SEASON : 2019 [0m[38;5;4m*****************


[38;5;0m 
                                             Score  Accuracy
LinearRegression         [1, 1, 0.625, 0.556, 0.5]      0.74
Ridge                    [1, 1, 0.625, 0.556, 0.5]      0.74
KNeighborsRegressor        [1, 1, 1, 0.714, 0.833]      0.91
SVR                      [1, 1, 0.833, 0.5, 0.556]      0.78
DecisionTreeRegressor  [0.833, 1, 0.714, 0.556, 1]      0.82
RandomForestRegressor      [1, 1, 0.714, 0.455, 1]      0.83



[38;5;4m******************[1m NBA SEASON : 2020 [0m[38;5;4m*****************


[38;5;0m 
                                         Score  Accuracy
LinearRegression           [1, 1, 1, 1, 0.833]      0.97
Ridge                      [1, 1, 1, 1, 0.833]      0.97
KNeighborsRegressor    [1, 1, 1, 0.625, 0.714]      0.87
SVR                        [1, 1, 1, 1, 0.833]      0.97
DecisionTreeRegressor        [1, 0.5, 1, 1, 1]      0.90
RandomForestRegressor          [1, 1, 1,