In [38]:
import pandas as pd
import numpy as np
import os

from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from imblearn.over_sampling import RandomOverSampler, SMOTENC

import mlflow

In [2]:
# Folder (relative to the current working directory) that holds the pickled dataframes.
dataframes_path = os.path.join(os.getcwd(), 'dataframes')

In [3]:
# Load the input table from `dataframes_path`.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle(os.path.join(dataframes_path, 'complete_stats.pkl'))

In [4]:
def check_rk_season(df, rk_season_pairs):
    """Flag the rows of ``df`` whose ``(Rk, Season)`` pair appears in ``rk_season_pairs``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Rk'`` and ``'Season'``.
    rk_season_pairs : iterable of (Rk, Season) pairs
        The pairs to look for.

    Returns
    -------
    pandas.Series
        Boolean mask that shares ``df``'s index. An empty ``df`` yields an empty mask.
    """
    # Hash the pairs once so every row is an O(1) lookup instead of a scan of the whole list.
    wanted = {tuple(pair) for pair in rk_season_pairs}
    # Walking the two columns directly replaces the slow row-wise DataFrame.apply, which
    # also returns a DataFrame (not a mask) when ``df`` has no rows.
    hits = [pair in wanted for pair in zip(df['Rk'], df['Season'])]
    return pd.Series(hits, index = df.index, dtype = bool)

def drop_players_multiteams(df):
    """Remove the per-team rows of every (Rk, Season) pair that also has a 'TOT' row.

    Rows whose ``Tm`` is ``'TOT'`` are kept, as are rows whose (Rk, Season) pair has no
    'TOT' row at all. The result gets a fresh 0..n-1 index.
    """
    totals = df.loc[df['Tm'] == 'TOT']
    total_keys = list(zip(totals['Rk'], totals['Season']))

    # All rows (TOT and per-team) belonging to a pair that has a 'TOT' row ...
    rows_with_total = df[check_rk_season(df, total_keys)]
    # ... of which only the per-team ones are discarded.
    per_team_rows = rows_with_total[rows_with_total['Tm'] != 'TOT']

    return df.drop(index = per_team_rows.index).reset_index(drop = True)

- Drop players with more than one team
- Index with tuple for Rk & Season
- Drop columns: Rk, GT, Votes, MaxVotes
- LabelEncode for Player, Pos and Team
- Separate types of columns

In [5]:
class DropPlayersMultiTeams(BaseEstimator, TransformerMixin):
    """Pipeline step that removes the per-team rows of players who also have a 'TOT' row.

    The step is stateless: ``fit`` learns nothing, and ``transform`` delegates to the
    module-level ``drop_players_multiteams`` so the rule is implemented in one place only.
    """

    def __init__(self):
        pass

    def fit(self, X, y = None):
        """Nothing to learn; returns ``self`` so the step can be chained in a Pipeline."""
        return self

    def transform(self, X, y = None):
        """Return ``X`` without the non-'TOT' rows of each (Rk, Season) that has a 'TOT' row.

        The returned frame has a fresh 0..n-1 index.
        """
        # Previously a line-for-line copy of drop_players_multiteams; calling the helper
        # keeps the two from drifting apart.
        return drop_players_multiteams(X)

In [6]:
class DropColumns(BaseEstimator, TransformerMixin):
    """Pipeline step that removes a fixed set of columns.

    Parameters
    ----------
    cols_to_drop : label or list of labels
        Column labels handed to ``DataFrame.drop``.
    """

    def __init__(self, cols_to_drop):
        self.cols_to_drop = cols_to_drop

    def fit(self, X, y = None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Return a new frame equal to ``X`` minus the configured columns."""
        unwanted = self.cols_to_drop
        return X.drop(unwanted, axis = 1)

In [7]:
class SetIndex(BaseEstimator, TransformerMixin):
    """Pipeline step that indexes the frame by one or more of its columns.

    Parameters
    ----------
    keys : column label or sequence of column labels, default ('Rk', 'Season')
        Columns that become the (Multi)Index.
    drop : bool, default False
        Forwarded to ``DataFrame.set_index``. ``False`` keeps the key columns as regular
        columns in addition to the index.

    With the defaults the step behaves exactly like the former hard-coded version.
    """

    def __init__(self, keys = ('Rk', 'Season'), drop = False):
        # Stored unmodified, as scikit-learn's get_params / clone expect.
        self.keys = keys
        self.drop = drop

    def fit(self, X, y = None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Return ``X`` indexed by the configured key columns."""
        # set_index treats a tuple as ONE label, so always hand it a list of labels.
        keys = [self.keys] if isinstance(self.keys, str) else list(self.keys)
        return X.set_index(keys, drop = self.drop)

In [8]:
class OutlierFilter(BaseEstimator, TransformerMixin):
    '''
    Row filter that trims the tails of selected columns using np.quantile().

    Parameters
    ----------
    q : float
        Tail size. In each filtered column a value is kept only if it lies strictly
        between the ``q`` and ``1 - q`` quantiles of that column.
    col_to_filter : list of column labels
        Columns the quantile bounds are applied to. An empty list filters nothing.

    Attributes
    ----------
    index : pandas.Index
        Index of the rows kept by the most recent ``transform`` call.
    '''

    def __init__(self, q, col_to_filter):
        self.q = q
        self.col_to_filter = col_to_filter

    def fit(self, X, y = None):
        '''Nothing is learned here; returns ``self``.'''
        return self

    def transform(self, X, y = None):
        '''
        Keep only the rows that are inside the quantile bounds in EVERY listed column.

        Equivalently, a row is removed as soon as it is an outlier in at least one of the
        columns. (The earlier docstring described the opposite rule - "outlier in all
        columns" - which is not what the code does.)

        NOTE(review): the bounds are computed from the ``X`` passed to this call, not
        learned in ``fit``, so transforming new data uses that data's own quantiles.
        '''
        # Start from "keep everything" so an empty column list is a no-op instead of an error.
        keep = np.ones(len(X), dtype = bool)

        for col in self.col_to_filter:
            values = X[col]
            lower = np.quantile(values, q = self.q)
            upper = np.quantile(values, q = 1 - self.q)
            # Strict comparisons: rows exactly on a bound are dropped as well.
            keep &= ((values > lower) & (values < upper)).to_numpy()

        X = X[keep]

        # Remember which rows survived so callers can align other objects with the result.
        self.index = X.index

        return X

In [9]:
# Columns handed to the DropColumns pipeline step.
cols_to_drop = ['Rk', 'GT', 'Votes', 'MaxVotes', 'FG_tot', '3PA_tot', '2PA_tot', 'FGA_tot_rank']
# Columns whose quantile tails are trimmed by the OutlierFilter pipeline step.
cols_to_filter = ['PER', 'WS/48', 'BPM', 'USG%']

In [10]:
# Preprocessing pipeline; the steps run in the order listed.
pipe = Pipeline(steps = [
    # keep only the 'TOT' row of players with several teams in a season (resets the index)
    ('DropPlayersMultiTeams', DropPlayersMultiTeams()),
    # drop rows outside the (q, 1 - q) quantile band of the selected columns
    ('OutlierFilter', OutlierFilter(q = .0005, col_to_filter = cols_to_filter)),
    # index by (Rk, Season) while keeping both as columns
    ('SetIndex', SetIndex()),
    # 'Rk' is removed as a column here but remains available through the index
    ('DropColumns', DropColumns(cols_to_drop))
])

In [11]:
# NOTE(review): this rebinds `df`, so the cell is not idempotent - re-running it feeds the
# already-processed frame (which no longer has an 'Rk' column) back into the pipeline.
# Re-run the loading cell first.
df = pipe.fit_transform(df)

In [12]:
def extract_players_list(df):
    """Move the 'Player' column out of ``df`` and return it as a one-column DataFrame.

    Side effect: ``df`` is modified in place and has no 'Player' column afterwards.
    The returned frame carries the same index as ``df``.
    """
    return df.pop('Player').to_frame()

In [13]:
# Keep the player names (indexed like `df`) for labelling predictions later.
# `df` loses its 'Player' column here.
players_list = extract_players_list(df)

In [14]:
# One encoder per categorical column so each keeps its own label <-> integer mapping.
encoder_position = LabelEncoder()
encoder_team = LabelEncoder()

In [15]:
# Replace the category labels with integer codes; the fitted encoders retain the mapping.
# NOTE(review): the columns are overwritten in place, so re-running this cell refits the
# encoders on the integer codes and the original labels are lost from `classes_`.
df['Pos'] = encoder_position.fit_transform(df['Pos'])
df['Tm'] = encoder_team.fit_transform(df['Tm'])

In [16]:
# Quick look at the processed, encoded frame.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pos,Age,Tm,G,GS,MP_pg,FG_pg,FGA_pg,FG%,3P_pg,...,DWS_rank,WS_rank,WS/48_rank,OBPM_rank,DBPM_rank,BPM_rank,VORP_rank,%W_rank,%GS,Share
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1980,0,32,16,82,-10,38.3,10.2,16.9,0.604,0.0,...,3,1,1,5,7,2,1,2,-1.0,0.665
2,1980,2,25,11,67,-10,18.2,2.3,4.7,0.481,0.0,...,44,71,83,50,32,54,41,44,-1.0,0.0
3,1980,0,25,29,75,-10,28.9,6.2,11.7,0.531,0.0,...,13,27,21,23,11,12,12,6,-1.0,0.0
4,1980,5,31,1,80,80,35.8,4.8,9.9,0.482,0.1,...,23,14,25,26,33,31,20,1,1.0,0.009
5,1980,0,31,5,26,-10,21.5,1.0,2.3,0.45,0.0,...,47,85,109,63,21,56,42,38,-1.0,0.0


In [17]:
# Fraction of rows that belong to seasons after 2015 (the hold-out period used below).
len(df[df['Season'] > 2015]) / len(df)

0.17507805847289243

In [18]:
# Time-based split: seasons after 2015 are held out for validation, earlier seasons are
# used for development (training + test).
val_df = df[df['Season'] > 2015]
dev_df = df[df['Season'] <= 2015]

In [19]:
def oversample(df, os_param = 'ros', sampling_strategy = .3, random_state = 23, categorical_cols = None):
    """Oversample the rows with a positive ``Share`` and return the enlarged frame.

    A temporary binary target (``Share > 0``) drives the resampling. It is not part of the
    returned frame, which has the same columns as ``df`` (``Share`` included).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Share'`` column. It is not modified.
    os_param : str, default 'ros'
        ``'smote'`` selects SMOTENC; any other value selects RandomOverSampler.
    sampling_strategy : float, default .3
        Forwarded to the chosen imbalanced-learn sampler.
    random_state : int, default 23
        Seed forwarded to the sampler.
    categorical_cols : list of column labels, optional
        Columns SMOTENC must treat as categorical. When omitted, every integer-dtype
        column is used (the previous behaviour).
        NOTE(review): that default also covers integer counts and ranks, not only
        label-encoded columns - pass the real categorical columns if that is unintended.

    Returns
    -------
    (pandas.DataFrame, float)
        The resampled frame and the ``sampling_strategy`` that was applied.
        NOTE(review): the resampled frame appears to come back from imbalanced-learn with
        a fresh index rather than ``df``'s - confirm before relying on the index.
    """
    # Temporary classification target: 1 for rows with a positive Share, else 0.
    df_y = ((df['Share'] > 0) * 1).to_frame('Contender')
    # A pre-existing 'Contender' column was never part of the features, keep it that way.
    df_X = df.drop(columns = 'Contender', errors = 'ignore')

    if os_param == 'smote':
        if categorical_cols is None:
            categorical_cols = df_X.select_dtypes('int').columns
        cat_index = [df_X.columns.get_loc(column) for column in categorical_cols]
        os_technique = SMOTENC(sampling_strategy = sampling_strategy, random_state = random_state, categorical_features = cat_index)
    else:
        os_technique = RandomOverSampler(sampling_strategy = sampling_strategy, random_state = random_state)

    # Only the resampled features are needed; the helper target is discarded again, which
    # replaces the former concat-then-drop round trip.
    X_resampled, _ = os_technique.fit_resample(df_X, df_y)

    return X_resampled, sampling_strategy

In [20]:
# NOTE(review): rebinding `dev_df` makes this cell non-idempotent - running it twice
# oversamples the already-oversampled frame. Re-run the split cell above first.
dev_df, sampling_ratio = oversample(dev_df, os_param = 'smote', sampling_strategy = .2)

In [21]:
# Features / target for the development set (oversampled above) ...
# The target is kept as a one-column DataFrame.
dev_df_X = dev_df.drop('Share', axis=1)
dev_df_y = dev_df[['Share']]

# ... and for the validation set (seasons after 2015, not oversampled).
val_df_X = val_df.drop('Share', axis=1)
val_df_y = val_df[['Share']]

In [22]:
# Hold out 20% of the development rows as a test set (fixed seed for reproducibility).
# NOTE(review): `dev_df` was oversampled before this split, so duplicated / synthetic rows
# derived from the same originals can land on both sides and make test (and CV) scores
# optimistic. Consider splitting first and oversampling only the training part.
X_train, X_test, y_train, y_test = train_test_split(
                                        dev_df_X,
                                        dev_df_y,
                                        test_size = 0.2,
                                        random_state = 23
                                     )

In [23]:
# Select the MLflow experiment that the runs below are logged to.
mlflow.set_experiment('NBA MVP prediction model')

<Experiment: artifact_location='file:///C:/Users/marcr/OneDrive/Documents/Data%20Science/NBA%20Project/mlruns/0', experiment_id='0', lifecycle_stage='active', name='NBA MVP prediction model', tags={}>

In [24]:
def eval_metrics(actual, predicted):
    """Score predictions against the ground truth.

    Returns
    -------
    tuple
        ``(rmse, mae, r2)`` in that order.
    """
    mse = metrics.mean_squared_error(actual, predicted)
    return (
        np.sqrt(mse),
        metrics.mean_absolute_error(actual, predicted),
        metrics.r2_score(actual, predicted),
    )

In [39]:
# Fixed hyper-parameters for a single, non-grid fit; the grid search below does not read this dict.
parameters = {'max_depth': 12, 'random_state': 23, 'min_samples_split': 20}
# Search space for GridSearchCV (3 * 3 * 3 * 2 = 54 combinations); `random_state` has a
# single value so every candidate uses the same seed.
param_grid = {'max_depth': [10, 12, 14], 'min_samples_split': [15, 20, 25], 'random_state': [23], 'max_features': [.9, .8, .7], 'n_estimators': [100, 200]}

In [40]:
with mlflow.start_run():
    # ---- 1. Hyper-parameter search -----------------------------------------------------
    model = RandomForestRegressor()

    # 10-fold grid search scored by negated RMSE (higher is better for scikit-learn).
    grid_model = GridSearchCV(model, param_grid, scoring = 'neg_root_mean_squared_error', n_jobs = -1, cv = 10)

    # y_train is a one-column DataFrame; flatten it to 1-D so scikit-learn does not warn
    # about receiving a column-vector y.
    grid_model.fit(X_train, np.ravel(y_train))

    best_params = grid_model.best_params_
    best_model = grid_model.best_estimator_
    best_cv_score = grid_model.best_score_

    # ---- 2. Predictions and metrics on train / test / validation -----------------------
    y_pred_train = best_model.predict(X_train)
    y_pred_test = best_model.predict(X_test)
    y_pred_val = best_model.predict(val_df_X)

    rmse_train, mae_train, r2_train = eval_metrics(y_train, y_pred_train)
    rmse_test, mae_test, r2_test = eval_metrics(y_test, y_pred_test)
    rmse_val, mae_val, r2_val = eval_metrics(val_df_y, y_pred_val)

    # ---- 3. MLflow logging ---------------------------------------------------------------
    mlflow.log_param('max_depth', best_params['max_depth'])
    mlflow.log_param('min_samples_split', best_params['min_samples_split'])
    mlflow.log_param('max_features', best_params['max_features'])
    mlflow.log_param('n_estimators', best_params['n_estimators'])
    mlflow.log_param('sampling_ratio', sampling_ratio)

    mlflow.log_metric('rmse_train', rmse_train)
    mlflow.log_metric('r2_train', r2_train)
    mlflow.log_metric('rmse_test', rmse_test)
    mlflow.log_metric('r2_test', r2_test)
    mlflow.log_metric('rmse_val', rmse_val)
    mlflow.log_metric('r2_val', r2_val)
    # best_score_ is the negated RMSE, so flip the sign before logging.
    mlflow.log_metric('rmse_cv', best_cv_score * -1)

    mlflow.sklearn.log_model(best_model, 'model')

    # Store the processed input frame next to the model so the run can be reproduced.
    df.to_pickle(os.path.join(dataframes_path, 'entry_dataframe.pkl'))

    mlflow.log_artifact(os.path.join(dataframes_path, 'entry_dataframe.pkl'))

    # ---- 4. Summary table ----------------------------------------------------------------
    # Reuses the metrics computed above instead of scoring every set a second time.
    df_results = pd.DataFrame(
        {
            'Train': (rmse_train, mae_train, r2_train),
            'CV': [best_cv_score * -1, np.nan, np.nan],
            'Test': (rmse_test, mae_test, r2_test),
            'Validation': (rmse_val, mae_val, r2_val),
        },
        index = ['RMSE', 'MAE', 'R2']
        )
    display(df_results)

    # ---- 5. Per-player view of the validation predictions --------------------------------
    players_val = players_list[players_list.index.get_level_values(1) > 2015]

    players_pred_share_df = pd.DataFrame(y_pred_val, index = val_df.index, columns = ['PredShare'])

    val_df_results = pd.concat([players_val, val_df_y, players_pred_share_df], axis = 1)

    # Contenders: players with a positive actual Share.
    val_df_contenders = val_df_results[val_df_results['Share'] > 0]
    rmse_contenders = metrics.mean_squared_error(val_df_contenders['Share'], val_df_contenders['PredShare']) ** .5
    mlflow.log_metric('rmse_cont', rmse_contenders)

    print(f'Contenders Results: {rmse_contenders}')
    # sorted(): a bare set has no guaranteed iteration order, so the seasons could be
    # displayed out of sequence.
    for season in sorted(set(val_df_contenders.index.get_level_values(1))):
        display(val_df_contenders.loc[pd.IndexSlice[:, season], :].sort_values(by = 'Share', ascending = False))

    # Non-contenders: players with an actual Share of exactly 0.
    val_df_nocontenders = val_df_results[val_df_results['Share'] == 0]
    mlflow.log_metric('mae_no_cont', metrics.mean_absolute_error(val_df_nocontenders['Share'], val_df_nocontenders['PredShare']))

    print('No contenders but predicted contender')
    # NOTE(review): forest predictions are averages and are rarely exactly 0, so this
    # filter keeps nearly every row; a stricter cut-off is applied in a later cell.
    display(val_df_nocontenders[val_df_nocontenders['PredShare'] > 0])


  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,Train,CV,Test,Validation
RMSE,0.022162,0.039748,0.039771,0.035384
MAE,0.005782,,0.010509,0.006166
R2,0.959159,,0.861365,0.609004


Contenders Results: 0.20550278200957336


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
105,2016,Stephen Curry,1.0,0.837119
263,2016,Kawhi Leonard,0.484,0.378077
222,2016,LeBron James,0.482,0.433539
452,2016,Russell Westbrook,0.371,0.298059
126,2016,Kevin Durant,0.112,0.48282
348,2016,Chris Paul,0.082,0.216353
167,2016,Draymond Green,0.038,0.092159
266,2016,Damian Lillard,0.02,0.112415
178,2016,James Harden,0.007,0.325971
273,2016,Kyle Lowry,0.005,0.055059


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
458,2017,Russell Westbrook,0.879,0.372722
173,2017,James Harden,0.746,0.433512
261,2017,Kawhi Leonard,0.495,0.427796
221,2017,LeBron James,0.33,0.369179
425,2017,Isaiah Thomas,0.08,0.394721
98,2017,Stephen Curry,0.051,0.323103
16,2017,Giannis Antetokounmpo,0.007,0.147595
453,2017,John Wall,0.007,0.12533
100,2017,Anthony Davis,0.002,0.174605
119,2017,Kevin Durant,0.002,0.432884


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
195,2018,James Harden,0.955,0.690064
249,2018,LeBron James,0.731,0.400896
122,2018,Anthony Davis,0.441,0.231019
298,2018,Damian Lillard,0.205,0.239435
508,2018,Russell Westbrook,0.075,0.253837
16,2018,Giannis Antetokounmpo,0.074,0.169098
143,2018,Kevin Durant,0.065,0.420741
130,2018,DeMar DeRozan,0.032,0.016042
7,2018,LaMarcus Aldridge,0.006,0.047356
79,2018,Jimmy Butler,0.005,0.024802


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18,2019,Giannis Antetokounmpo,0.932,0.574975
207,2019,James Harden,0.768,0.5283
184,2019,Paul George,0.352,0.129165
270,2019,Nikola Jokić,0.21,0.215219
124,2019,Stephen Curry,0.173,0.276804
305,2019,Damian Lillard,0.068,0.119476
155,2019,Joel Embiid,0.049,0.150451
150,2019,Kevin Durant,0.025,0.266525
301,2019,Kawhi Leonard,0.013,0.298284
506,2019,Russell Westbrook,0.008,0.180454


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13,2020,Giannis Antetokounmpo,0.952,0.473693
251,2020,LeBron James,0.746,0.448282
199,2020,James Harden,0.363,0.39875
134,2020,Luka Dončić,0.198,0.326361
294,2020,Kawhi Leonard,0.166,0.314257
123,2020,Anthony Davis,0.081,0.323674
396,2020,Chris Paul,0.026,0.006773
297,2020,Damian Lillard,0.023,0.322016
262,2020,Nikola Jokić,0.018,0.135364
449,2020,Pascal Siakam,0.017,0.000817


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
263,2021,Nikola Jokić,0.961,0.539585
146,2021,Joel Embiid,0.58,0.172561
116,2021,Stephen Curry,0.449,0.350642
12,2021,Giannis Antetokounmpo,0.345,0.296199
396,2021,Chris Paul,0.138,0.014519
130,2021,Luka Dončić,0.042,0.284552
297,2021,Damian Lillard,0.038,0.297023
421,2021,Julius Randle,0.02,0.039327
439,2021,Derrick Rose,0.01,2.4e-05
180,2021,Rudy Gobert,0.008,0.082991


No contenders but predicted contender


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2016,Quincy Acy,0.0,0.000024
2,2016,Jordan Adams,0.0,0.000130
3,2016,Steven Adams,0.0,0.000024
4,2016,Arron Afflalo,0.0,0.000024
5,2016,Alexis Ajinça,0.0,0.000024
...,...,...,...,...
536,2021,Delon Wright,0.0,0.000024
537,2021,Thaddeus Young,0.0,0.000146
538,2021,Trae Young,0.0,0.143433
539,2021,Cody Zeller,0.0,0.000024


In [41]:
# Non-contenders (actual Share == 0) that the model still rates above 0.1.
val_df_nocontenders[val_df_nocontenders['PredShare'] > 0.1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
453,2016,Hassan Whiteside,0.0,0.211265
66,2017,Jimmy Butler,0.0,0.161068
90,2017,DeMarcus Cousins,0.0,0.235432
124,2017,Joel Embiid,0.0,0.165856
156,2017,Rudy Gobert,0.0,0.163417
234,2017,Nikola Jokić,0.0,0.153462
351,2017,Chris Paul,0.0,0.210512
434,2017,Karl-Anthony Towns,0.0,0.223994
240,2018,Kyrie Irving,0.0,0.226911
484,2018,Karl-Anthony Towns,0.0,0.17359


In [43]:
# Feature importances of the best estimator, labelled by training column and sorted from
# most to least important.
top_features = pd.DataFrame(best_model.feature_importances_, index = X_train.columns, columns = ['Importance']).sort_values('Importance', ascending = False)

In [44]:
# Show only the features with a non-zero importance.
top_features[top_features['Importance'] > 0]

Unnamed: 0,Importance
WS,0.410439
PER,0.133610
VORP,0.105184
%W,0.074177
WS/48,0.014797
...,...
2P%_rank,0.000334
STL_pg_rank,0.000319
%GS,0.000244
ORB_pg_rank,0.000139
