In [1]:
import pandas as pd
import numpy as np
import os

from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

from sklearn_pandas import DataFrameMapper

import xgboost as xgb

from imblearn.over_sampling import RandomOverSampler, SMOTENC

import mlflow

In [2]:
# Folder named 'dataframes' under the current working directory; the input pickle is read
# from it and the logged DataFrame artifact is written to it.
dataframes_path = os.path.join(os.getcwd(), 'dataframes')

In [3]:
# Load the 'complete_stats.pkl' DataFrame.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle(os.path.join(dataframes_path, 'complete_stats.pkl'))

In [4]:
def check_rk_season(df, rk_season_pairs):
    """Flag the rows of ``df`` whose ``(Rk, Season)`` pair appears in ``rk_season_pairs``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``'Rk'`` and ``'Season'`` columns.
    rk_season_pairs : iterable of tuple
        ``(Rk, Season)`` pairs to look for; the pairs must be hashable.

    Returns
    -------
    pandas.Series
        Boolean mask sharing ``df``'s index, ``True`` where the row's pair is listed.
    """
    # A set gives constant-time membership tests; scanning a list once per row is quadratic.
    wanted_pairs = set(rk_season_pairs)
    # Zipping the two columns avoids building one Series per row (as apply(axis=1) does)
    # and still yields a proper empty boolean mask when ``df`` has no rows.
    mask = [pair in wanted_pairs for pair in zip(df['Rk'], df['Season'])]
    return pd.Series(mask, index=df.index, dtype=bool)

- Drop players with more than one team
- Index with tuple for Rk & Season
- Drop columns: Rk, GT, Votes, MaxVotes
- LabelEncode for Player, Pos and Team
- Separate types of columns

In [5]:
class DropPlayersMultiTeams(BaseEstimator, TransformerMixin):
    """Keep only the 'TOT' row for every (Rk, Season) pair that has one.

    Rows whose ``Tm`` is 'TOT' identify the (Rk, Season) pairs of interest; every
    other row sharing one of those pairs is removed.
    NOTE(review): 'TOT' presumably marks the combined line of a player who appeared
    for several teams in one season - confirm against the data source.
    """

    def __init__(self):
        pass

    def fit(self, X, y = None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X, y = None):
        """Return ``X`` without the per-team rows of 'TOT' players, with a fresh 0..n-1 index."""
        # (Rk, Season) pairs that own a 'TOT' row
        total_rows = X.loc[X['Tm'] == 'TOT']
        tot_keys = list(zip(total_rows['Rk'], total_rows['Season']))

        # every row belonging to one of those pairs, then only the non-'TOT' ones
        rows_of_tot_players = X[check_rk_season(X, tot_keys)]
        per_team_rows = rows_of_tot_players.loc[rows_of_tot_players['Tm'] != 'TOT']

        remaining = X.drop(per_team_rows.index)
        return remaining.reset_index(drop = True)

In [6]:
class DropColumns(BaseEstimator, TransformerMixin):
    """Transformer that removes a configured set of columns from a DataFrame."""

    def __init__(self, cols_to_drop):
        # column label(s) handed to DataFrame.drop in transform
        self.cols_to_drop = cols_to_drop

    def fit(self, X, y = None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X, y = None):
        """Return a new frame without ``cols_to_drop``."""
        trimmed = X.drop(self.cols_to_drop, axis = 'columns')
        return trimmed

In [7]:
class SetIndex(BaseEstimator, TransformerMixin):
    """Transformer that indexes a DataFrame by its ('Rk', 'Season') columns."""

    def __init__(self):
        pass

    def fit(self, X, y = None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X, y = None):
        """Return ``X`` indexed by ('Rk', 'Season'); both columns are also kept as data."""
        index_columns = ['Rk', 'Season']
        indexed = X.set_index(keys = index_columns, drop = False)
        return indexed

In [8]:
class OutlierFilter(BaseEstimator, TransformerMixin):
    '''
    Transformer that filters out outlier rows using np.quantile().

    Parameters
    ----------
    q : float
        Tail probability. For each filtered column only values strictly between
        the ``q`` and ``1 - q`` quantiles are kept.
    col_to_filter : list of str, or a single str
        Column(s) the quantile band is applied to.

    Attributes
    ----------
    index : pandas.Index
        Index of the rows kept by the most recent ``transform`` call.
    '''

    def __init__(self, q, col_to_filter):
        self.q = q
        self.col_to_filter = col_to_filter

    def fit(self, X, y = None):
        '''Stateless with respect to fitting: quantiles are computed inside transform.'''
        return self

    def transform(self, X, y = None):
        '''
        Keep the rows that lie inside the quantile band of EVERY filtered column.

        A row is removed as soon as it falls outside the band in any one of the
        columns: the per-column "not an outlier" masks are combined with a logical
        AND. The quantiles are computed on the frame passed to this call.

        Returns the filtered DataFrame; when no columns are configured, every row
        is kept.
        '''
        # Accept a single column name as well as a list of names.
        if isinstance(self.col_to_filter, str):
            columns = [self.col_to_filter]
        else:
            columns = list(self.col_to_filter)

        # Start from "keep everything" so an empty column list is a no-op
        # instead of an IndexError.
        keep = pd.Series(True, index = X.index)

        for col in columns:
            # both band limits in a single quantile call
            lower, upper = np.quantile(X[col], [self.q, 1 - self.q])
            keep &= (X[col] > lower) & (X[col] < upper)

        # filter the DataFrame
        X = X[keep]

        # Remember which rows survived; the information would otherwise be lost
        # once the filtered frame moves on to the next step.
        self.index = X.index

        return X

In [9]:
class DropPlayers(BaseEstimator, TransformerMixin):
    """Transformer that removes the 'Player' column, remembering it at fit time.

    After ``fit`` the one-column DataFrame of player names is available as
    ``players_list_``.
    """

    def __init__(self):
        # NOTE(review): this attribute is not read anywhere in the class; fit()
        # stores the names under `players_list_` (trailing underscore).
        self.players_list = []

    def fit(self, X, y = None):
        """Store the 'Player' column of ``X`` (as a one-column DataFrame) and return self."""
        player_column = X[['Player']]
        self.players_list_ = player_column
        return self

    def transform(self, X, y = None):
        """Return ``X`` without its 'Player' column."""
        return X.drop(columns = 'Player')

In [10]:
# Columns removed by the DropColumns pipeline step (this includes 'Tm' and 'Pos').
cols_to_drop = ['Rk', 'GT', 'Votes', 'MaxVotes', 'FG_tot', '3PA_tot', '2PA_tot', 'FGA_tot_rank', 'Tm', 'Pos']
# Columns on which OutlierFilter applies its quantile band.
cols_to_filter = ['PER', 'WS/48', 'BPM', 'USG%']
# Only referenced by the commented-out OneHotEncoder pipeline step.
cols_to_ohe = ['Pos', 'Tm']

In [11]:
# Preprocessing pipeline. Step order matters: DropPlayersMultiTeams reads 'Tm' and 'Rk'
# and SetIndex reads 'Rk', so all of them must run before DropColumns removes those columns.
pipe = Pipeline(steps = [
    ('DropPlayersMultiTeams', DropPlayersMultiTeams()),
    ('OutlierFilter', OutlierFilter(q = .0005, col_to_filter = cols_to_filter)),
    ('SetIndex', SetIndex()),
    ('DropColumns', DropColumns(cols_to_drop)),
    ('DropPlayers', DropPlayers()),
    # One-hot encoding of cols_to_ohe is currently disabled (those columns are dropped above):
    # ('OneHotEncoder', DataFrameMapper([(cols_to_ohe, OneHotEncoder(drop = 'if_binary'), {'alias': 'Pos_Tm'})], input_df=True, df_out=True, default = None))
])

In [12]:
# Run the preprocessing pipeline. `df` is rebound to the transformed frame, so re-running
# this cell without re-loading the pickle feeds the pipeline a frame that no longer has
# the 'Tm' column its first step reads.
df = pipe.fit_transform(df)

In [13]:
# Preview the first rows of the preprocessed frame (indexed by Rk, Season).
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,G,GS,MP_pg,FG_pg,FGA_pg,FG%,3P_pg,3PA_pg,3P%,...,DWS_rank,WS_rank,WS/48_rank,OBPM_rank,DBPM_rank,BPM_rank,VORP_rank,%W_rank,%GS,Share
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1980,32,82,-10,38.3,10.2,16.9,0.604,0.0,0.0,0.0,...,3,1,1,5,7,2,1,2,-1.0,0.665
2,1980,25,67,-10,18.2,2.3,4.7,0.481,0.0,0.0,0.0,...,44,71,83,50,32,54,41,44,-1.0,0.0
3,1980,25,75,-10,28.9,6.2,11.7,0.531,0.0,0.0,0.0,...,13,27,21,23,11,12,12,6,-1.0,0.0
4,1980,31,80,80,35.8,4.8,9.9,0.482,0.1,0.2,0.222,...,23,14,25,26,33,31,20,1,1.0,0.009
5,1980,31,26,-10,21.5,1.0,2.3,0.45,0.0,0.0,0.0,...,47,85,109,63,21,56,42,38,-1.0,0.0


In [14]:
# Fraction of rows from seasons after 2015, i.e. the share of data held out as validation below.
len(df[df['Season'] > 2015]) / len(df)

0.17507805847289243

In [15]:
# Time-based split: seasons after 2015 form the validation set, the rest the development set.
val_df = df[df['Season'] > 2015]
dev_df = df[df['Season'] <= 2015]

In [16]:
def oversample(df, os_param = 'ros', sampling_strategy = .3, random_state = 23):
    """Oversample the rows of ``df`` that have a positive ``Share``.

    A temporary binary label (1 when ``Share > 0``, else 0) drives the sampler;
    the label itself is not part of the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``'Share'`` column. It is not modified.
    os_param : str, default 'ros'
        ``'smote'`` selects ``SMOTENC`` (every integer-typed column is declared
        categorical); any other value selects ``RandomOverSampler``.
    sampling_strategy : float, default .3
        Passed through to the sampler.
    random_state : int, default 23
        Seed passed through to the sampler.

    Returns
    -------
    tuple
        ``(resampled_frame, sampling_strategy)``.
        NOTE(review): the resampled frame presumably carries a fresh index rather
        than ``df``'s original one - confirm against the imbalanced-learn docs.
    """
    # Feature frame: everything except a pre-existing 'Contender' column, if any.
    features = df.drop(columns = 'Contender', errors = 'ignore')
    # Binary label used only to steer the sampler.
    contender = ((features['Share'] > 0) * 1).to_frame('Contender')

    if os_param == 'smote':
        int_cols = features.select_dtypes('int').columns
        cat_index = [features.columns.get_loc(column) for column in int_cols]
        os_technique = SMOTENC(
            sampling_strategy = sampling_strategy,
            random_state = random_state,
            categorical_features = cat_index,
        )
    else:
        os_technique = RandomOverSampler(sampling_strategy = sampling_strategy, random_state = random_state)

    # Only the resampled features are needed, so the resampled label is discarded
    # instead of being concatenated back and dropped again.
    X_resampled, _ = os_technique.fit_resample(features, contender)

    return X_resampled, sampling_strategy

In [17]:
# Oversample only the development set (the validation set is left untouched) and keep the
# ratio for experiment logging. `dev_df` is rebound here, so the cell is not idempotent.
dev_df, sampling_ratio = oversample(dev_df, os_param = 'smote', sampling_strategy = .25)

In [18]:
# Separate features from the regression target ('Share') for the development set...
dev_df_X = dev_df.drop('Share', axis=1)
dev_df_y = dev_df[['Share']]

# ...and for the validation set.
val_df_X = val_df.drop('Share', axis=1)
val_df_y = val_df[['Share']]

In [19]:
# Random 80/20 train/test split of the (oversampled) development set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
                                        dev_df_X,
                                        dev_df_y,
                                        test_size = 0.2,
                                        random_state = 23
                                     )

In [20]:
# Features and true targets keyed by split name; the two dicts share the same keys so that
# predictions and targets can be paired per split.
datasets = {'train': X_train, 'test': X_test, 'val': val_df_X}
real_targets = {'train': y_train, 'test': y_test, 'val': val_df_y}

In [21]:
# Select the MLflow experiment under which the runs below are recorded.
mlflow.set_experiment('NBA MVP prediction model')

<Experiment: artifact_location='file:///C:/Users/marcr/OneDrive/Documents/Data%20Science/NBA%20Project/mlruns/0', experiment_id='0', lifecycle_stage='active', name='NBA MVP prediction model', tags={}>

In [22]:
def eval_metrics(actual, predicted):
    """Compute regression metrics for one set of predictions.

    Returns a dict with keys 'rmse', 'mae' and 'r2'.
    """
    mse = metrics.mean_squared_error(actual, predicted)
    return {
        'rmse': np.sqrt(mse),
        'mae': metrics.mean_absolute_error(actual, predicted),
        'r2': metrics.r2_score(actual, predicted),
    }

In [23]:
def retrieve_best(grid_object):
    """Extract the winning estimator, its parameters and its CV score from a fitted search.

    Parameters
    ----------
    grid_object : fitted search object
        Must expose ``best_estimator_``, ``best_params_`` and ``best_score_``.

    Returns
    -------
    tuple
        ``(best_model, best_params, best_cv_score)``. ``best_params`` is a copy of
        ``grid_object.best_params_`` with an extra ``'best_ntree_limit'`` entry
        (``None`` when the estimator exposes no such information).
    """
    best_model = grid_object.best_estimator_
    # Work on a copy so the extra key does not leak into the search object's own dict.
    best_params = dict(grid_object.best_params_)
    best_cv_score = grid_object.best_score_

    ntree_limit = getattr(best_model, 'best_ntree_limit', None)
    if ntree_limit is None:
        # NOTE(review): newer XGBoost releases appear to expose `best_iteration` instead of
        # `best_ntree_limit`; the +1 turns the 0-based iteration into a tree count and
        # assumes one tree per boosting round - confirm against the installed version.
        best_iteration = getattr(best_model, 'best_iteration', None)
        if best_iteration is not None:
            ntree_limit = best_iteration + 1

    best_params['best_ntree_limit'] = ntree_limit
    return best_model, best_params, best_cv_score

In [24]:
def predict_model(model, datasets):
    """Predict every dataset in ``datasets`` with ``model``.

    Returns a dict with the same keys as ``datasets``; each value is a Series named
    'PredShare' that carries the index of the corresponding dataset.
    """
    return {
        split_name: pd.Series(model.predict(features), index = features.index, name = 'PredShare')
        for split_name, features in datasets.items()
    }

In [25]:
def log_params_mlflow_xgb(params, sampling_ratio):
    """Log the XGBoost hyper-parameters in ``params`` and the oversampling ratio to the active MLflow run."""
    logged_in_order = (
        ('max_depth', params),
        ('colsample_bytree', params),
        ('learning_rate', params),
        ('n_estimators', params),
        ('best_ntree_limit', params),
        ('subsample', params),
    )
    for key, source in logged_in_order:
        mlflow.log_param(key, source[key])
    mlflow.log_param('sampling_ratio', sampling_ratio)

In [26]:
def log_params_mlflow_rf(params, sampling_ratio):
    """Log random-forest hyper-parameters and the oversampling ratio to the active MLflow run.

    Parameters
    ----------
    params : dict
        Best parameters of the search. Each of ``max_depth``, ``max_features``,
        ``min_samples_split`` and ``n_estimators`` may be keyed either by its plain
        name or with a ``model__`` prefix (as produced when the estimator is a
        pipeline step called 'model').
    sampling_ratio : float
        Oversampling ratio used to build the training data.

    Raises
    ------
    KeyError
        If a parameter is present under neither spelling.
    """
    def _lookup(name):
        # Prefer the plain key and fall back to the pipeline-prefixed one. The original
        # code mixed both spellings, which failed for any consistently keyed grid.
        if name in params:
            return params[name]
        return params['model__' + name]

    for name in ('max_depth', 'max_features', 'min_samples_split', 'n_estimators'):
        mlflow.log_param(name, _lookup(name))
    mlflow.log_param('sampling_ratio', sampling_ratio)

In [27]:
def get_metrics(targets_real, targets_predicted):
    """Evaluate the 'train', 'test' and 'val' splits.

    Both arguments are mappings keyed by split name. Returns a 3-tuple of metric
    dicts in the order (train, test, val).
    """
    train_metrics, test_metrics, val_metrics = (
        eval_metrics(targets_real[split], targets_predicted[split])
        for split in ('train', 'test', 'val')
    )
    return train_metrics, test_metrics, val_metrics

In [28]:
def log_metrics_mlflow(targets_real, targets_predicted, cv_score):
    """Log RMSE and R2 of every split, plus the absolute CV score as 'rmse_cv', to the active MLflow run."""
    train_metrics, test_metrics, val_metrics = get_metrics(targets_real, targets_predicted)

    # (metric name in MLflow, metrics dict it comes from, key inside that dict)
    to_log = (
        ('rmse_train', train_metrics, 'rmse'),
        ('r2_train', train_metrics, 'r2'),
        ('rmse_test', test_metrics, 'rmse'),
        ('r2_test', test_metrics, 'r2'),
        ('rmse_val', val_metrics, 'rmse'),
        ('r2_val', val_metrics, 'r2'),
    )
    for metric_name, split_metrics, key in to_log:
        mlflow.log_metric(metric_name, split_metrics[key])

    # the CV score is passed through abs() before logging
    mlflow.log_metric('rmse_cv', abs(cv_score))

In [29]:
def log_model_mlflow(model):
    """Log ``model`` to the active MLflow run under the name 'model', using MLflow's sklearn flavor."""
    mlflow.sklearn.log_model(model, 'model')

In [30]:
def log_df_mlflow(df, path):
    """Pickle ``df`` as 'entry_dataframe.pkl' inside ``path`` and attach that file to the active MLflow run."""
    pickle_file = os.path.join(path, 'entry_dataframe.pkl')
    df.to_pickle(pickle_file)
    mlflow.log_artifact(pickle_file)

In [31]:
def display_metrics(targets_real, targets_predicted, cv_score):
    """Build a summary table of RMSE / MAE / R2 per split.

    Columns are 'Train', 'CV', 'Test' and 'Validation'; rows are 'RMSE', 'MAE' and
    'R2'. The 'CV' column only has an RMSE value (the sign-flipped ``cv_score``).
    """
    train_metrics, test_metrics, val_metrics = get_metrics(targets_real, targets_predicted)

    def as_column(split_metrics):
        # same order as the row labels below
        return [split_metrics['rmse'], split_metrics['mae'], split_metrics['r2']]

    table = {
        'Train': as_column(train_metrics),
        'CV': [cv_score * -1, np.nan, np.nan],
        'Test': as_column(test_metrics),
        'Validation': as_column(val_metrics),
    }
    row_labels = ['RMSE', 'MAE', 'R2']

    return pd.DataFrame(table, index = row_labels)

In [32]:
def get_advanced_metrics(y_real, y_predict):
    """Compute error metrics separately for contenders and non-contenders.

    Parameters
    ----------
    y_real : pandas.DataFrame
        True targets with a 'Share' column.
    y_predict : pandas.Series
        Predictions named 'PredShare', aligned on the same index as ``y_real``.

    Returns
    -------
    tuple of float
        ``(rmse_contenders, mae_no_contenders)``: RMSE over rows with ``Share > 0``
        and MAE over rows with ``Share == 0``. A group with no rows yields NaN.
    """
    results = pd.concat([y_real, y_predict], axis = 1)

    results_contenders = results[results['Share'] > 0]
    results_no_contenders = results[results['Share'] == 0]

    # The metric functions are only called on non-empty groups; an empty group reports NaN.
    if len(results_contenders):
        rmse_contenders = metrics.mean_squared_error(
            results_contenders['Share'], results_contenders['PredShare']
        ) ** .5
    else:
        rmse_contenders = np.nan

    # MAE is already expressed in the target's units: no square root (unlike the RMSE above).
    if len(results_no_contenders):
        mae_no_contenders = metrics.mean_absolute_error(
            results_no_contenders['Share'], results_no_contenders['PredShare']
        )
    else:
        mae_no_contenders = np.nan

    return rmse_contenders, mae_no_contenders

In [33]:
def log_advanced_metrics_mlflow(y_real, y_predict):
    """Log the contender / non-contender metrics to the active MLflow run as 'rmse_cont' and 'mae_no_cont'."""
    contenders_rmse, no_contenders_mae = get_advanced_metrics(y_real, y_predict)
    for metric_name, value in (('rmse_cont', contenders_rmse), ('mae_no_cont', no_contenders_mae)):
        mlflow.log_metric(metric_name, value)

In [34]:
# Hyper-parameter grid for the XGBoost search: only max_depth and subsample vary
# (3 x 3 = 9 candidates); the remaining entries are fixed to a single value.
param_grid = {
    'max_depth': [6, 8, 10], 
    'colsample_bytree': [.5], 
    'learning_rate': [.1],
    'subsample': [1, .8, .6],
    'n_estimators': [1000]
    }

In [35]:
# Extra keyword arguments forwarded to the estimator's fit() by GridSearchCV.
# NOTE(review): early stopping is monitored on (X_test, y_test), the same split later
# reported as 'Test', so the test metrics are optimistic - consider a separate
# early-stopping set.
fit_params = {
    "early_stopping_rounds":20, 
    "eval_metric" : "rmse", 
    "eval_set" : [[X_test, y_test]]
}

In [36]:
# One MLflow run: grid-search an XGBoost regressor, then log parameters, metrics,
# the fitted model and the preprocessed DataFrame.
with mlflow.start_run():
    model = xgb.XGBRegressor(random_state = 23, n_estimators = 1000)

    # 10-fold CV over param_grid, scored by negative RMSE (higher is better)
    grid_model = GridSearchCV(model, param_grid, scoring = 'neg_root_mean_squared_error', n_jobs = -1, cv = 10)

    # NOTE(review): passing early_stopping_rounds / eval_metric through fit() depends on the
    # installed XGBoost version - confirm it is still accepted there.
    grid_model.fit(X_train, y_train, **fit_params)

    best_model, best_params, best_cv_score = retrieve_best(grid_model)

    # predictions for the train / test / val splits
    predicted_targets = predict_model(best_model, datasets)

    log_params_mlflow_xgb(best_params, sampling_ratio)

    log_metrics_mlflow(real_targets, predicted_targets, best_cv_score)

    # contender / non-contender breakdown on the validation split only
    log_advanced_metrics_mlflow(real_targets['val'], predicted_targets['val'])

    log_model_mlflow(best_model)

    # `df` is the output of the preprocessing pipeline (before the dev/val split and oversampling)
    log_df_mlflow(df, dataframes_path)

[0]	validation_0-rmse:0.43678
[1]	validation_0-rmse:0.39370
[2]	validation_0-rmse:0.35507
[3]	validation_0-rmse:0.32026
[4]	validation_0-rmse:0.28897
[5]	validation_0-rmse:0.26097
[6]	validation_0-rmse:0.23569
[7]	validation_0-rmse:0.21295
[8]	validation_0-rmse:0.19279
[9]	validation_0-rmse:0.17453
[10]	validation_0-rmse:0.15815
[11]	validation_0-rmse:0.14348
[12]	validation_0-rmse:0.13044
[13]	validation_0-rmse:0.11875
[14]	validation_0-rmse:0.10829
[15]	validation_0-rmse:0.09906
[16]	validation_0-rmse:0.09085
[17]	validation_0-rmse:0.08349
[18]	validation_0-rmse:0.07697
[19]	validation_0-rmse:0.07131
[20]	validation_0-rmse:0.06641
[21]	validation_0-rmse:0.06198
[22]	validation_0-rmse:0.05819
[23]	validation_0-rmse:0.05484
[24]	validation_0-rmse:0.05196
[25]	validation_0-rmse:0.04947
[26]	validation_0-rmse:0.04747
[27]	validation_0-rmse:0.04572
[28]	validation_0-rmse:0.04417
[29]	validation_0-rmse:0.04262
[30]	validation_0-rmse:0.04145
[31]	validation_0-rmse:0.04038
[32]	validation_0-



In [37]:
# Inspect the tree limit recorded on the refitted best estimator.
grid_model.best_estimator_.best_ntree_limit

591

In [38]:
# Summary table of RMSE / MAE / R2 for the train, CV, test and validation splits.
display_metrics(real_targets, predicted_targets, best_cv_score)

Unnamed: 0,Train,CV,Test,Validation
RMSE,0.000665,0.030058,0.030219,0.0331
MAE,0.000318,,0.008304,0.005636
R2,0.99997,,0.932534,0.657858


In [39]:
def get_val_results(real_val_y, pred_val_y, players_series, season_cutoff = 2015):
    """Join player names with true and predicted shares for the validation seasons.

    Parameters
    ----------
    real_val_y : pandas.DataFrame
        True targets with a 'Share' column.
    pred_val_y : pandas.Series
        Predictions, aligned on the same index as ``real_val_y``.
    players_series : pandas.DataFrame or pandas.Series
        Player names on a two-level index whose second level is the season.
    season_cutoff : int, default 2015
        Only players from seasons strictly greater than this value are kept; it
        should match the cutoff used to build the validation set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(contenders, no_contenders)``: the joined rows with ``Share > 0`` and
        with ``Share == 0`` respectively.
    """
    # level 1 of the index holds the season
    seasons = players_series.index.get_level_values(1)
    players_series_val = players_series[seasons > season_cutoff]

    # column-wise join on the shared index
    results_val = pd.concat([players_series_val, real_val_y, pred_val_y], axis = 1)

    results_val_contenders = results_val[results_val['Share'] > 0]
    results_val_no_contenders = results_val[results_val['Share'] == 0]

    return results_val_contenders, results_val_no_contenders

In [40]:
def display_val_results(results_val_contenders, results_val_no_contenders):
    """Display the validation results season by season.

    For each season (in ascending order) the contenders are shown sorted by true
    'Share', highest first. Afterwards the non-contenders that received a positive
    'PredShare' are shown.

    Relies on ``display`` being available, as it is inside a Jupyter/IPython session.
    """
    print('Contenders Results:')
    # Sorted so that seasons always appear chronologically; iterating a bare set
    # gives no ordering guarantee.
    seasons = sorted(set(results_val_contenders.index.get_level_values(1)))
    for season in seasons:
        season_rows = results_val_contenders.loc[pd.IndexSlice[:, season], :]
        display(season_rows.sort_values(by = 'Share', ascending = False))

    print('No contenders results:')
    display(results_val_no_contenders[results_val_no_contenders['PredShare'] > 0])

In [41]:
# Join the validation targets and predictions with the player names stored by the
# fitted DropPlayers pipeline step.
contenders_df, no_contenders_df = get_val_results(real_targets['val'], predicted_targets['val'], pipe['DropPlayers'].players_list_)

In [42]:
# Show the per-season contender tables and the non-contenders with a positive predicted share.
display_val_results(contenders_df, no_contenders_df)

Contenders Results:


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
105,2016,Stephen Curry,1.0,0.964236
263,2016,Kawhi Leonard,0.484,0.33287
222,2016,LeBron James,0.482,0.432793
452,2016,Russell Westbrook,0.371,0.191129
126,2016,Kevin Durant,0.112,0.605469
348,2016,Chris Paul,0.082,0.16442
167,2016,Draymond Green,0.038,0.053914
266,2016,Damian Lillard,0.02,0.033292
178,2016,James Harden,0.007,0.324091
273,2016,Kyle Lowry,0.005,0.129491


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
458,2017,Russell Westbrook,0.879,0.559861
173,2017,James Harden,0.746,0.467868
261,2017,Kawhi Leonard,0.495,0.500365
221,2017,LeBron James,0.33,0.34495
425,2017,Isaiah Thomas,0.08,0.338275
98,2017,Stephen Curry,0.051,0.460013
16,2017,Giannis Antetokounmpo,0.007,0.090593
453,2017,John Wall,0.007,0.096531
100,2017,Anthony Davis,0.002,0.190594
119,2017,Kevin Durant,0.002,0.450008


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
195,2018,James Harden,0.955,0.867224
249,2018,LeBron James,0.731,0.43564
122,2018,Anthony Davis,0.441,0.222637
298,2018,Damian Lillard,0.205,0.228861
508,2018,Russell Westbrook,0.075,0.25822
16,2018,Giannis Antetokounmpo,0.074,0.162859
143,2018,Kevin Durant,0.065,0.367809
130,2018,DeMar DeRozan,0.032,0.035444
7,2018,LaMarcus Aldridge,0.006,0.103441
79,2018,Jimmy Butler,0.005,0.02728


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18,2019,Giannis Antetokounmpo,0.932,0.619208
207,2019,James Harden,0.768,0.583388
184,2019,Paul George,0.352,0.141576
270,2019,Nikola Jokić,0.21,0.176943
124,2019,Stephen Curry,0.173,0.245602
305,2019,Damian Lillard,0.068,0.196572
155,2019,Joel Embiid,0.049,0.191379
150,2019,Kevin Durant,0.025,0.361476
301,2019,Kawhi Leonard,0.013,0.234243
506,2019,Russell Westbrook,0.008,0.12492


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13,2020,Giannis Antetokounmpo,0.952,0.668879
251,2020,LeBron James,0.746,0.519328
199,2020,James Harden,0.363,0.465355
134,2020,Luka Dončić,0.198,0.364993
294,2020,Kawhi Leonard,0.166,0.307436
123,2020,Anthony Davis,0.081,0.496
396,2020,Chris Paul,0.026,-0.001702
297,2020,Damian Lillard,0.023,0.283165
262,2020,Nikola Jokić,0.018,0.147684
449,2020,Pascal Siakam,0.017,0.014858


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
263,2021,Nikola Jokić,0.961,0.520558
146,2021,Joel Embiid,0.58,0.167409
116,2021,Stephen Curry,0.449,0.358516
12,2021,Giannis Antetokounmpo,0.345,0.377677
396,2021,Chris Paul,0.138,0.025152
130,2021,Luka Dončić,0.042,0.395176
297,2021,Damian Lillard,0.038,0.364448
421,2021,Julius Randle,0.02,0.065629
439,2021,Derrick Rose,0.01,-9.8e-05
180,2021,Rudy Gobert,0.008,0.114774


No contenders results:


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,2016,Jordan Adams,0.0,0.012304
4,2016,Arron Afflalo,0.0,0.000032
5,2016,Alexis Ajinça,0.0,0.000017
7,2016,LaMarcus Aldridge,0.0,0.022903
8,2016,Cliff Alexander,0.0,0.000018
...,...,...,...,...
535,2021,Robert Woodard II,0.0,0.000050
536,2021,Delon Wright,0.0,0.000023
538,2021,Trae Young,0.0,0.058910
539,2021,Cody Zeller,0.0,0.000222


In [43]:
# Non-contenders (true Share == 0) whose predicted share exceeds 0.1.
no_contenders_df[no_contenders_df['PredShare'] > 0.1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
97,2016,DeMarcus Cousins,0.0,0.100982
453,2016,Hassan Whiteside,0.0,0.116006
156,2017,Rudy Gobert,0.0,0.115771
266,2017,Damian Lillard,0.0,0.130551
434,2017,Karl-Anthony Towns,0.0,0.136015
240,2018,Kyrie Irving,0.0,0.173408
525,2020,Trae Young,0.0,0.155214
38,2021,Bradley Beal,0.0,0.122247
58,2021,Devin Booker,0.0,0.106074
140,2021,Kevin Durant,0.0,0.142349


In [44]:
# Feature importances of the best model, labelled with the training columns and sorted
# from most to least important.
top_features = pd.DataFrame(best_model.feature_importances_, index = X_train.columns, columns = ['Importance']).sort_values('Importance', ascending = False)

In [45]:
# Keep only the features whose importance is above 0.005.
top_features[top_features['Importance'] > 0.005]

Unnamed: 0,Importance
WS,0.288776
PER,0.148157
VORP,0.063477
WS/48,0.058956
%W,0.046761
PTS_pg,0.026245
FGA_tot,0.017838
BPM,0.014587
PTS_tot_rank,0.014569
TOV_pg,0.011319
