In [1]:
import pandas as pd
import numpy as np
import os

from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

from sklearn_pandas import DataFrameMapper

import xgboost as xgb

from imblearn.over_sampling import RandomOverSampler, SMOTENC

import mlflow

In [2]:
# Folder that holds the pickled dataframes. It is resolved against the current
# working directory, so the value depends on where the kernel was started.
dataframes_path = os.path.join(os.getcwd(), 'dataframes')

In [3]:
# Load the complete stats table.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
df = pd.read_pickle(os.path.join(dataframes_path, 'complete_stats.pkl'))

In [4]:
def check_rk_season(df, rk_season_pairs):
    """Flag the rows of ``df`` whose ``(Rk, Season)`` pair is in ``rk_season_pairs``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'Rk'`` and ``'Season'`` columns.
    rk_season_pairs : iterable of tuple
        ``(Rk, Season)`` pairs to look for. The pairs must be hashable.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with ``df.index`` (an empty Series for an empty frame).
    """
    # A set gives constant-time membership tests instead of scanning a list per row.
    wanted = set(rk_season_pairs)
    # Zipping the two columns avoids the slow row-wise ``DataFrame.apply`` and
    # always produces a Series, even when ``df`` has no rows.
    flags = [pair in wanted for pair in zip(df['Rk'], df['Season'])]
    return pd.Series(flags, index = df.index, dtype = bool)

- Drop the per-team rows of players who played for more than one team in a season (keep only their `TOT` row)
- Index by the (Rk, Season) tuple
- Drop columns: Rk, GT, Votes, MaxVotes
- LabelEncode for Player, Pos and Team
- Separate types of columns

In [5]:
class DropPlayersMultiTeams(BaseEstimator, TransformerMixin):
    """Keep a single row per player-season for players with a 'TOT' row.

    For every ``(Rk, Season)`` pair that has a row with ``Tm == 'TOT'``, the
    other rows sharing that pair are removed, so only the 'TOT' row remains.
    Rows whose pair has no 'TOT' row are left untouched.
    """

    def __init__(self):
        pass

    def fit(self, X, y = None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, X, y = None):
        """Return ``X`` without the per-team rows, with a fresh RangeIndex."""
        is_total = X['Tm'] == 'TOT'
        df_tot = X[is_total]

        # No 'TOT' rows (this also covers an empty frame): nothing to remove.
        if df_tot.empty:
            return X.reset_index(drop = True)

        rk_season_pairs = list(zip(df_tot['Rk'], df_tot['Season']))
        in_tot_season = check_rk_season(X, rk_season_pairs)

        # Select rows by position instead of by index label: dropping by label
        # would also delete unrelated rows when the incoming index contains
        # duplicate labels.
        per_team_row = in_tot_season.to_numpy(dtype = bool) & ~is_total.to_numpy(dtype = bool)
        return X.loc[~per_team_row].reset_index(drop = True)

In [6]:
class DropColumns(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed list of columns from a DataFrame.

    Parameters
    ----------
    cols_to_drop : list of str
        Column labels removed by ``transform``.
    """

    def __init__(self, cols_to_drop):
        self.cols_to_drop = cols_to_drop

    def fit(self, X, y = None):
        # Nothing to learn; present only to satisfy the transformer protocol.
        return self

    def transform(self, X, y = None):
        """Return a copy of ``X`` without the configured columns."""
        trimmed = X.drop(labels = self.cols_to_drop, axis = 1)
        return trimmed

In [7]:
class SetIndex(BaseEstimator, TransformerMixin):
    """Transformer that indexes the frame by ``('Rk', 'Season')``.

    Both columns are also kept as regular columns (``drop = False``).
    """

    def __init__(self):
        pass

    def fit(self, X, y = None):
        # Nothing to learn; present only to satisfy the transformer protocol.
        return self

    def transform(self, X, y = None):
        """Return ``X`` with a two-level ``(Rk, Season)`` index."""
        index_keys = ['Rk', 'Season']
        indexed = X.set_index(index_keys, drop = False)
        return indexed

In [8]:
class OutlierFilter(BaseEstimator, TransformerMixin):
    '''
    Row filter that removes extreme values using np.quantile().

    Parameters
    ----------
    q : float
        Lower-tail quantile; the upper cut is taken at ``1 - q``.
    col_to_filter : list of str
        Columns on which the quantile bounds are evaluated.

    Attributes
    ----------
    index : pandas.Index
        Index of the rows kept by the most recent ``transform`` call.
    '''

    def __init__(self, q, col_to_filter):
        self.q = q
        self.col_to_filter = col_to_filter

    def fit(self, X, y = None):
        # The bounds are computed inside transform() from the data it receives.
        return self

    def transform(self, X, y = None):
        '''
        Keep only the rows that lie strictly between the ``q`` and ``1 - q``
        quantiles in every column of ``col_to_filter``.

        A row is therefore removed as soon as it is extreme in any one of the
        listed columns. Values exactly equal to a bound are removed as well
        (strict comparisons). When ``col_to_filter`` is empty, ``X`` is
        returned unfiltered.
        '''
        inlier_masks = []

        # One boolean mask per column: True where the value is inside the bounds.
        for col in self.col_to_filter:
            lower = np.quantile(X[col], q = self.q)
            upper = np.quantile(X[col], q = 1 - self.q)
            inlier_masks.append(((X[col] > lower) & (X[col] < upper)).to_numpy())

        # A row survives only if it is an inlier in all columns.
        if inlier_masks:
            X = X[np.logical_and.reduce(inlier_masks)]

        # Remember which rows were kept so callers can align other data with them.
        self.index = X.index

        return X

In [9]:
class DropPlayers(BaseEstimator, TransformerMixin):
    """Transformer that removes the 'Player' column.

    Attributes
    ----------
    players_list_ : pandas.DataFrame
        Single-column frame with the 'Player' values seen by ``fit``, carrying
        the index of the frame it was fitted on.
    """

    def __init__(self):
        self.players_list = []

    def fit(self, X, y = None):
        # Keep the names so they can be joined back to predictions later.
        self.players_list_ = X.loc[:, ['Player']]
        return self

    def transform(self, X, y = None):
        """Return ``X`` without the 'Player' column."""
        return X.drop(columns = 'Player')

In [10]:
# Columns removed by the DropColumns pipeline step.
cols_to_drop = ['Rk', 'GT', 'Votes', 'MaxVotes', 'FG_tot', '3PA_tot', '2PA_tot', 'FGA_tot_rank', 'Tm', 'Pos']
# Columns on which OutlierFilter evaluates its quantile bounds.
cols_to_filter = ['PER', 'WS/48', 'BPM', 'USG%']
# Only referenced by the commented-out OneHotEncoder step; both columns are
# also listed in cols_to_drop above.
cols_to_ohe = ['Pos', 'Tm']

In [11]:
# Preprocessing pipeline. Step order matters: DropPlayersMultiTeams and SetIndex
# read 'Rk', 'Season' and 'Tm', so they must run before DropColumns removes
# 'Rk' and 'Tm'. DropPlayers runs after SetIndex, so the names it stores are
# indexed by (Rk, Season).
pipe = Pipeline(steps = [
    ('DropPlayersMultiTeams', DropPlayersMultiTeams()),
    ('OutlierFilter', OutlierFilter(q = .0005, col_to_filter = cols_to_filter)),
    ('SetIndex', SetIndex()),
    ('DropColumns', DropColumns(cols_to_drop)),
    ('DropPlayers', DropPlayers()),
    # ('OneHotEncoder', DataFrameMapper([(cols_to_ohe, OneHotEncoder(drop = 'if_binary'), {'alias': 'Pos_Tm'})], input_df=True, df_out=True, default = None))
])

In [12]:
# Overwrites the raw frame with the preprocessed one. This cell is not
# idempotent: running it again without reloading `df` fails, because the first
# pipeline step needs the 'Tm' column that DropColumns has already removed.
df = pipe.fit_transform(df)

In [13]:
# Preview the first rows of the preprocessed frame.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,G,GS,MP_pg,FG_pg,FGA_pg,FG%,3P_pg,3PA_pg,3P%,...,DWS_rank,WS_rank,WS/48_rank,OBPM_rank,DBPM_rank,BPM_rank,VORP_rank,%W_rank,%GS,Share
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1980,32,82,-10,38.3,10.2,16.9,0.604,0.0,0.0,0.0,...,3,1,1,5,7,2,1,2,-1.0,0.665
2,1980,25,67,-10,18.2,2.3,4.7,0.481,0.0,0.0,0.0,...,44,71,83,50,32,54,41,44,-1.0,0.0
3,1980,25,75,-10,28.9,6.2,11.7,0.531,0.0,0.0,0.0,...,13,27,21,23,11,12,12,6,-1.0,0.0
4,1980,31,80,80,35.8,4.8,9.9,0.482,0.1,0.2,0.222,...,23,14,25,26,33,31,20,1,1.0,0.009
5,1980,31,26,-10,21.5,1.0,2.3,0.45,0.0,0.0,0.0,...,47,85,109,63,21,56,42,38,-1.0,0.0


In [14]:
# Fraction of rows belonging to seasons after 2015 (the hold-out seasons split off below).
len(df[df['Season'] > 2015]) / len(df)

0.17507805847289243

In [46]:
# Temporal split: seasons after 2015 are held out for validation,
# everything up to and including 2015 is used for development.
val_df = df[df['Season'] > 2015]
dev_df = df[df['Season'] <= 2015]

In [47]:
def oversample(df, os_param = 'ros', sampling_strategy = .3, random_state = 23):
    """Oversample the rows with a positive 'Share' ("contenders").

    A binary label (``Share > 0``) is built only to drive the resampling; it is
    not part of the returned frame. 'Share' itself is resampled together with
    the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Share' column.
    os_param : str, default 'ros'
        ``'smote'`` selects SMOTENC, treating every integer-typed column as
        categorical; any other value selects RandomOverSampler.
    sampling_strategy : float, default .3
        Passed through to the sampler.
    random_state : int, default 23
        Seed passed to the sampler.

    Returns
    -------
    tuple
        ``(resampled_frame, sampling_strategy)``.
    """
    # Guard against a stale label column so it is never resampled as a feature.
    features = df.drop(columns = 'Contender', errors = 'ignore')

    # 1-D label: 1 for rows with a positive share, 0 otherwise.
    is_contender = (features['Share'] > 0).astype(int).rename('Contender')

    if os_param == 'smote':
        int_cols = features.select_dtypes('int').columns
        cat_index = [features.columns.get_loc(column) for column in int_cols]
        os_technique = SMOTENC(sampling_strategy = sampling_strategy, random_state = random_state, categorical_features = cat_index)
    else:
        os_technique = RandomOverSampler(sampling_strategy = sampling_strategy, random_state = random_state)

    # Only the resampled features are needed; the resampled label is discarded.
    X_resampled, _ = os_technique.fit_resample(features, is_contender)

    return X_resampled, sampling_strategy

In [48]:
# Oversample the contenders in the development set (SMOTENC, ratio .25).
# NOTE(review): this runs before train_test_split below, so the test split can
# contain synthetic rows generated from rows that end up in the train split;
# test metrics are therefore likely optimistic. Consider splitting first and
# oversampling only the training part.
dev_df, sampling_ratio = oversample(dev_df, os_param = 'smote', sampling_strategy = .25)

In [49]:
# Separate features from the 'Share' target. The targets are kept as
# single-column DataFrames rather than Series.
dev_df_X = dev_df.drop('Share', axis=1)
dev_df_y = dev_df[['Share']]

val_df_X = val_df.drop('Share', axis=1)
val_df_y = val_df[['Share']]

In [50]:
# Random 80/20 split of the (already oversampled) development set.
X_train, X_test, y_train, y_test = train_test_split(
                                        dev_df_X,
                                        dev_df_y,
                                        test_size = 0.2,
                                        random_state = 23
                                     )

In [51]:
# Feature frames and true targets keyed by split name; the helper functions
# below look up the 'train', 'test' and 'val' keys.
datasets = {'train': X_train, 'test': X_test, 'val': val_df_X}
real_targets = {'train': y_train, 'test': y_test, 'val': val_df_y}

In [52]:
# Select the MLflow experiment that the runs below are recorded under.
mlflow.set_experiment('NBA MVP prediction model')

<Experiment: artifact_location='file:///C:/Users/marcr/OneDrive/Documents/Data%20Science/NBA%20Project/mlruns/0', experiment_id='0', lifecycle_stage='active', name='NBA MVP prediction model', tags={}>

In [53]:
def eval_metrics(actual, predicted):
    """Compute regression metrics for one set of predictions.

    Parameters
    ----------
    actual, predicted : array-like
        True and predicted target values.

    Returns
    -------
    dict
        Keys ``'rmse'``, ``'mae'`` and ``'r2'``.
    """
    mse = metrics.mean_squared_error(actual, predicted)
    return {
        'rmse': np.sqrt(mse),
        'mae': metrics.mean_absolute_error(actual, predicted),
        'r2': metrics.r2_score(actual, predicted),
    }

In [54]:
def retrieve_best(grid_object):
    """Extract the winning estimator, its parameters and its CV score.

    Parameters
    ----------
    grid_object : fitted search object
        Must expose ``best_estimator_``, ``best_params_`` and ``best_score_``.

    Returns
    -------
    tuple
        ``(best_model, best_params, best_cv_score)``. ``best_params`` is a copy
        of ``grid_object.best_params_``; a ``'best_ntree_limit'`` entry is
        added when the estimator exposes that attribute.
    """
    best_model = grid_object.best_estimator_
    # Copy so the extra entry below does not leak into the search object,
    # where a non-hyper-parameter key would break a later set_params(**best_params_).
    best_params = dict(grid_object.best_params_)
    best_cv_score = grid_object.best_score_

    # Not every estimator has this attribute; only record it when it exists
    # instead of raising AttributeError.
    best_ntree_limit = getattr(best_model, 'best_ntree_limit', None)
    if best_ntree_limit is not None:
        best_params['best_ntree_limit'] = best_ntree_limit

    return best_model, best_params, best_cv_score

In [55]:
def predict_model(model, datasets):
    """Predict on every dataset and return the predictions as named Series.

    Parameters
    ----------
    model : object
        Anything with a ``predict(X)`` method.
    datasets : dict
        Maps a split name to its feature DataFrame.

    Returns
    -------
    dict
        Maps each split name to a Series named ``'PredShare'`` that carries
        the index of the corresponding dataset.
    """
    results_dict = {}
    for split_name, dataset in datasets.items():
        # Flatten so estimators that return an (n, 1) column still fit in a Series.
        predictions = np.ravel(model.predict(dataset))
        results_dict[split_name] = pd.Series(predictions, index = dataset.index, name = 'PredShare')
    return results_dict

In [77]:
def log_params_mlflow_xgb(params, sampling_ratio):
    """Log the XGBoost hyper-parameters and the sampling ratio to MLflow.

    ``params`` must contain every key listed in ``logged_keys``; a missing key
    raises ``KeyError``.
    """
    logged_keys = ('max_depth', 'colsample_bytree', 'learning_rate', 'n_estimators', 'best_ntree_limit')
    for key in logged_keys:
        mlflow.log_param(key, params[key])
    mlflow.log_param('sampling_ratio', sampling_ratio)

In [78]:
def log_params_mlflow_rf(params, sampling_ratio):
    """Log the random-forest hyper-parameters and the sampling ratio to MLflow.

    Each hyper-parameter is read from ``params`` under its plain name
    (e.g. ``'n_estimators'``) or, failing that, under the ``'model__'`` prefixed
    name (e.g. ``'model__n_estimators'``). ``KeyError`` is raised when neither
    is present.
    """
    def _lookup(name):
        # A search over a bare estimator reports plain names; the original code
        # expected 'model__n_estimators', so the prefixed form stays supported.
        if name in params:
            return params[name]
        return params['model__' + name]

    for name in ('max_depth', 'max_features', 'min_samples_split', 'n_estimators'):
        mlflow.log_param(name, _lookup(name))
    mlflow.log_param('sampling_ratio', sampling_ratio)

In [79]:
def get_metrics(targets_real, targets_predicted):
    """Evaluate the predictions of the 'train', 'test' and 'val' splits.

    Both arguments are mappings keyed by split name.

    Returns
    -------
    tuple
        ``(train_metrics, test_metrics, val_metrics)``, each being the result
        of ``eval_metrics`` for that split.
    """
    per_split = [
        eval_metrics(targets_real[split], targets_predicted[split])
        for split in ('train', 'test', 'val')
    ]
    return tuple(per_split)

In [80]:
def log_metrics_mlflow(targets_real, targets_predicted, cv_score):
    """Log RMSE and R2 of every split, plus the absolute CV score, to MLflow.

    The metrics are logged as ``rmse_<split>`` and ``r2_<split>`` for the
    train, test and val splits, followed by ``rmse_cv``.
    """
    train_scores, test_scores, val_scores = get_metrics(targets_real, targets_predicted)

    scores_by_split = (('train', train_scores), ('test', test_scores), ('val', val_scores))
    for split, scores in scores_by_split:
        for metric_name in ('rmse', 'r2'):
            mlflow.log_metric(metric_name + '_' + split, scores[metric_name])

    # The CV score comes from a negated scorer, so log its magnitude.
    mlflow.log_metric('rmse_cv', abs(cv_score))

In [81]:
def log_model_mlflow(model):
    """Log ``model`` through ``mlflow.sklearn.log_model`` under the name 'model'."""
    artifact_name = 'model'
    mlflow.sklearn.log_model(model, artifact_name)

In [82]:
def log_df_mlflow(df, path):
    """Pickle ``df`` into ``path`` and log that file as an MLflow artifact.

    The file is always named 'entry_dataframe.pkl'; an existing file with that
    name in ``path`` is overwritten.
    """
    pickle_file = os.path.join(path, 'entry_dataframe.pkl')
    df.to_pickle(pickle_file)
    mlflow.log_artifact(pickle_file)

In [83]:
def display_metrics(targets_real, targets_predicted, cv_score):
    """Build a summary table of RMSE, MAE and R2 per split.

    Returns
    -------
    pandas.DataFrame
        Rows 'RMSE', 'MAE', 'R2'; columns 'Train', 'CV', 'Test', 'Validation'.
        The 'CV' column only holds the sign-flipped ``cv_score`` in the RMSE
        row; its other rows are NaN.
    """
    metric_keys = ('rmse', 'mae', 'r2')
    train_scores, test_scores, val_scores = get_metrics(targets_real, targets_predicted)

    def as_column(scores):
        # Order the values to match the RMSE / MAE / R2 row labels.
        return [scores[key] for key in metric_keys]

    table = {
        'Train': as_column(train_scores),
        'CV': [cv_score * -1, np.nan, np.nan],
        'Test': as_column(test_scores),
        'Validation': as_column(val_scores),
    }

    return pd.DataFrame(table, index = ['RMSE', 'MAE', 'R2'])

In [84]:
def get_advanced_metrics(y_real, y_predict):
    """Score contenders and non-contenders separately.

    Parameters
    ----------
    y_real : pandas.DataFrame or Series
        True values; must provide a 'Share' column after concatenation.
    y_predict : pandas.Series or DataFrame
        Predictions; must provide a 'PredShare' column after concatenation.
        Both inputs are aligned on their index.

    Returns
    -------
    tuple
        ``(rmse_contenders, mae_no_contenders)``: RMSE over the rows with
        ``Share > 0`` and MAE over the rows with ``Share == 0``.
    """
    results = pd.concat([y_real, y_predict], axis = 1)

    results_contenders = results[results['Share'] > 0]
    results_no_contenders = results[results['Share'] == 0]

    # RMSE is the square root of the mean squared error.
    rmse_contenders = metrics.mean_squared_error(results_contenders['Share'], results_contenders['PredShare']) ** .5
    # MAE is already in the target's units, so no square root is applied.
    mae_no_contenders = metrics.mean_absolute_error(results_no_contenders['Share'], results_no_contenders['PredShare'])

    return rmse_contenders, mae_no_contenders

In [85]:
def log_advanced_metrics_mlflow(y_real, y_predict):
    """Log the contender / non-contender scores to MLflow.

    The two values returned by ``get_advanced_metrics`` are logged as
    'rmse_cont' and 'mae_no_cont', in that order.
    """
    contender_rmse, no_contender_mae = get_advanced_metrics(y_real, y_predict)
    named_scores = (('rmse_cont', contender_rmse), ('mae_no_cont', no_contender_mae))
    for metric_key, metric_value in named_scores:
        mlflow.log_metric(metric_key, metric_value)

In [98]:
# Hyper-parameter grid explored by GridSearchCV for the XGBoost regressor.
# 'random_state' and 'n_estimators' hold a single value, so they are fixed
# settings rather than searched ones.
param_grid = {
    'max_depth': [8, 10, 13], 
    'colsample_bytree': [.8, .5], 
    'random_state': [23], 
    'learning_rate': [.1, .3], 
    'n_estimators': [400]
    }

In [99]:
# Extra keyword arguments forwarded to the estimator's fit() by GridSearchCV.
# NOTE(review): early stopping is evaluated on X_test / y_test, the same split
# whose metrics are reported as 'test' later, so those test metrics are not
# independent of model fitting. Consider a separate early-stopping split.
# NOTE(review): newer xgboost releases may expect early_stopping_rounds and
# eval_metric in the constructor instead of fit() - verify against the
# installed version.
fit_params = {
    "early_stopping_rounds":10, 
    "eval_metric" : "rmse", 
    "eval_set" : [[X_test, y_test]]
}

In [100]:
# One MLflow run: grid-search an XGBoost regressor, then log its parameters,
# metrics, the fitted model and the input dataframe.
with mlflow.start_run():
    model = xgb.XGBRegressor()

    # 10-fold CV scored by negated RMSE (higher is better for GridSearchCV).
    grid_model = GridSearchCV(model, param_grid, scoring = 'neg_root_mean_squared_error', n_jobs = -1, cv = 10)

    grid_model.fit(X_train, y_train, **fit_params)

    best_model, best_params, best_cv_score = retrieve_best(grid_model)

    # Predictions for the train, test and val splits.
    predicted_targets = predict_model(best_model, datasets)

    log_params_mlflow_xgb(best_params, sampling_ratio)

    log_metrics_mlflow(real_targets, predicted_targets, best_cv_score)

    # Contender / non-contender breakdown, computed on the validation seasons only.
    log_advanced_metrics_mlflow(real_targets['val'], predicted_targets['val'])

    log_model_mlflow(best_model)

    # `df` is the full preprocessed frame (before the dev/val split and oversampling).
    log_df_mlflow(df, dataframes_path)

[0]	validation_0-rmse:0.43663
[1]	validation_0-rmse:0.39349
[2]	validation_0-rmse:0.35475
[3]	validation_0-rmse:0.31983
[4]	validation_0-rmse:0.28851
[5]	validation_0-rmse:0.26043
[6]	validation_0-rmse:0.23509
[7]	validation_0-rmse:0.21229
[8]	validation_0-rmse:0.19199
[9]	validation_0-rmse:0.17373
[10]	validation_0-rmse:0.15729
[11]	validation_0-rmse:0.14250
[12]	validation_0-rmse:0.12940
[13]	validation_0-rmse:0.11762
[14]	validation_0-rmse:0.10714
[15]	validation_0-rmse:0.09780
[16]	validation_0-rmse:0.08949
[17]	validation_0-rmse:0.08207
[18]	validation_0-rmse:0.07552
[19]	validation_0-rmse:0.06976
[20]	validation_0-rmse:0.06476
[21]	validation_0-rmse:0.06026
[22]	validation_0-rmse:0.05637
[23]	validation_0-rmse:0.05295
[24]	validation_0-rmse:0.05002
[25]	validation_0-rmse:0.04747
[26]	validation_0-rmse:0.04532
[27]	validation_0-rmse:0.04342
[28]	validation_0-rmse:0.04175
[29]	validation_0-rmse:0.04044
[30]	validation_0-rmse:0.03928
[31]	validation_0-rmse:0.03833
[32]	validation_0-



In [101]:
# Number of trees retained by early stopping for the refitted best estimator.
# NOTE(review): this attribute may not exist in newer xgboost releases - verify.
grid_model.best_estimator_.best_ntree_limit

227

In [102]:
# Summary table of RMSE / MAE / R2 for the train, CV, test and validation sets.
display_metrics(real_targets, predicted_targets, best_cv_score)

Unnamed: 0,Train,CV,Test,Validation
RMSE,0.000673,0.031088,0.03152,0.031118
MAE,0.000291,,0.008413,0.005154
R2,0.999969,,0.926598,0.697601


In [103]:
def get_val_results(real_val_y, pred_val_y, players_series, season_cutoff = 2015):
    """Join player names with true and predicted shares for the validation seasons.

    Parameters
    ----------
    real_val_y : pandas.DataFrame
        True values with a 'Share' column.
    pred_val_y : pandas.Series
        Predicted values, aligned with ``real_val_y`` on the index.
    players_series : pandas.DataFrame or Series
        Player names indexed by a MultiIndex whose second level is the season.
    season_cutoff : int, default 2015
        Only players from seasons strictly after this value are joined. It
        should match the cutoff used to build the validation set.

    Returns
    -------
    tuple
        ``(contenders, no_contenders)``: the joined rows with ``Share > 0`` and
        with ``Share == 0`` respectively.
    """
    # Level 1 of the index is the season (the index is built from Rk and Season).
    seasons = players_series.index.get_level_values(1)
    players_series_val = players_series[seasons > season_cutoff]
    results_val = pd.concat([players_series_val, real_val_y, pred_val_y], axis = 1)

    results_val_contenders = results_val[results_val['Share'] > 0]
    results_val_no_contenders = results_val[results_val['Share'] == 0]

    return results_val_contenders, results_val_no_contenders

In [104]:
def display_val_results(results_val_contenders, results_val_no_contenders):
    """Display the validation results season by season.

    For each season (in ascending order) the contenders are shown sorted by
    their true 'Share', highest first. The non-contenders with a positive
    'PredShare' are then shown in a single table.

    Both frames must be indexed by a MultiIndex whose second level is the
    season. Relies on the notebook-provided ``display`` function.
    """
    print('Contenders Results:')
    # Sort the seasons: iterating a bare set gives no guaranteed order.
    seasons = sorted(set(results_val_contenders.index.get_level_values(1)))
    for season in seasons:
        season_rows = results_val_contenders.loc[pd.IndexSlice[:, season], :]
        display(season_rows.sort_values(by = 'Share', ascending = False))

    print('No contenders results:')
    display(results_val_no_contenders[results_val_no_contenders['PredShare'] > 0])

In [105]:
# Join the player names stored by the fitted DropPlayers step with the true and
# predicted validation shares, split into contenders and non-contenders.
contenders_df, no_contenders_df = get_val_results(real_targets['val'], predicted_targets['val'], pipe['DropPlayers'].players_list_)

In [106]:
# Show the per-season contender tables and the non-contenders with a positive prediction.
display_val_results(contenders_df, no_contenders_df)

Contenders Results:


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
105,2016,Stephen Curry,1.0,0.924279
263,2016,Kawhi Leonard,0.484,0.323645
222,2016,LeBron James,0.482,0.527682
452,2016,Russell Westbrook,0.371,0.325511
126,2016,Kevin Durant,0.112,0.538428
348,2016,Chris Paul,0.082,0.11418
167,2016,Draymond Green,0.038,0.056498
266,2016,Damian Lillard,0.02,0.023731
178,2016,James Harden,0.007,0.255149
273,2016,Kyle Lowry,0.005,0.078732


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
458,2017,Russell Westbrook,0.879,0.489463
173,2017,James Harden,0.746,0.508633
261,2017,Kawhi Leonard,0.495,0.43112
221,2017,LeBron James,0.33,0.424559
425,2017,Isaiah Thomas,0.08,0.300424
98,2017,Stephen Curry,0.051,0.288029
16,2017,Giannis Antetokounmpo,0.007,0.134976
453,2017,John Wall,0.007,0.099765
100,2017,Anthony Davis,0.002,0.173429
119,2017,Kevin Durant,0.002,0.336806


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
195,2018,James Harden,0.955,0.738965
249,2018,LeBron James,0.731,0.50314
122,2018,Anthony Davis,0.441,0.199057
298,2018,Damian Lillard,0.205,0.234328
508,2018,Russell Westbrook,0.075,0.362742
16,2018,Giannis Antetokounmpo,0.074,0.140022
143,2018,Kevin Durant,0.065,0.398838
130,2018,DeMar DeRozan,0.032,0.022899
7,2018,LaMarcus Aldridge,0.006,0.15322
79,2018,Jimmy Butler,0.005,0.083894


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18,2019,Giannis Antetokounmpo,0.932,0.595602
207,2019,James Harden,0.768,0.539441
184,2019,Paul George,0.352,0.127189
270,2019,Nikola Jokić,0.21,0.131257
124,2019,Stephen Curry,0.173,0.239218
305,2019,Damian Lillard,0.068,0.153816
155,2019,Joel Embiid,0.049,0.201476
150,2019,Kevin Durant,0.025,0.307134
301,2019,Kawhi Leonard,0.013,0.284012
506,2019,Russell Westbrook,0.008,0.139276


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13,2020,Giannis Antetokounmpo,0.952,0.60343
251,2020,LeBron James,0.746,0.551617
199,2020,James Harden,0.363,0.374026
134,2020,Luka Dončić,0.198,0.359295
294,2020,Kawhi Leonard,0.166,0.346502
123,2020,Anthony Davis,0.081,0.299781
396,2020,Chris Paul,0.026,-0.00383
297,2020,Damian Lillard,0.023,0.274318
262,2020,Nikola Jokić,0.018,0.149539
449,2020,Pascal Siakam,0.017,0.006165


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
263,2021,Nikola Jokić,0.961,0.541411
146,2021,Joel Embiid,0.58,0.162087
116,2021,Stephen Curry,0.449,0.263611
12,2021,Giannis Antetokounmpo,0.345,0.421549
396,2021,Chris Paul,0.138,0.002124
130,2021,Luka Dončić,0.042,0.340534
297,2021,Damian Lillard,0.038,0.29174
421,2021,Julius Randle,0.02,0.045137
439,2021,Derrick Rose,0.01,6e-06
180,2021,Rudy Gobert,0.008,0.086093


No contenders results:


Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2016,Quincy Acy,0.0,0.000014
4,2016,Arron Afflalo,0.0,0.000006
5,2016,Alexis Ajinça,0.0,0.000007
7,2016,LaMarcus Aldridge,0.0,0.033854
8,2016,Cliff Alexander,0.0,0.000007
...,...,...,...,...
533,2021,James Wiseman,0.0,0.000045
535,2021,Robert Woodard II,0.0,0.000009
536,2021,Delon Wright,0.0,0.000107
537,2021,Thaddeus Young,0.0,0.000212


In [107]:
# Non-contenders (true Share of 0) for whom the model predicted a share above 0.1.
no_contenders_df[no_contenders_df['PredShare'] > 0.1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Share,PredShare
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
97,2016,DeMarcus Cousins,0.0,0.112206
453,2016,Hassan Whiteside,0.0,0.126376
66,2017,Jimmy Butler,0.0,0.120937
156,2017,Rudy Gobert,0.0,0.113382
351,2017,Chris Paul,0.0,0.133337
434,2017,Karl-Anthony Towns,0.0,0.125217
112,2018,DeMarcus Cousins,0.0,0.117166
240,2018,Kyrie Irving,0.0,0.150262
126,2019,Anthony Davis,0.0,0.144202
250,2019,Kyrie Irving,0.0,0.143099


In [108]:
# Feature importances of the best model, labelled with the training columns and
# sorted from most to least important.
top_features = pd.DataFrame(best_model.feature_importances_, index = X_train.columns, columns = ['Importance']).sort_values('Importance', ascending = False)

In [109]:
# Keep only the features whose importance exceeds 0.005.
top_features[top_features['Importance'] > 0.005]

Unnamed: 0,Importance
WS,0.261669
PER,0.168703
VORP,0.080229
WS/48,0.06144
%W,0.03664
PTS_pg,0.03262
FGA_tot,0.018009
BPM,0.014719
TOV_pg,0.01443
PTS_tot_rank,0.01442
