In [1]:
import pandas as pd
import numpy as np
import os

from sklearn import model_selection, metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

import mlflow

In [2]:
# Input pickles are looked up in a 'dataframes' folder under the current
# working directory, so the result depends on where the kernel was started.
dataframes_path = os.path.join(os.getcwd(), 'dataframes')

In [3]:
# Load 'complete_stats.pkl' from the dataframes folder.
# NOTE(review): unpickling can execute arbitrary code — only load files that
# were produced by a trusted step of this project.
df = pd.read_pickle(os.path.join(dataframes_path, 'complete_stats.pkl'))

In [4]:
def check_rk_season(df, rk_season_pairs):
    """Flag the rows of ``df`` whose (Rk, Season) pair is in ``rk_season_pairs``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``'Rk'`` and ``'Season'``.
    rk_season_pairs : iterable of tuple
        The ``(Rk, Season)`` pairs to look for.

    Returns
    -------
    pandas.Series
        Boolean mask aligned on ``df.index``. A Series is returned even when
        ``df`` has no rows (a row-wise ``DataFrame.apply`` would hand back an
        empty DataFrame in that case, which is not usable as a row mask).
    """
    pairs = list(rk_season_pairs)
    try:
        # Hash lookup: constant time per row instead of scanning the list.
        wanted = set(pairs)
    except TypeError:
        # Unhashable entries cannot go in a set; keep the linear scan.
        wanted = pairs
    # Walk the two key columns directly rather than building a Series per row.
    flags = [pair in wanted for pair in zip(df['Rk'], df['Season'])]
    return pd.Series(flags, index=df.index, dtype=bool)

def drop_players_multiteams(df):
    """Remove the per-team rows of every (Rk, Season) that also has a 'TOT' row.

    Rows whose ``'Tm'`` is ``'TOT'`` are kept, as are all rows of
    (Rk, Season) pairs that have no ``'TOT'`` row at all.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``'Rk'``, ``'Season'`` and ``'Tm'``.

    Returns
    -------
    pandas.DataFrame
        New frame with the redundant rows removed and a fresh 0..n-1 index.
    """
    if df.empty:
        # Nothing to filter; still hand back a new frame with a reset index.
        return df.reset_index(drop = True)

    tot_rows = df[df['Tm'] == 'TOT']
    tot_pairs = list(zip(tot_rows['Rk'], tot_rows['Season']))
    in_tot_group = check_rk_season(df, tot_pairs)
    # Filter with a boolean mask instead of dropping index labels: dropping by
    # label removes every row carrying that label, so a non-unique index would
    # lose unrelated rows (possibly the 'TOT' rows themselves).
    redundant = in_tot_group & (df['Tm'] != 'TOT')
    return df[~redundant].reset_index(drop = True)

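- For players with more than one team in a season, drop the per-team rows and keep only the `TOT` row
- Index with the tuple (Rk, Season)
- Drop columns: Rk, GT, Votes, MaxVotes (plus FG_tot, 3PA_tot, 2PA_tot, FGA_tot_rank)
- Label-encode Player, Pos and Tm (team)
- Separate types of columns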

In [5]:
class DropPlayersMultiTeams(BaseEstimator, TransformerMixin):
    """Pipeline step that removes per-team rows when a 'TOT' row exists.

    The filtering itself lives in ``drop_players_multiteams``; this class only
    adapts it to the fit/transform interface so there is a single
    implementation to maintain.
    """

    def __init__(self):
        """No hyper-parameters to configure."""

    def fit(self, X, y = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Return ``X`` without the redundant per-team rows, index reset.

        ``X`` must contain the columns ``'Rk'``, ``'Season'`` and ``'Tm'``.
        ``y`` is accepted for interface compatibility and ignored.
        """
        return drop_players_multiteams(X)

In [6]:
class DropColumns(BaseEstimator, TransformerMixin):
    """Pipeline step that removes a configured set of columns from a frame."""

    def __init__(self, cols_to_drop):
        # Stored untouched, under the same name as the constructor argument.
        self.cols_to_drop = cols_to_drop

    def fit(self, X, y = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Return a copy of ``X`` without the columns in ``cols_to_drop``."""
        unwanted = self.cols_to_drop
        return X.drop(unwanted, axis = 1)

In [7]:
class SetIndex(BaseEstimator, TransformerMixin):
    """Pipeline step that indexes a frame by its 'Rk' and 'Season' columns."""

    def __init__(self):
        """No hyper-parameters to configure."""

    def fit(self, X, y = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Return ``X`` indexed by (Rk, Season), keeping both as columns too."""
        index_columns = ['Rk', 'Season']
        # drop=False leaves 'Rk' and 'Season' available as regular columns.
        return X.set_index(keys = index_columns, drop = False)

In [8]:
# Columns removed by the DropColumns pipeline step.
cols_to_drop = [
    'Rk',
    'GT',
    'Votes',
    'MaxVotes',
    'FG_tot',
    '3PA_tot',
    '2PA_tot',
    'FGA_tot_rank',
]

In [9]:
# Step order matters: SetIndex reads the 'Rk' column, and DropColumns removes
# it afterwards, so the index has to be built before the columns are dropped.
pipe = Pipeline([
    ('DropPlayersMultiTeams', DropPlayersMultiTeams()),
    ('SetIndex', SetIndex()),
    ('DropColumns', DropColumns(cols_to_drop)),
])

In [10]:
# Rebinds `df` to the pipeline output. This cell is therefore not re-runnable
# on its own: a second run would feed the already-transformed frame (with the
# dropped columns gone) back into the pipeline. Reload `df` first.
df = pipe.fit_transform(df)

In [11]:
# Three independent encoders, one each for the player, position and team columns.
encoder_player, encoder_position, encoder_team = (LabelEncoder() for _ in range(3))

In [12]:
# Replace each categorical column with integer codes, fitting a dedicated
# encoder per column so every fitted mapping stays available afterwards.
df['Player'] = encoder_player.fit_transform(df['Player'])
# 'Pos' must use encoder_position: fitting encoder_player a second time would
# overwrite the Player mapping and leave encoder_position unfitted.
df['Pos'] = encoder_position.fit_transform(df['Pos'])
df['Tm'] = encoder_team.fit_transform(df['Tm'])

In [13]:
# Preview the transformed frame: indexed by (Rk, Season), with Player, Pos and
# Tm replaced by integer codes.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Pos,Age,Tm,G,GS,MP_pg,FG_pg,FGA_pg,FG%,...,DWS_rank,WS_rank,WS/48_rank,OBPM_rank,DBPM_rank,BPM_rank,VORP_rank,%W_rank,%GS,Share
Rk,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1980,1729,0,32,16,82,-10,38.3,10.2,16.9,0.604,...,3,1,1,5,7,2,1,2,-1.0,0.665
2,1980,3007,2,25,11,67,-10,18.2,2.3,4.7,0.481,...,44,71,83,50,32,54,41,44,-1.0,0.0
3,1980,89,0,25,29,75,-10,28.9,6.2,11.7,0.531,...,13,27,21,23,11,12,12,6,-1.0,0.0
4,1980,2994,5,31,1,80,80,35.8,4.8,9.9,0.482,...,23,14,25,26,33,31,20,1,1.0,0.009
5,1980,793,0,31,5,26,-10,21.5,1.0,2.3,0.45,...,47,85,109,63,21,56,42,38,-1.0,0.0


In [19]:
# Fraction of rows with Season > 2014.
# NOTE(review): presumably used to size a time-based hold-out split — confirm.
len(df[df['Season'] > 2014]) / len(df)

0.20318950404343156