In [61]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Silence pandas' SettingWithCopyWarning for the rest of the session.
pd.set_option('mode.chained_assignment', None)

# Load the raw player box scores and discard every row that has a missing
# value in any column (reading the csv in this way for now).
df = pd.read_csv('../data/2012-18_playerBoxScore.csv').dropna()

# Player rating: points plus a weighted sum of the other box-score stats.
# The terms are accumulated in the listed order.
stat_weights = [
    ('playBLK', 2),      # blocks
    ('playTO', -0.5),    # turnovers count against the player
    ('playSTL', 2),      # steals
    ('playAST', 1.5),    # assists
    ('playTRB', 1.25),   # total rebounds
]
rating = df['playPTS']
for stat, weight in stat_weights:
    rating = rating + (df[stat] * weight)
df['playRat'] = rating

          
                
# Keep only the columns that the modeling step needs.
cols_to_use = [
    'gmDate', 'teamAbbr', 'teamLoc', 'teamRslt', 'playDispNm',
    'playMin', 'playRat', 'playPos', 'playStat',
]
df = df.loc[:, cols_to_use].copy()

# Parse the game date so rows can be ordered chronologically.
df['gmDate'] = pd.to_datetime(df['gmDate'])

# Encode the two-valued categorical columns as 0/1 so that numeric window
# functions such as `rolling` can be applied to them.
rep_dict = {
    'teamLoc': {'Home': 1, 'Away': 0},
    'playStat': {'Starter': 1, 'Bench': 0},
}
for column, codes in rep_dict.items():
    # Dict indexing (not .get) so an unexpected category raises KeyError.
    df[column] = df[column].apply(codes.__getitem__)

# Snapshot of the prepared data; later steps read from this copy.
df_org = df.copy()
# ---------------------------------------------------------------------------
# Per-player lagged features.
#
# For every player with enough games, compute rolling and exponentially
# weighted statistics over the player's chronologically ordered games, then
# shift them down by one game so that each row only carries information from
# the games played *before* it.
# ---------------------------------------------------------------------------

cols_keep = ['playDispNm', 'gmDate', 'teamAbbr', 'playMin', 'teamLoc', 'playStat', 'playPos']
cols_roll = ['playMin', 'playRat']   # columns that get rolling / ewm features
windows = [5, 20]                    # rolling window sizes, in games
ewm_alpha = [0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7]
agg_funcs = ['median']               # names of Rolling methods to apply
min_games = 30                       # players with fewer records are ignored


def _alpha_suffix(alpha):
    """Return the digits after the decimal point of `alpha` (0.25 -> '25')."""
    return str(alpha - int(alpha))[2:]


def _lagged_features(games):
    """Build the shifted rolling and ewm feature columns for one player.

    Parameters
    ----------
    games : pandas.DataFrame
        One player's rows, already sorted by game date, containing every
        column listed in `cols_roll`.

    Returns
    -------
    pandas.DataFrame
        Feature columns indexed like `games`, shifted down by one row so
        each row describes the preceding games only (first row is all NaN).
    """
    columns = []

    # Rolling-window features, e.g. 'playMin_last5_median'.
    for col in cols_roll:
        for t in windows:
            roller = games[col].rolling(t, min_periods=1)
            for fun in agg_funcs:
                if t == 1 and fun in ('std', 'median'):
                    continue  # 1-game std and median are useless
                if t > 20 and fun != 'std':
                    continue  # windows above 20 games only get std
                name = col + '_last' + str(t) + '_' + fun
                columns.append(getattr(roller, fun)().rename(name))

    # Exponentially weighted features, e.g. 'playMin_ewm_025'.
    for col in cols_roll:
        for alpha in ewm_alpha:
            weighted = games[col].ewm(alpha=alpha, min_periods=1)
            suffix = _alpha_suffix(alpha)
            columns.append(weighted.mean().rename(col + '_ewm_0' + suffix))
            if col == 'playMin':
                # The ewm standard deviation is only produced for minutes.
                columns.append(weighted.std().rename(col + '_ewm_std_0' + suffix))

    if not columns:
        return pd.DataFrame(index=games.index)

    # Shift the values directly. Joining shifted values back on 'gmDate'
    # would multiply rows if a player ever had two rows on the same date.
    return pd.concat(columns, axis=1).shift(1)


# One pass over the data: sort=False visits players in order of first
# appearance, instead of re-filtering the whole frame once per player.
player_frames = []
for _, games in tqdm(df_org.groupby('playDispNm', sort=False)):
    if len(games) < min_games:
        continue
    games = games.sort_values('gmDate', ascending=True)
    player_frames.append(
        pd.concat([games[cols_keep], _lagged_features(games)], axis=1)
    )

# Concatenate once at the end; growing a DataFrame inside the loop copies
# all previously collected rows on every iteration.
if player_frames:
    df = pd.concat(player_frames, axis=0, ignore_index=True)
else:
    df = pd.DataFrame()  # no player reached `min_games`

# Drop rows whose features are incomplete (at least each player's first
# game, which has no earlier games to summarise).
df = df.dropna()

# wrangling part ends, save the result dataframe
df.to_csv('../data/player_data_ready.csv', index=False)

100%|██████████| 936/936 [01:28<00:00, 10.52it/s]
