In [1]:
import pandas as pd
import numpy as np
import time
# Silence pandas' SettingWithCopyWarning globally for this session.
pd.options.mode.chained_assignment = None

In [14]:
# Load the raw player box-score table (relative path, resolved against the working directory).
df = pd.read_csv(filepath_or_buffer='../data/2012-18_playerBoxScore.csv')

# Data Wrangling / Feature Engineering  
> <font color='blue'>Please refer to **'../doc/metadata_playerBoxScore.pdf'** for a description of each feature.  
> The wrangling part takes about 8 minutes to run, and the resulting dataframe is about 140MB.</font>

In [3]:
df.shape  # (rows, columns) of the table as loaded

(155713, 51)

In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [5]:
df.shape  # (rows, columns) after dropping rows with missing values

(155672, 51)

In [6]:
# Keep only the columns that are used for modeling.
cols_to_use = [
    'gmDate', 'teamAbbr', 'teamLoc', 'teamRslt',
    'playDispNm', 'playStat', 'playMin', 'playPos',
    'playPTS', 'playAST', 'playTO', 'playSTL', 'playBLK', 'playTRB',
]
df = df.loc[:, cols_to_use].copy()

In [7]:
# Encode the two-valued categorical columns as 0/1 so that `rolling` can aggregate them.
rep_dict = {
    'teamLoc': {'Home': 1, 'Away': 0},
    'teamRslt': {'Win': 1, 'Loss': 0},
    'playStat': {'Starter': 1, 'Bench': 0},
}

# A label that is missing from its mapping raises KeyError instead of being passed through.
for column, codes in rep_dict.items():
    df[column] = df[column].apply(lambda label: codes[label])

In [8]:
# Keep an independent snapshot of the encoded table; `df` is rebuilt from it in the next cell.
df_org = df.copy(deep=True)

In [9]:
start = time.perf_counter()  # timer; monotonic clock, unaffected by wall-clock adjustments

cols_keep = ['playDispNm', 'gmDate', 'teamAbbr', 'playMin', 'teamLoc', 'playStat', 'playPos']
cols_roll = ['teamLoc', 'teamRslt', 'playMin', 'playPTS', 'playAST', 'playTO', 'playSTL', 'playBLK', 'playTRB']
windows = [1, 5, 10, 15, 20]
agg_funcs = ['mean', 'std', 'median']
min_games = 30  # players with fewer rows than this are ignored

# Per-player result tables are collected here and concatenated once at the end.
# Growing `df` with pd.concat inside the loop would re-copy every earlier row
# on each iteration.
player_frames = []

# One pass over df_org, one group per player name. sort=False keeps the players
# in order of first appearance and each player's rows in their df_org order.
# NOTE(review): the rolling windows assume each player's rows are already in
# chronological order in df_org -- confirm, or sort by game date beforehand.
for name, thisguy in df_org.groupby('playDispNm', sort=False):
    if len(thisguy) < min_games:
        continue

    # Build every window feature as a separate Series and assemble the frame
    # once, rather than inserting columns one by one into a filtered slice.
    rolled = {}
    for col in cols_roll:
        for t in windows:
            roller = thisguy[col].rolling(t, min_periods=1)
            for fun in agg_funcs:
                if t == 1 and fun in ('std', 'median'):
                    continue  # 1-game std and median are useless
                rolled['{}_last{}_{}'.format(col, t, fun)] = getattr(roller, fun)()

    # Shift the created features by 1 row so that they describe the "last n
    # games" before the current one. The shift is positional on the row index;
    # merging back on gmDate would multiply rows whenever a player name has
    # two rows on the same date.
    features = pd.DataFrame(rolled, index=thisguy.index).shift(1)
    player_frames.append(pd.concat([thisguy[cols_keep], features], axis=1))

if player_frames:
    df = pd.concat(player_frames, axis=0, ignore_index=True)
else:
    df = pd.DataFrame()  # no player reached min_games; pd.concat rejects an empty list

end = time.perf_counter()
print("time used:{}".format(end - start))

time used:230.57562494277954


In [10]:
df.shape  # (rows, columns) of the rolling-feature table

(152929, 124)

In [11]:
# Discard rows with any missing feature value (the shift leaves each player's first row empty).
df = df.dropna(axis=0, how='any').copy()

In [12]:
df.shape  # (rows, columns) after dropping rows with missing feature values

(151493, 124)

In [13]:
# wrangling part ends, save the result dataframe
# df.to_csv('../data/player_data_ready.csv', index=False)

#### <font color='red'>Some notes on the wrangling part:</font>
> * The data contains only "Regular" season games; there are no pre-season or post-season games.  
> * No value is passed to the `win_type` argument of `rolling`, so every observation in a window is weighted equally.  
> * Players with fewer than 30 games on record are filtered out.