In [1]:
import pickle
import pandas as pd

In [2]:
from datetime import datetime

In [11]:
with open('../Data/allplayerFantasyGameLogs.pickle', 'rb') as handle:
  allplayerFantasyGameLogs = pickle.load(handle)

In [19]:
allplayerFantasyGameLogs.set_index('GAME_DATE', inplace=True)

##Data Integration

We will split the allplayerGameLogs into training and test sets. And we need to use two helpful functions to extract player-level and team-level features to train the prediction model for fantasy points from a player. We also need another helpful function to aggregate corresponding information from the test set for the testing purpose.

In [60]:
def aggr(group):
        group['NumGames'] = group['fullName'].count()
        group['AvgFanPTs'] = group['FanPTs'].mean()
        group['AvgPTS'] = group['PTS'].mean()
        group['AvgMIN'] = group['MIN'].mean() 
        group['AvgFGM'] = group['FGM'].mean()
        group['AvgFGA'] = group['FGA'].mean()
        group['AvgFG3M'] = group['FG3M'].mean()
        group['AvgFG3A'] = group['FG3A'].mean()
        group['AvgREB'] = group['REB'].mean()
        group['AvgAST'] = group['AST'].mean()
        group['AvgSTL'] = group['STL'].mean()
        group['AvgTOV'] = group['TOV'].mean()
        group['AvgPLUS_MINUS'] = group['PLUS_MINUS'].mean()
        group['NumDouBL'] = group['DouBL'].sum()
        group['NumTriBL'] = group['TriBL'].sum()

        group['Last3GameAvgFanPTs'] = group['FanPTs'][:3].mean()
        group['Last6GameAvgFanPTs'] = group['FanPTs'][:6].mean()
        
        group['Last3GameAvgMIN'] = group['MIN'][:3].mean()
        group['Last3GameAvgPTS'] = group['PTS'][:3].mean()
        return group
    
def aggr_stats(date,allplayerFantasyGameLogs):
    interest_columns = ['fullName','Player_ID','Team','position1','MIN','PTS','FGM','FGA', 'FG3M','FG3A', \
                        'REB','AST','STL','TOV','PLUS_MINUS','DouBL','TriBL','FanPTs']
    tmp = allplayerFantasyGameLogs.ix['2015-10-27':date]
    
    playerID_tmp = tmp.reset_index().copy()
    tmp.grouped = playerID_tmp[interest_columns].groupby('Player_ID')
    
    Newdf = tmp.grouped.apply(aggr)
    Newdf = Newdf.drop(['MIN','PTS','FGM','FGA', 'FG3M','FG3A','REB','AST','STL','TOV','PLUS_MINUS','DouBL','TriBL','FanPTs'],axis=1)
    Newdf.drop_duplicates(inplace=True)
    
    bins = [0, 10, 20, 30, 40, 100]
    group_names = ['benchPlayer','belowAvg','average','advanced','top']
    Newdf['Rank']= pd.cut(Newdf['AvgFanPTs'],bins,labels=group_names)
    
    return(Newdf)

In [54]:
def aggr_teamVSteam(group):
        group['TeamStdVSFanPTs'] = group['FanPTs'].std()
        group['TeamAvgVSFanPTs'] = group['FanPTs'].mean()
        group['TeamMaxVSFanPTs'] = group['FanPTs'].max()
        return group

def aggr_team(group):
        group['TeamStdFanPTs'] = group['TeamStdVSFanPTs'].mean()
        group['TeamAvgFanPTs'] = group['TeamAvgVSFanPTs'].mean()
        group['TeamMaxFanPTs'] = group['TeamMaxVSFanPTs'].mean()
        return group    

def generate_team_features(playerGameLogs, playerFeatureTable, date):
    tmp = playerGameLogs['2015-10-27': date]
    tmp = tmp.reset_index()
    bad_players = playerFeatureTable[playerFeatureTable.Rank=='benchPlayer']['Player_ID']
    interest_cols = ['fullName','Player_ID','Team','OpponentTeam','position1','FanPTs','MIN']
    tmp = tmp[interest_cols]
    tmp = tmp[~tmp['Player_ID'].isin(bad_players)]
    
    newdf = tmp.copy()
    newdf_grouped = newdf.groupby(['Team','OpponentTeam'])
        
    Newdf = newdf_grouped.apply(aggr_teamVSteam)
    Newdf.drop(['fullName','Player_ID','MIN','FanPTs','position1'],inplace=True,axis=1)
    Newdf.drop_duplicates(['Team','OpponentTeam'],inplace=True)
    
    Newdf.drop('OpponentTeam',axis=1,inplace=True)
    
    Newdf2 = Newdf.copy()
    Newdf2_grouped = Newdf2.groupby('Team')
    
    Newdf_overall = Newdf2_grouped.apply(aggr_team)
    Newdf_overall.drop(['TeamStdVSFanPTs','TeamAvgVSFanPTs','TeamMaxVSFanPTs'],inplace=True,axis=1)
    Newdf_overall.drop_duplicates('Team',inplace=True)
    
    return(Newdf_overall)

In [64]:
def drop_y(df):
    # list comprehension of the cols that end with '_y'
    to_drop = [x for x in df if x.endswith('_y')]
    df.drop(to_drop, axis=1, inplace=True)

def rename_x(df):
    for col in df:
        if col.endswith('_x'):
            df.rename(columns={col:col.rstrip('_x')}, inplace=True)

In [82]:
def get_train_test(train_date, test_date): #format like'2/10/2016'
    train_date_index = pd.date_range(start='11/1/2015', end=train_date, freq='D')
    train_df = pd.DataFrame()
    
    alldates = allplayerFantasyGameLogs.index
    trydates = pd.date_range(start='10/27/2015', end='2/29/2016', freq='D')
    s = set(alldates)
    nodates = [x for x in trydates if x not in s]
    
    for idx in train_date_index:
        tmp_idx = idx+1
        if tmp_idx not in nodates and idx not in nodates:
            #aggregate the statistics from players -> player-level features
            trainLogs = allplayerFantasyGameLogs.ix['2015-10-27':idx]
            train_player_df = aggr_stats(idx,trainLogs)   
            #next we need to collect the player's next game Fantasy Points.
            next_date = idx + 1
            tmpLogs = allplayerFantasyGameLogs[['fullName', 'Player_ID','Team','OpponentTeam','FanPTs']].ix[next_date]
            tmpLogs.rename(columns={'FanPTs':'NewGameFanPTs'},inplace=True)
            #join the tmpLogs and player festure table by Player_ID, which is based on the players on a new game day
            newgame_df = pd.merge(tmpLogs,train_player_df,how='inner',on='Player_ID')
            drop_y(newgame_df)
            rename_x(newgame_df)

            #get the team features table 
            train_team_df = generate_team_features(allplayerFantasyGameLogs, train_player_df, idx)
            newgame_df = pd.merge(newgame_df,train_team_df,how='left',on='Team')
            train_df = pd.concat([train_df,newgame_df],axis=0)

    test_date_index = pd.date_range(start=train_date, end=test_date, freq='D')[1:]
    start_test_date = pd.date_range(start=train_date, end=test_date, freq='D')[0]
    test_df = pd.DataFrame()
    for idx in test_date_index:
        tmp_idx = idx+1
        if tmp_idx not in nodates and idx not in nodates:
            #aggregate the statistics from players -> player-level features
            testLogs = allplayerFantasyGameLogs.ix[start_test_date:idx]
            test_player_df = aggr_stats(idx,testLogs)   
            #next we need to collect the player's next game Fantasy Points.
            next_date = idx + 1
            tmpLogs = allplayerFantasyGameLogs[['fullName', 'Player_ID','Team','OpponentTeam','FanPTs']].ix[next_date]
            tmpLogs.rename(columns={'FanPTs':'NewGameFanPTs'},inplace=True)
            #join the tmpLogs and player festure table by Player_ID, which is based on the players on a new game day
            newgame_df = pd.merge(tmpLogs,train_player_df,how='inner',on='Player_ID')
            drop_y(newgame_df)
            rename_x(newgame_df)

            #get the team features table 
            test_team_df = generate_team_features(allplayerFantasyGameLogs, test_player_df, idx)
            newgame_df = pd.merge(newgame_df,test_team_df,how='left',on='Team')
            test_df = pd.concat([test_df,newgame_df],axis=0) 
        
    return(train_df, test_df)

In [83]:
train_set, test_set = get_train_test('2/20/2016', '2/28/2016')

In [86]:
train_set.shape

(16384, 29)

In [87]:
test_set.shape

(1269, 29)

In [88]:
with open('../Data/train_set_02_29.pickle', 'wb') as handle:
  pickle.dump(train_set, handle)

In [89]:
with open('../Data/test_set_02_29.pickle', 'wb') as handle:
  pickle.dump(test_set, handle)

##Prepocessing

In [None]:
with open('../Data/train_set_02_29.pickle', 'rb') as handle:
  train_set = pickle.load(handle)

In [None]:
with open('../Data/test_set_02_29.pickle', 'rb') as handle:
  train_set = pickle.load(handle)

##Modeling

##Testing