## Collect all player game logs (as of 3/12)

In [3]:
import requests
import pickle
from urllib import urlencode
import pandas as pd

In [104]:
import numpy as np
from sets import Set

In [6]:
# Load the pickled player-id table and wrap it in a DataFrame.
with open('../Data/playid_df.pickle','rb') as data_file:
    playid_data = pickle.load(data_file)
playid_df = pd.DataFrame(playid_data)
# Column of player ids; this is what gets passed to collectAllPlayerGameLogs.
ids_ls = playid_df['playerId']

In [8]:
def collectAllPlayerGameLogs(ids_ls, season='2015-16', seasontype='Regular Season', timeout=30):
    """Download the per-game logs of every player in ``ids_ls``.

    One request per player id is sent to the stats.nba.com ``playergamelog``
    endpoint and the first result set of each response is turned into a
    DataFrame.

    Parameters
    ----------
    ids_ls : iterable
        Player ids to query.
    season : str
        Value sent as the ``Season`` query parameter.
    seasontype : str
        Value sent as the ``SeasonType`` query parameter.
    timeout : float
        Seconds to wait for each response before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        All per-player frames stacked row-wise; an empty DataFrame when
        ``ids_ls`` is empty.

    Raises
    ------
    requests.RequestException
        On timeouts, connection errors or non-2xx responses.
    """
    leagueid = '00'
    url = "http://stats.nba.com/stats/playergamelog?"
    # A browser-like user agent is necessary for this endpoint. It is passed
    # as the headers mapping itself: header values must be strings, so the
    # mapping must not be nested inside another dict.
    u_a = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'}
    allplayerGameLogs_df_ls = []

    for playerid in ids_ls:
        api_param = (('LeagueID', leagueid), ('PlayerID', playerid),
                     ('Season', season), ('SeasonType', seasontype))
        response = requests.get(url, params=api_param, headers=u_a, timeout=timeout)
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        result_set = response.json()['resultSets'][0]
        allplayerGameLogs_df_ls.append(
            pd.DataFrame(result_set['rowSet'], columns=result_set['headers']))

    # pd.concat raises on an empty list, so handle "no players" explicitly.
    if not allplayerGameLogs_df_ls:
        return pd.DataFrame()
    return pd.concat(allplayerGameLogs_df_ls, axis=0)

In [9]:
# Download the game logs for every player id and show the frame's (rows, columns).
allplayerGameLogs_0313 = collectAllPlayerGameLogs(ids_ls)
allplayerGameLogs_0313.shape

(20875, 27)

In [10]:
# Persist the downloaded raw game logs.
with open('../Data/allplayerGameLogs_0313.pickle', 'wb') as handle:
  pickle.dump(allplayerGameLogs_0313, handle)

In [238]:
# Second download of the same player ids, kept under the 0314 name.
allplayerGameLogs_0314 = collectAllPlayerGameLogs(ids_ls)

In [239]:
# (rows, columns) of the second download.
allplayerGameLogs_0314.shape

(20984, 27)

## Data munging to combine the different data sources

In [11]:
# Continue under a date-free name; `del` removes only the old name, the
# DataFrame object itself stays alive through `allplayerGameLogs`.
allplayerGameLogs = allplayerGameLogs_0313
del allplayerGameLogs_0313

In [31]:
def data_cleaner(allplayerGameLogs):
    """Turn raw player game logs into the fantasy-points game-log table.

    Steps:
      1. parse GAME_DATE, drop VIDEO_AVAILABLE and add the DouBL / TriBL flags
         and the FanPTs score for every row;
      2. attach ``fullName`` from the module-level ``playid_df``;
      3. attach ``position1`` from '../Data/allPlayerBios.pickle';
      4. derive Team / OpponentTeam / HomeGame from MATCHUP;
      5. patch the known missing name for Player_ID 2403.

    Parameters
    ----------
    allplayerGameLogs : pandas.DataFrame
        Raw logs as returned by ``collectAllPlayerGameLogs``. The frame is
        not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        One row per player-game, sorted by GAME_DATE, with GAME_DATE as a
        regular column.
    """
    # Work on a copy so the caller's frame is left untouched and the function
    # can be run again on the same input (the `del` below would otherwise
    # raise KeyError on the second call).
    logs = allplayerGameLogs.copy()

    #data cleaning 1: add calculated Fantasy Points for each player in each game
    logs['GAME_DATE'] = pd.to_datetime(logs['GAME_DATE'])
    del logs['VIDEO_AVAILABLE']

    # A double-double / triple-double is double digits in at least two / three
    # of the five categories. Counting the categories covers every
    # combination, including the ones a hand-written list of pairs can miss
    # (e.g. AST+STL, AST+BLK, REB+STL).
    double_digit_count = (logs[['PTS', 'REB', 'AST', 'STL', 'BLK']] >= 10).sum(axis=1)
    logs['DouBL'] = double_digit_count >= 2
    logs['TriBL'] = double_digit_count >= 3

    logs['FanPTs'] = (3.5 * logs['FG3M'] + 1 * logs['FTM']
                      + 2 * (logs['FGM'] - logs['FG3M'])
                      + 1.25 * logs['REB'] + 1.5 * logs['AST']
                      + 2 * logs['STL'] + 2 * logs['BLK']
                      + 1.5 * logs['DouBL'] + 3 * logs['TriBL']
                      - 0.5 * logs['TOV'])

    # Sort chronologically via the index, then bring GAME_DATE back as a column.
    allplayerFantasyGameLogs = logs.set_index('GAME_DATE').sort_index(axis=0).reset_index()

    #data cleaning 2: make sure the players info we collected correspond to our player_id list.
    allplayerFantasyGameLogs = pd.merge(allplayerFantasyGameLogs,
                                        playid_df[['playerId', 'fullName']],
                                        left_on='Player_ID', right_on='playerId', how='left')
    del allplayerFantasyGameLogs['playerId']

    #data cleaning 3: add position information for each player
    with open('../Data/allPlayerBios.pickle', 'rb') as handle:
        playerBios = pickle.load(handle)
    allplayerFantasyGameLogs = pd.merge(allplayerFantasyGameLogs,
                                        playerBios[['PERSON_ID', 'position1']],
                                        left_on='Player_ID', right_on='PERSON_ID', how='left')
    del allplayerFantasyGameLogs['PERSON_ID']

    #data cleaning 4: add team information for each player
    # MATCHUP is split on single spaces: token 0 is the player's team, token 2
    # the opponent, and token 1 is '@' for an away game.
    allplayerFantasyGameLogs['Team'] = allplayerFantasyGameLogs['MATCHUP'].map(lambda x: x.split(' ')[0])
    allplayerFantasyGameLogs['OpponentTeam'] = allplayerFantasyGameLogs['MATCHUP'].map(lambda x: x.split(' ')[2])
    allplayerFantasyGameLogs['HomeGame'] = allplayerFantasyGameLogs['MATCHUP'].map(lambda x: 0 if x.split(' ')[1]=='@' else 1)

    #data cleaning 5: Fix some missing values
    missing_nene = (allplayerFantasyGameLogs.fullName.isnull()) & (allplayerFantasyGameLogs.Player_ID==2403)
    allplayerFantasyGameLogs.loc[missing_nene, 'fullName'] = 'Nene Hilario'

    return allplayerFantasyGameLogs

In [32]:
# Build the fantasy game-log table from the first download.
allplayerFantasyGameLogs = data_cleaner(allplayerGameLogs)

In [240]:
# Same cleaning applied to the second download.
allplayerFantasyGameLogs_0314 = data_cleaner(allplayerGameLogs_0314)

In [242]:
# Preview the first rows of the cleaned table.
allplayerFantasyGameLogs_0314.head()

Unnamed: 0,GAME_DATE,SEASON_ID,Player_ID,Game_ID,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,PTS,PLUS_MINUS,DouBL,TriBL,FanPTs,fullName,position1,Team,OpponentTeam,HomeGame
0,2015-10-27,22015,2738,21500003,GSW vs. NOP,W,23,1,3,0.333,...,2,1,False,False,14.25,Andre Iguodala,SF,GSW,NOP,1
1,2015-10-27,22015,2544,21500002,CLE @ CHI,L,36,12,22,0.545,...,25,1,True,False,48.5,LeBron James,SF,CLE,CHI,0
2,2015-10-27,22015,201565,21500002,CHI vs. CLE,W,32,8,22,0.364,...,18,-1,False,False,27.75,Derrick Rose,PG,CHI,CLE,1
3,2015-10-27,22015,201168,21500001,ATL vs. DET,L,16,2,5,0.4,...,4,-3,False,False,11.0,Tiago Splitter,C,ATL,DET,1
4,2015-10-27,22015,203484,21500001,DET @ ATL,W,37,7,14,0.5,...,21,17,False,False,30.5,Kentavious Caldwell-Pope,SG,DET,ATL,0


In [245]:
# Index by game date (in place) so rows can be selected by date.
allplayerFantasyGameLogs_0314.set_index("GAME_DATE",inplace=True)

In [250]:
# All rows played on 2016-03-13. `.loc` replaces the `.ix` indexer, which has
# been removed from pandas; for label lookups the two select the same rows.
test = allplayerFantasyGameLogs_0314.loc['2016-03-13']

In [None]:
# NOTE(review): this cell was never executed (In [None]). It indexes `test`
# with the fullName column itself rather than with a boolean condition, which
# looks unfinished -- confirm the intended filter before running it.
test[test.fullName]

In [35]:
# Persist the cleaned fantasy game logs built from the first download.
with open('../Data/allplayerFantasyGameLogs_0313.pickle', 'wb') as handle:
  pickle.dump(allplayerFantasyGameLogs, handle)

## Build the player features/stats table for modeling

In [42]:
# Calendar month of each game, taken from the parsed GAME_DATE.
allplayerFantasyGameLogs['GameMonth'] = allplayerFantasyGameLogs['GAME_DATE'].map(lambda dd: dd.month)

In [44]:
# Index by game date (in place); the functions below slice this frame by date.
allplayerFantasyGameLogs.set_index('GAME_DATE', inplace=True)

def aggr(group):
    """Collapse one player's game rows into a single feature row.

    Parameters
    ----------
    group : pandas.DataFrame
        All rows of one player, oldest game first (the "Last*" columns and
        ``Team`` are read from the final row). Needs the columns FanPTs, PTS,
        MIN, FGM, FGA, FG3M, FG3A, REB, AST, STL, TOV, PF, PLUS_MINUS,
        fullName, Player_ID, Team and position1.

    Returns
    -------
    pandas.DataFrame
        One row, indexed like the last row of ``group``, holding per-stat
        averages, last-game values, last-three-game averages and the
        player's identity columns.
    """
    # (source column, name of its "last game" feature). The names, including
    # the irregular 'LastPT', and the absence of a last-game STL feature are
    # kept as they are so the output columns stay unchanged.
    stat_features = [
        ('PTS', 'LastPT'),
        ('MIN', 'LastMIN'),
        ('FGM', 'LastFGM'),
        ('FGA', 'LastFGA'),
        ('FG3M', 'LastFG3M'),
        ('FG3A', 'LastFG3A'),
        ('REB', 'LastREB'),
        ('AST', 'LastAST'),
        ('STL', None),
        ('TOV', 'LastTOV'),
        ('PF', 'LastPF'),
        ('PLUS_MINUS', 'LastPLUS_MINUS'),
    ]

    last_game = group.iloc[-1:]
    test_df = pd.DataFrame(index=last_game.index)

    test_df['LastFanPTs'] = last_game['FanPTs']
    test_df['AvgFanPTs'] = group['FanPTs'].mean()
    for stat, last_name in stat_features:
        test_df['Avg' + stat] = group[stat].mean()
        if last_name is not None:
            test_df[last_name] = last_game[stat]

    last_three = group.iloc[-3:]
    test_df['Last3GameAvgFanPTs'] = last_three['FanPTs'].mean()
    test_df['Last3GameAvgMIN'] = last_three['MIN'].mean()
    test_df['Last3GameAvgPTS'] = last_three['PTS'].mean()

    test_df['fullName'] = group['fullName'].unique()[0]
    test_df['Player_ID'] = group['Player_ID'].unique()[0]
    # Current team = team of the most recent game. Taking the last entry of
    # unique() is wrong for a player who returns to a former team, because
    # unique() orders teams by first appearance.
    test_df['Team'] = group['Team'].iloc[-1]
    test_df['position1'] = group['position1'].unique()[0]

    return test_df
    
def aggr_stats(date, allplayerFantasyGameLogs, start_date='2015-10-27'):
    """Build the player-level feature table from games up to ``date``.

    Parameters
    ----------
    date : str or pandas.Timestamp
        Last game date (inclusive) to aggregate.
    allplayerFantasyGameLogs : pandas.DataFrame
        Cleaned game logs indexed by GAME_DATE.
    start_date : str or pandas.Timestamp
        First game date (inclusive) to aggregate.

    Returns
    -------
    pandas.DataFrame
        One row per player (the output of ``aggr``), in order of first
        appearance, plus a categorical ``Rank`` column binned from AvgFanPTs.

    Raises
    ------
    ValueError
        If no games fall inside the requested window.
    """
    interest_columns = ['fullName','Player_ID','Team','position1','MIN','PTS','FGM','FGA', 'FG3M','FG3A',
                        'REB','AST','STL','TOV','PF','PLUS_MINUS','DouBL','TriBL','FanPTs']
    # `.loc` replaces the removed `.ix`; label slices include both endpoints.
    window = allplayerFantasyGameLogs.loc[start_date:date]

    playerID_tmp = window.reset_index()
    grouped = playerID_tmp[interest_columns].groupby('Player_ID')
    player_ids = playerID_tmp['Player_ID'].unique()
    if len(player_ids) == 0:
        raise ValueError('no games between %s and %s' % (start_date, date))

    # Build every per-player row first and concatenate once; growing a frame
    # inside the loop copies it on every iteration.
    Newdf = pd.concat([aggr(grouped.get_group(player_id)) for player_id in player_ids], axis=0)

    bins = [-10, 10, 20, 30, 40, 100]
    group_names = ['benchPlayer','belowAvg','average','advanced','top']
    Newdf['Rank'] = pd.cut(Newdf['AvgFanPTs'], bins, labels=group_names)

    return Newdf

In [45]:
def aggr_teamVSteam(group):
    """Annotate one (Team, OpponentTeam) group with its FanPTs summary.

    Adds the std, mean and max of the group's FanPTs as constant columns on
    ``group`` itself and returns that same frame.
    """
    fan_pts = group['FanPTs']
    summaries = (
        ('TeamStdVSFanPTs', fan_pts.std()),
        ('TeamAvgVSFanPTs', fan_pts.mean()),
        ('TeamMaxVSFanPTs', fan_pts.max()),
    )
    for column, value in summaries:
        group[column] = value
    return group

def aggr_team(group):
    """Annotate one Team group with the mean of its per-matchup summaries.

    Each Team*VSFanPTs column is averaged over the group and written back to
    ``group`` as the matching Team*FanPTs column; the same frame is returned.
    """
    column_pairs = (
        ('TeamStdVSFanPTs', 'TeamStdFanPTs'),
        ('TeamAvgVSFanPTs', 'TeamAvgFanPTs'),
        ('TeamMaxVSFanPTs', 'TeamMaxFanPTs'),
    )
    for source, target in column_pairs:
        group[target] = group[source].mean()
    return group

def generate_team_features(playerGameLogs, playerFeatureTable, date):
    """Build team-level FanPTs features from games up to ``date``.

    For every (Team, OpponentTeam) pair the std / mean / max of FanPTs is
    computed over all non-bench player rows; these per-matchup values are
    then averaged per Team.

    Parameters
    ----------
    playerGameLogs : pandas.DataFrame
        Cleaned game logs indexed by GAME_DATE.
    playerFeatureTable : pandas.DataFrame
        Player feature table with ``Player_ID`` and ``Rank``; players ranked
        'benchPlayer' are excluded.
    date : str or pandas.Timestamp
        Last game date (inclusive) to use; the window starts at 2015-10-27.

    Returns
    -------
    pandas.DataFrame
        One row per Team with the columns Team, TeamStdFanPTs, TeamAvgFanPTs
        and TeamMaxFanPTs, in order of the team's first appearance.
    """
    window = playerGameLogs.loc['2015-10-27':date].reset_index()
    bad_players = playerFeatureTable[playerFeatureTable.Rank == 'benchPlayer']['Player_ID']
    interest_cols = ['fullName','Player_ID','Team','OpponentTeam','position1','FanPTs','MIN']
    games = window[interest_cols]
    games = games[~games['Player_ID'].isin(bad_players)].copy()

    # Per-matchup statistics. `transform` broadcasts each group's value back
    # onto its rows without relying on a mutating callback in groupby.apply
    # that needs the grouping columns to be present in each group.
    matchup_pts = games.groupby(['Team', 'OpponentTeam'])['FanPTs']
    games['TeamStdVSFanPTs'] = matchup_pts.transform('std')
    games['TeamAvgVSFanPTs'] = matchup_pts.transform('mean')
    games['TeamMaxVSFanPTs'] = matchup_pts.transform('max')
    matchups = games[['Team', 'OpponentTeam', 'TeamStdVSFanPTs',
                      'TeamAvgVSFanPTs', 'TeamMaxVSFanPTs']]
    matchups = matchups.drop_duplicates(['Team', 'OpponentTeam'])

    # Average the per-matchup statistics over all opponents of each team.
    per_team = matchups.groupby('Team')
    Newdf_overall = matchups[['Team']].copy()
    Newdf_overall['TeamStdFanPTs'] = per_team['TeamStdVSFanPTs'].transform('mean')
    Newdf_overall['TeamAvgFanPTs'] = per_team['TeamAvgVSFanPTs'].transform('mean')
    Newdf_overall['TeamMaxFanPTs'] = per_team['TeamMaxVSFanPTs'].transform('mean')
    Newdf_overall = Newdf_overall.drop_duplicates('Team')

    return Newdf_overall

In [46]:
def drop_y(df):
    """Remove, in place, every column of ``df`` whose name ends with '_y'."""
    suffixed = list(filter(lambda name: name.endswith('_y'), df))
    df.drop(suffixed, axis=1, inplace=True)

def rename_x(df):
    """Strip the '_x' merge suffix from the column names of ``df`` in place.

    Only the two-character suffix is removed. ``str.rstrip('_x')`` is not
    suitable here: it strips any trailing run of '_' and 'x' characters, so
    'max_x' would become 'ma' instead of 'max'.
    """
    # Collect all renames first so the columns are not modified while they
    # are being iterated.
    renames = {col: col[:-2] for col in df.columns if col.endswith('_x')}
    if renames:
        df.rename(columns=renames, inplace=True)

In [61]:
def clean_trainingdata(data):
    """Prepare the joined player/team table for modeling.

    Copies ``Rank`` into ``Rank_dup`` on the passed frame, drops rows ranked
    'benchPlayer' and one-hot encodes the categorical columns. Returns the
    encoded frame.
    """
    data['Rank_dup'] = data['Rank']
    keep_rows = data.Rank != 'benchPlayer'
    categorical_cols = ['Team', 'OpponentTeam', 'position1', 'HomeGame', 'Rank', 'GameMonth']
    return pd.get_dummies(data[keep_rows], columns=categorical_cols)

In [57]:
from sklearn import cross_validation, metrics

In [60]:
def get_train_test(train_date): #format like'2/10/2016'
    """Assemble the modeling table and split it into train / validation parts.

    For every day ``d`` from 11/10/2015 to ``train_date`` on which games were
    played both on ``d`` and on ``d + 1``, player and team features are
    aggregated from all games up to ``d`` and joined to the players' actual
    FanPTs on ``d + 1`` (the target ``NewGameFanPTs``).

    Reads the module-level ``allplayerFantasyGameLogs`` (indexed by
    GAME_DATE, with the GameMonth column already added).

    Parameters
    ----------
    train_date : str
        Last feature day, e.g. '2/10/2016'.

    Returns
    -------
    tuple
        ``(train_df, test_df, y_train, y_test)`` from a 70/30 split with
        ``random_state=1``.

    Raises
    ------
    ValueError
        If no pair of consecutive game days exists in the requested range.
    """
    one_day = pd.Timedelta(days=1)
    train_date_index = pd.date_range(start='11/10/2015', end=train_date, freq='D')

    # Days on which at least one game was played. Checking membership in this
    # set directly needs no hand-maintained season end date.
    game_dates = set(allplayerFantasyGameLogs.index)
    next_game_cols = ['fullName', 'Player_ID', 'Team', 'OpponentTeam', 'HomeGame', 'FanPTs', 'GameMonth']

    daily_frames = []
    for idx in train_date_index:
        # Adding an integer to a Timestamp is no longer supported by pandas.
        next_date = idx + one_day
        if idx not in game_dates or next_date not in game_dates:
            continue

        #aggregate the statistics from players -> player-level features
        trainLogs = allplayerFantasyGameLogs.loc['2015-10-27':idx]
        train_player_df = aggr_stats(idx, trainLogs)

        #next we need to collect the player's next game Fantasy Points.
        # Slicing with next_date:next_date always yields a DataFrame, even
        # when only a single row was played that day.
        tmpLogs = allplayerFantasyGameLogs[next_game_cols].loc[next_date:next_date]
        tmpLogs = tmpLogs.rename(columns={'FanPTs': 'NewGameFanPTs'})

        #join the tmpLogs and player feature table by Player_ID, which is based on the players on a new game day
        newgame_df = pd.merge(tmpLogs, train_player_df, how='inner', on='Player_ID')
        drop_y(newgame_df)
        rename_x(newgame_df)

        #get the team features table
        train_team_df = generate_team_features(allplayerFantasyGameLogs, train_player_df, idx)
        newgame_df = pd.merge(newgame_df, train_team_df, how='left', on='Team')
        daily_frames.append(newgame_df)

    if not daily_frames:
        raise ValueError('no consecutive game days found up to %s' % train_date)
    # Concatenate once instead of growing the frame inside the loop.
    data = pd.concat(daily_frames, axis=0)

    data = clean_trainingdata(data)
    target = 'NewGameFanPTs'
    predictors = [x for x in data.columns if x not in [target]]

    X_train = data[predictors]
    y_train = data[target]

    # NOTE: uses the `cross_validation` module imported at file level, which
    # only exists in scikit-learn releases before 0.20.
    train_df, test_df, y_train, y_test = cross_validation.train_test_split(
        X_train, y_train, test_size=0.3, random_state=1)
    return (train_df, test_df, y_train, y_test)
# Note: test_df is really a validation set, used to control overfitting of our models.
# The "real" test set consists only of the games played on a new day.

In [62]:
# Build the modeling table with feature days up to 3/11/2016 and split it.
train_df,test_df,y_train,y_test = get_train_test('3/11/2016')

In [68]:
# Wrap the target values in a one-column DataFrame.
y_train = pd.DataFrame(y_train,columns=['NewGameFanPTs'])

In [70]:
# Same wrapping for the validation targets.
y_test = pd.DataFrame(y_test,columns=['NewGameFanPTs'])

In [73]:
# Write features and targets to CSV; the test_df / y_test split is saved
# under "valid" file names.
train_df.to_csv('../Data/train_df_0313.csv',index=False)
y_train.to_csv('../Data/y_train_0313.csv',index=False)
test_df.to_csv('../Data/valid_df_0313.csv',index=False)
y_test.to_csv('../Data/y_valid_0313.csv',index=False)

## Prepare the experiment test set

In [233]:
def get_experiment_set(games_list, gamedate): #games_list = ['HOU@GWS','SAC@CLE'], gamedate='3/16/2016' (the day before newgameday for feature table)
    """Build the model-input table for the players of an upcoming game day.

    Reads the module-level ``allplayerFantasyGameLogs`` (indexed by
    GAME_DATE, with the GameMonth column already added).

    Parameters
    ----------
    games_list : list of str
        Games written as 'AWAY@HOME', e.g. ['HOU@GWS', 'SAC@CLE'].
    gamedate : str
        'month/day/year' date up to which features are aggregated; its month
        is also used as the GameMonth of the new games.

    Returns
    -------
    pandas.DataFrame
        One row per non-bench player of the listed teams, one-hot encoded,
        with zero-filled dummy columns added for every team, opponent team
        and month seen in the logs but absent from this game day.
    """
    OpponentTeam = [x.split('@')[0] for x in games_list]
    HomeTeam = [x.split('@')[1] for x in games_list]
    teams = HomeTeam + OpponentTeam
    home_teams = set(HomeTeam)
    matchup_map = {}
    for (x,y) in zip(HomeTeam,OpponentTeam):
        matchup_map[x]=y
        matchup_map[y]=x

    GameMonth = gamedate.split('/')[0]

    #get all the players in the games today
    playing_today = allplayerFantasyGameLogs['Team'].isin(teams)
    todayPlayers_df = allplayerFantasyGameLogs.loc[
        playing_today, ['fullName','Player_ID','position1','Team']].drop_duplicates()

    todayPlayers_df['HomeGame'] = todayPlayers_df['Team'].map(lambda x: 1 if x in home_teams else 0)
    todayPlayers_df['OpponentTeam'] = todayPlayers_df['Team'].map(lambda x: matchup_map[x])
    todayPlayers_df['GameMonth'] = GameMonth

    #integrate the player feature table
    # `.loc` replaces the removed `.ix` indexer.
    tmptrainLogs = allplayerFantasyGameLogs.loc['2015-10-27':pd.Timestamp(gamedate)]
    train_player_df = aggr_stats(gamedate,tmptrainLogs)

    #join the information together
    newgame_df = pd.merge(todayPlayers_df,train_player_df,how='inner',on='Player_ID')
    drop_y(newgame_df)
    rename_x(newgame_df)

    #include the team-level features
    train_team_df = generate_team_features(allplayerFantasyGameLogs, train_player_df, gamedate)
    newgame_df = pd.merge(newgame_df,train_team_df,how='left',on='Team')
    ntest_df = newgame_df[newgame_df.Rank!='benchPlayer']

    #clean up the data frame for modeling. Note that we should fill out the data attributes with 0.
    var_to_encode = ['Team','OpponentTeam','position1','HomeGame','Rank','GameMonth']
    ntest_df = pd.get_dummies(ntest_df, columns=var_to_encode)

    # Built-in sets replace the Python-2-only `sets.Set`. The differences are
    # sorted so the added columns come out in a reproducible order.
    playing = set(teams)
    notin_team_ls = sorted(set(allplayerFantasyGameLogs.Team.unique()) - playing)
    notin_oppteam_ls = sorted(set(allplayerFantasyGameLogs.OpponentTeam.unique()) - playing)
    notin_mon_ls = sorted(set(allplayerFantasyGameLogs.GameMonth.unique()) - set([int(GameMonth)]))

    names_ls = (['Team_'+ele for ele in notin_team_ls]
                + ['OpponentTeam_'+ele for ele in notin_oppteam_ls]
                + ['GameMonth_'+str(ele) for ele in notin_mon_ls])

    # Zero-filled columns for the categories that do not occur today; both
    # frames get a 0..n-1 index so they line up in the column-wise concat.
    nrow = ntest_df.shape[0]
    dummy_df = pd.DataFrame(np.zeros((nrow, len(names_ls))), columns=names_ls)
    ntest_df = ntest_df.reset_index(drop=True)

    ntest_df = pd.concat([ntest_df,dummy_df],axis=1)

    return ntest_df

In [234]:
# Experiment set for five games, with features aggregated up to 3/12/2016.
ttdf = get_experiment_set(['CLE@LAC','IND@ATL', 'UTA@SAC', 'MIL@BKN', 'NYK@LAL'], '3/12/2016')

In [235]:
# (rows, columns) of the experiment set.
ttdf.shape

(110, 110)

In [237]:
# Save the experiment set without the index column.
ttdf.to_csv('../Data/experiment_test_df_0313.csv',index=False)