In [1]:
import pandas as pd
import numpy as np

In [2]:
contests = pd.read_csv('data/WorkingData.csv')[:]
print("There are " + str(len(contests)) + " Contests")

There are 372211 Contests


In [3]:
features = contests.columns.drop('Unnamed: 0').drop('Entries').drop('DistinctUsers')
features = features.drop('GameSet').drop('ContestName').drop('is_vault')
contests = contests[features].set_index('ContestId')

In [4]:
contests.head(1)

Unnamed: 0_level_0,SportName,VariantName,ContestStartDatetimeEST,ContestEndDatetimeEST,ContestPayoutDatetimeEST,EntryFeeAmount,TotalPrizeAmount,MaxNumberPlayers,MaxEntriesPerUser,Contest_Group,NumGames,DraftablePlayersInSet,PaidUsersInDraftGroup,TopPrize,MaxPayoutPosition
ContestId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
7963004,NFL,Classic,2015-09-13 13:00:00.000,2015-09-14 22:20:00.000,2015-09-15 02:27:24.000,0.25,25000.0,117500,100,Headliner,15,910.0,668396,2000.0,23665


In [5]:
def categorizeFeature(df, name):
    newDF = df[[name]]
    cats = {}
    for cat in contests[name].unique():
        cats[name] = (str(name)+'_'+str(cat))
        newDF = newDF.assign(newCol = lambda x: (newDF[name] == cat).astype(float)).rename(columns = {'newCol':cats[name]})    
    newDF = newDF.drop(columns=[name], axis=1)
    return newDF

In [6]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
def scaleFeature(df, name):
    return df.assign(newCol=scaler.fit_transform(df[[name]])).rename(columns={'newCol': name+'_Scaled'}).drop(columns=[name], axis=1)

In [7]:
import math
def circularizeFeature(df, feature):
    fStr = feature[:-len('scaled')]
    df = df.assign(X= lambda c: np.sin(df[feature]*2*math.pi)).assign(Y= lambda c: np.cos(df[feature]*2*math.pi))
    df = df.rename(columns={'X': fStr+'X', 'Y': fStr+'Y'}).drop(columns=[feature], axis=1)
    return df


In [8]:
def thresholdFeature(df):
    df = df.assign(SuccessThreshold= lambda x:(df['TotalPrizeAmount'].astype(float)/(df['EntryFeeAmount'].astype(float))).astype(int))
    return df.drop(columns=['TotalPrizeAmount', 'EntryFeeAmount'], axis=1)

In [9]:
formatString = '%Y-%m-%d %H:%M:%S'
def mapTime2(df, name, colHeader):
    newDF = df[[name]]
    
    newDF = newDF.assign(timeCol = lambda x: pd.to_datetime(df[name], format=formatString))
    newDF[colHeader+'_Minute'] = newDF.timeCol.apply(lambda x: x.strftime('%M'))
    newDF[colHeader+'_Hour'] = newDF.timeCol.apply(lambda x: x.strftime('%H'))
    newDF[colHeader+'_Day'] = newDF.timeCol.apply(lambda x: x.strftime('%d'))
    newDF[colHeader+'_Month'] = newDF.timeCol.apply(lambda x: x.strftime('%m'))
    newDF[colHeader+'_Year'] = newDF.timeCol.apply(lambda x: x.strftime('%Y'))
    newDF[colHeader+'_DayOfWeek'] = newDF.timeCol.apply(lambda x: x.strftime('%w'))
    newDF[colHeader+'_DayOfYear'] = newDF.timeCol.apply(lambda x: x.strftime('%-j'))
    newDF[colHeader+'_WeekOfYear'] = newDF.timeCol.apply(lambda x: x.strftime('%U'))
    newDF = newDF.drop(columns = [name, 'timeCol'], axis=1)
    return newDF

In [67]:
categoricalFeatures = ['SportName', 'VariantName', 'Contest_Group']
timeFeatures = ['ContestStartDatetimeEST', 'ContestEndDatetimeEST', 'ContestPayoutDatetimeEST']
scaleFeatures = ['EntryFeeAmount', 'TotalPrizeAmount', 'MaxNumberPlayers', 'MaxEntriesPerUser', 
                       'NumGames', 'DraftablePlayersInSet', 'PaidUsersInDraftGroup', 'TopPrize', 'MaxPayoutPosition']
mapper = {'ContestStartDatetimeEST' : 'Start', 'ContestEndDatetimeEST' : 'End', 'ContestPayoutDatetimeEST': 'Payout'}
# scaleTimeFeatures
def engineerFeatures(df, startIndex, endIndex):
    subDF = df[startIndex: endIndex]

    cDF = subDF[categoricalFeatures]

    newCDF = cDF[[]]
    for f in categoricalFeatures: 
        tempDF = categorizeFeature(cDF, f)
        newCDF = newCDF.join(tempDF, on='ContestId', how='left')
    newCDF = newCDF.dropna()    
    
    
    threshDF = subDF[['TotalPrizeAmount', 'EntryFeeAmount']]
    threshDF = thresholdFeature(threshDF).dropna()

    tDF = subDF[timeFeatures]
    newTDF = tDF
    for f in timeFeatures:
        tempDF = mapTime2(tDF, f, mapper[f])
        newTDF = tempDF.join(newTDF, on='ContestId', how='left')
#         print('newTDF', len(newTDF.index))
#     newTDF = newTDF.drop(columns=tDF.columns, axis=1)
    
    scaleFeaturesDF = subDF[scaleFeatures]

    collectDF = newTDF.join(subDF, on='ContestId', how='inner')
    collectDF = scaleFeaturesDF.join(newCDF, on='ContestId', how='inner')
    collectDF = threshDF.join(collectDF, on='ContestId', how='inner')
#     collectDF = pd.merge(collectDF, subDF, on='ContestId', how='left')
#     collectDF = collectDF.drop(columns=timeFeatures, axis=1).drop(columns=categoricalFeatures, axis=1)

    print('collectDF', len(collectDF))
#     collectDF = scaleAndLoopFeatures(collectDF)

    return collectDF

In [69]:
scaleFeatures = ['EntryFeeAmount', 'TotalPrizeAmount', 'MaxNumberPlayers', 'MaxEntriesPerUser', 'NumGames', 'DraftablePlayersInSet', 'PaidUsersInDraftGroup', 'TopPrize', 'MaxPayoutPosition']
scaleTimeFeatures = ['Start_Minute', 'Start_Hour', 'Start_Day', 'Start_Month', 'Start_Year', 'Start_DayOfWeek', 'Start_DayOfYear', 'Start_WeekOfYear', 'End_Minute','End_Hour', 'End_Day', 'End_Month', 'End_Year', 'End_DayOfWeek','End_DayOfYear', 'End_WeekOfYear', 'Payout_Minute', 'Payout_Hour','Payout_Day', 'Payout_Month', 'Payout_Year', 'Payout_DayOfWeek','Payout_DayOfYear', 'Payout_WeekOfYear']

def scaleAndLoopFeatures(df):
    sDF = df[scaleFeatures]
    for f in scaleFeatures:
        print('scaling:', f)
        sDF = scaleFeature(sDF, f)
#     sDF = scaleFeature(sDF, "SuccessThreshold")
    stDF = df[scaleTimeFeatures]
    for f in scaleTimeFeatures:
        print('time scaling:', f)
        stDF = scaleFeature(stDF, f)
        stDF = circularizeFeature(stDF, f+'_Scaled')
    
#     return pd.merge(sDF, stDF, on='ContestId')
    return sDF, stDF
# ty = scaleAndLoopFeatures(combinedDF)

In [70]:
stepSize = 10000
dfs = []
def genCSV():
    numContests = len(contests.index)
#     numContests = 50000
    for index in range(0, numContests, stepSize):
        end = index+stepSize-1
        if end > numContests:
            end = numContests
        print("Gathering", index, "through", end)

        df = engineerFeatures(contests, index, end)
        print("gathered: ", len(df))
        dfs.append(df)
        ###############Saving Interim Files!!################
#         df.to_csv('data/CreateData/'+str(index)+".csv")
        #####################################################

In [71]:
genCSV()

Gathering 0 through 9999


ValueError: columns overlap but no suffix specified: Index(['ContestStartDatetimeEST', 'ContestEndDatetimeEST',
       'ContestPayoutDatetimeEST'],
      dtype='object')

In [56]:
dfSum = 0
for df in dfs:
#     print("-",len(df.index))
    dfSum = dfSum + len(df)
dfSum
# dfs[1].index

376638

In [57]:
combinedDF = pd.DataFrame()
for df in dfs:
    combinedDF = combinedDF.append(df, sort = False)
combinedDF = combinedDF.dropna()

In [58]:
len(combinedDF)

376534

In [59]:
combinedDF.tail()

Unnamed: 0_level_0,SuccessThreshold,EntryFeeAmount,TotalPrizeAmount,MaxNumberPlayers,MaxEntriesPerUser,NumGames,DraftablePlayersInSet,PaidUsersInDraftGroup,TopPrize,MaxPayoutPosition,...,Contest_Group_MultiPlayer,Contest_Group_DoubleUp,Contest_Group_Satellite,Contest_Group_Qualifier,Contest_Group_50/50,Contest_Group_H2H,Contest_Group_LiveEvent,Contest_Group_3Max,Contest_Group_Multiplier,Contest_Group_DFSLeagues
ContestId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
58300381,150,1.0,150.0,178,5,3,298.0,6701,30.0,49,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58300538,50,1.0,50.0,59,1,3,298.0,6701,12.5,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58300644,50,1.0,50.0,59,1,3,298.0,6701,12.5,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58300656,50,1.0,50.0,59,1,3,298.0,6701,12.5,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58300706,50,1.0,50.0,59,1,1,100.0,4820,12.5,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
# dfTuple = scaleAndLoopFeatures(combinedDF)

In [65]:
# scaledDF = dfTuple[0]
# timesDF = dfTuple[1]

In [66]:
# len(contests)