This notebook generates the final data set for machine learning. It cleans the header data and the time-series data and combines them into a single set of features.

In [9]:
import pandas as pd
import numpy as np

In [11]:
# Header columns holding numeric values; each one is later divided by its column maximum.
scaleColumns = [
    'EntryFeeAmount',
    'TotalPrizeAmount',
    'MaxNumberPlayers',
    'MaxEntriesPerUser',
    'DistinctUsers',
    'NumGames',
    'DraftablePlayersInSet',
    'PaidUsersInDraftGroup',
    'TopPrize',
    'MaxPayoutPosition',
    'Duration',
]
# Header columns holding categorical values; each one is later one-hot encoded.
categoricalColumns = [
    'SportName',
    'VariantName',
    'Contest_Group',
]

In [12]:
# Load the contest header data, then attach each contest's row from Durations.csv.
# The left join keeps every header row even when no matching ContestId exists in Durations.csv.
rawDF = pd.read_csv('data/WorkingData.csv')
durationsByContest = pd.read_csv('data/Durations.csv').set_index('ContestId')
rawDF = rawDF.merge(durationsByContest, on='ContestId', how='left')

In [13]:
# Report every raw column that is neither a numeric nor a categorical feature.
print("These are the columns not used in the final dataset:")
unusedColumns = rawDF.drop(columns=scaleColumns + categoricalColumns).columns
print(unusedColumns.tolist())

These are the columns not used in the final dataset:
['ContestId', 'DraftGroupId', 'GameSet', 'ContestName', 'ContestStartDatetimeEST', 'ContestEndDatetimeEST', 'ContestPayoutDatetimeEST', 'Entries']


In [14]:
# Count the contests whose final entry count differs from their player cap.
unfilledCount = int((rawDF['MaxNumberPlayers'] != rawDF['Entries']).sum())
print("Contests that do not fill to goal:", str(unfilledCount) + "/" + str(len(rawDF)))

Contests that do not fill to goal: 54481/630446


In [15]:
# 'SOC'/'SOCC' and 'PGA'/'GOLF' are synonymous labels, so fold each pair into one value.
# The replacement matches whole cell values anywhere in the frame.
SPORT_SYNONYMS = {'SOC': 'SOCC', 'PGA': 'GOLF'}
rawDF = rawDF.replace(SPORT_SYNONYMS)

In [26]:
# Work on an independent, ContestId-indexed copy so rawDF itself stays untouched.
processedDF = rawDF.set_index('ContestId').copy(deep=True)

### Header Data Stuff

579223 contests hit 98%

575965 contests hit 100%

3258 contests change their success value by thresholding at 98%

In [23]:
## Header data processing
metaColumns = []
successCol = []

# A contest is marked successful only when its entries exactly equal its player cap.
processedDF['Success'] = (processedDF['MaxNumberPlayers'] == processedDF['Entries'])
successCol.append('Success')

successDF = processedDF[['Success']]

## Scale numeric data by max of each column
for scol in scaleColumns:
    scaledName = scol + "_Scaled"
    processedDF[scaledName] = processedDF[scol] / processedDF[scol].max()
    metaColumns.append(scaledName)

## 1-hot encoding for categorical contests
for scol in categoricalColumns:
    # dropna(): a missing category gets no indicator column (rows with a missing
    # value are simply False in every indicator). Without it, a NaN from the CSV
    # would make the string concatenation below raise TypeError.
    for val in processedDF[scol].dropna().unique():
        # str(): tolerate non-string category labels when building the column name.
        catStr = scol + "_" + str(val)
        processedDF[catStr] = (processedDF[scol] == val)
        metaColumns.append(catStr)

# Final header feature table: the scaled numeric columns followed by the indicators.
metaDF = processedDF[metaColumns]

In [9]:
# Persist the header feature table; the ContestId index is written as the first column.
metaDF.to_csv(path_or_buf='data/MetaData.csv')

### Time Series Pred Stuff

In [28]:
#The Kalman Filter and WLS pred columns are all labeled "Pred_*".
#Now they will be labeled "Pred_KF_*" and "Pred_LR_*"
def renamePredColumns(df, prefix):
    """Return a copy of ``df`` with the model tag inserted into prediction column names.

    Every column whose name starts with 'Pred' is renamed to
    'Pred_' + prefix + '_' + <the rest of the name after those four characters>.
    All other columns keep their names.

    The remainder is kept verbatim, so a column named 'Pred_x' becomes
    'Pred_<prefix>__x' (the original separator is retained).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected; it is not modified.
    prefix : str
        Tag identifying the model, e.g. 'KF' or 'LR'.

    Returns
    -------
    pandas.DataFrame
        The renamed frame.
    """
    mapping = {
        name: "Pred_" + prefix + "_" + name[4:]
        for name in df.columns
        if name[:4] == 'Pred'
    }
    return df.rename(columns=mapping)

In [29]:
#The Kalman Filter and WLS AB columns are all labeled "A_*"/"B_*". 
#Now they will be labeled "A_KF_*"/"B_KF_*" and "A_LR_*"/"B_LR_*"
def renameABColumns(df, prefix):
    """Return a copy of ``df`` with the model tag inserted into the A/B column names.

    A column is treated as an "A" column when its name ends with 'A' (checked
    first) or starts with 'A'. The rest of the name is its version. The column
    is renamed to 'A_' + prefix + '_' + version, and its "B" partner (the same
    name with only that one marker character swapped for 'B') is renamed to
    'B_' + prefix + '_' + version. Partner names that are not present in ``df``
    are ignored by ``DataFrame.rename``.

    Only the single marker character is swapped. A blanket
    ``str.replace('A', 'B')`` would also rewrite any other 'A' in the version
    or in ``prefix``, producing wrong names and missing the real partner column.

    The version is kept verbatim, so 'A_x' becomes 'A_<prefix>__x' (the original
    separator is retained). Any column that merely starts or ends with 'A' is
    matched as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected; it is not modified.
    prefix : str
        Tag identifying the model, e.g. 'KF' or 'LR'.

    Returns
    -------
    pandas.DataFrame
        The renamed frame.
    """
    renameDict = {}
    for col in df.columns:
        if col[-1:] == 'A':
            # Suffix style: '<version>A' pairs with '<version>B'.
            version = col[:-1]
            partner = version + 'B'
        elif col[:1] == 'A':
            # Prefix style: 'A<version>' pairs with 'B<version>'.
            version = col[1:]
            partner = 'B' + version
        else:
            # Not an A column (this also covers an empty column name).
            continue
        renameDict[col] = "A_" + prefix + "_" + version
        renameDict[partner] = "B_" + prefix + "_" + version
    return df.rename(columns=renameDict)


In [25]:
def getABDF(dfs):
    """Print the length of each item in ``dfs``, one per line, in order.

    Parameters
    ----------
    dfs : iterable
        Objects that support ``len()``.

    Returns
    -------
    None
    """
    for size in map(len, dfs):
        print(size)

In [26]:
#Limits columns in a DF to the largest value storable in np.float32.
#This is to ensure that the machine learning algorithms can process the data
# Largest finite value representable as np.float32; used as the default cap below.
MAX_FLOAT = np.finfo(np.float32).max
def limitVal(df, ignore = None, N = MAX_FLOAT):
    """Return a copy of ``df`` in which values above ``N`` are replaced by ``N``.

    Only an upper bound is applied; values below ``N`` (including large negative
    ones) are left as they are. Missing values stay missing instead of being
    turned into ``N``: a NaN fails the ``<= N`` comparison, so without the
    explicit ``isna()`` guard it would be overwritten with the cap.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    ignore : list of column labels, optional
        Columns to leave untouched (for example non-numeric columns that cannot
        be compared with ``N``). Defaults to no columns.
    N : number, optional
        Upper cap. Defaults to the largest finite np.float32 value.

    Returns
    -------
    pandas.DataFrame
        The capped copy.
    """
    # None replaces the mutable [] default; passing a list works exactly as before.
    skipped = [] if ignore is None else ignore
    capped = df.copy(deep=True)
    for c in capped.drop(columns=skipped).columns:
        column = capped[c]
        keep = column.isna() | (column <= N)
        capped[c] = column.where(keep, N)
    return capped

In [41]:
# Selects the divisor used when the "Pred*_Scaled" columns are built further down:
# True divides each prediction by 100, False divides it by the contest's MaxNumberPlayers.
scaled = True

In [36]:
def _loadPredictions(csvPath, modelTag):
    """Read one prediction CSV indexed by ContestId, drop its Duration column,
    and tag its Pred and A/B columns with ``modelTag``."""
    preds = pd.read_csv(csvPath).set_index('ContestId').drop(columns=['Duration'])
    preds = renamePredColumns(preds, modelTag)
    return renameABColumns(preds, modelTag)

# The two prediction sources go through the same loading and renaming steps.
lrDF = _loadPredictions('data/LR_Preds.csv', 'LR')
kfDF = _loadPredictions('data/KF_Preds.csv', 'KF')

In [38]:
# Attach the KF and then the LR prediction columns to every raw contest row
# (left joins keep contests that have no predictions), then index by ContestId.
predDF = (
    rawDF
    .merge(kfDF, on='ContestId', how='left')
    .merge(lrDF, on='ContestId', how='left')
    .set_index('ContestId')
)

In [45]:
def _addScaledPredColumns(sourceColumns):
    """Add a '<col>_Scaled' column to predDF for every 'Pred*' name in ``sourceColumns``.

    The divisor is 100 when the module-level ``scaled`` flag is true, otherwise
    the contest's MaxNumberPlayers. Returns the column names to select, in the
    order of ``sourceColumns``: the scaled label for each prediction column and
    the unchanged name for every other column.
    """
    selected = []
    for name in sourceColumns:
        if name[:4] != 'Pred':
            selected.append(name)
            continue
        scaledName = name + "_Scaled"
        divisor = 100 if scaled else predDF["MaxNumberPlayers"]
        predDF[scaledName] = predDF[name] / divisor
        selected.append(scaledName)
    return selected

# Kalman-filter features: drop contests with any missing value, then cap at float32 max.
kfColumns = _addScaledPredColumns(kfDF.columns)
kfPredsDF = predDF[kfColumns]
kfPredsDF = limitVal(kfPredsDF.dropna(), N=np.finfo(np.float32).max)

# Linear-regression features: same treatment.
lrColumns = _addScaledPredColumns(lrDF.columns)
lrPredsDF = predDF[lrColumns]
lrPredsDF = limitVal(lrPredsDF.dropna(), N=np.finfo(np.float32).max)

In [49]:
# Persist each model's cleaned feature table (the ContestId index is written as the first column).
for outputPath, predsFrame in (
    ('data/KF_Results.csv', kfPredsDF),
    ('data/LR_Results.csv', lrPredsDF),
):
    predsFrame.to_csv(outputPath)

In [50]:
# Join header features with both prediction tables on ContestId. Left joins keep
# every contest, including those with no surviving prediction row.
featureTable = (
    metaDF
    .merge(kfPredsDF, on='ContestId', how='left')
    .merge(lrPredsDF, on='ContestId', how='left')
)
# Put the Success label first by joining the feature table onto it.
workedData = successDF.merge(featureTable, on='ContestId', how='left')

In [53]:
# Persist the final machine-learning data set (label, header features and prediction features).
workedData.to_csv(path_or_buf='data/WorkedData.csv')