## Make sure to run Chunking Series and Durations first. This will not be accurate if those files are not up to date.

In [2]:
import pandas as pd
import numpy as np

In [9]:
# Inspection only: load the MetaData.csv left on disk by an earlier run.
# NOTE(review): this notebook rewrites data/MetaData.csv further down, so the
# frame read here can be stale relative to the current inputs.
df = pd.read_csv('data/MetaData.csv')

In [12]:
# Number of columns in the loaded metadata, not counting the ContestId key.
len(df.columns.drop("ContestId"))

61

In [2]:
# Numeric columns that get a "<name>_Scaled" counterpart (value / column max).
scaleColumns = [
    'EntryFeeAmount',
    'TotalPrizeAmount',
    'MaxNumberPlayers',
    'MaxEntriesPerUser',
    'DistinctUsers',
    'NumGames',
    'DraftablePlayersInSet',
    'PaidUsersInDraftGroup',
    'TopPrize',
    'MaxPayoutPosition',
    'Duration',
]

# Categorical columns that are expanded into one boolean column per value.
categoricalColumns = [
    'SportName',
    'VariantName',
    'Contest_Group',
]

In [3]:
# Contest-level working data, extended with the columns of Durations.csv.
# Left join on ContestId: contests missing from Durations.csv are kept and
# simply get NaN in the added columns.
rawDF = pd.read_csv('data/WorkingData.csv')
rawDF = rawDF.merge(
    pd.read_csv('data/Durations.csv').set_index('ContestId'),
    on='ContestId',
    how='left',
)

In [4]:
# Inspection only: list the rawDF columns that are in neither scaleColumns
# nor categoricalColumns.
rawDF.drop(columns=scaleColumns).drop(columns=categoricalColumns).columns

Index(['ContestId', 'DraftGroupId', 'GameSet', 'ContestName',
       'ContestStartDatetimeEST', 'ContestEndDatetimeEST',
       'ContestPayoutDatetimeEST', 'Entries'],
      dtype='object')

In [77]:
# Inspection only: total number of rows in rawDF.
len(rawDF)

630446

In [78]:
# Inspection only: number of rows whose Entries equals MaxNumberPlayers
# (the same condition that defines the 'Success' column below).
len(rawDF[rawDF['MaxNumberPlayers'] == rawDF['Entries']])

575965

In [79]:
# Rewrite two codes wherever a cell equals them exactly: 'SOC' -> 'SOCC' and
# 'PGA' -> 'GOLF'.
# NOTE(review): the replacement is applied to every column of rawDF, not only
# to SportName -- confirm that no other column can hold these exact values.
rawDF = (
    rawDF
    .replace('SOC', 'SOCC')
    .replace('PGA', 'GOLF')
)

In [80]:
# Work on a deep copy indexed by ContestId so that the derived columns added
# below never end up in rawDF.
processedDF = rawDF.copy(deep=True).set_index('ContestId')

In [81]:
# metaColumns collects the names of the derived feature columns;
# successCol holds the name of the target column.
metaColumns = []
successCol = ['Success']

# Target: True when the number of entries equals the maximum number of players.
processedDF['Success'] = processedDF['MaxNumberPlayers'].eq(processedDF['Entries'])
successDF = processedDF[['Success']]

# Numeric features: each column divided by its own maximum.
for scol in scaleColumns:
    scaledName = scol + "_Scaled"
    processedDF[scaledName] = processedDF[scol].div(processedDF[scol].max())
    metaColumns.append(scaledName)

# Categorical features: one boolean indicator column per distinct value,
# named "<column>_<value>", in order of first appearance.
for scol in categoricalColumns:
    for val in processedDF[scol].unique():
        catStr = scol + "_" + val
        processedDF[catStr] = processedDF[scol].eq(val)
        metaColumns.append(catStr)

# Feature table: only the derived columns, still indexed by ContestId.
metaDF = processedDF[metaColumns]

In [82]:
# Persist the feature table (the ContestId index plus the columns listed in
# metaColumns).
metaDF.to_csv('data/MetaData.csv')

In [83]:
def renamePredColumns(df, prefix):
    """Return a new frame in which every ``Pred*`` column carries ``prefix``.

    A column whose name starts with ``Pred`` is renamed to
    ``Pred_<prefix>_<rest>``, where ``<rest>`` is the original name without
    its first four characters. Every other column keeps its name, and the
    frame passed in is left unchanged.
    """
    mapping = {
        name: "Pred_" + prefix + "_" + name[4:]
        for name in df.columns
        if name[:4] == 'Pred'
    }
    return df.rename(columns=mapping)



In [84]:
def renameABColumns(df, prefix):
    """Return a new frame with paired A/B columns tagged by ``prefix``.

    A column is treated as the "A" member of a pair when its name ends with
    ``A`` (checked first) or starts with ``A``. The rest of the name is the
    pair's version ``<v>``. The A column is renamed to ``A_<prefix>_<v>``
    and its partner -- the same name with only that marker letter changed to
    ``B`` -- is renamed to ``B_<prefix>_<v>``. A partner that is not present
    in ``df`` is simply ignored by the rename.

    Columns that are not strings, or are empty strings, are left untouched.
    The frame passed in is not modified.
    """
    renameDict = {}
    for col in df.columns:
        # Guard against labels that cannot be inspected character-wise.
        if not isinstance(col, str) or not col:
            continue

        if col.endswith('A'):
            version = col[:-1]
            partner = version + 'B'
        elif col.startswith('A'):
            version = col[1:]
            partner = 'B' + version
        else:
            continue

        # Build both names explicitly instead of replacing every 'A' with 'B':
        # a blanket replace would also rewrite an 'A' inside the prefix or the
        # version (e.g. prefix 'ARIMA' would become 'BRIMB').
        renameDict[col] = "A_" + prefix + "_" + version
        renameDict[partner] = "B_" + prefix + "_" + version
    return df.rename(columns=renameDict)


In [85]:
def getABDF(dfs):
    """Debug helper: print the length of each item in ``dfs``, one per line."""
    for rowCount in map(len, dfs):
        print(rowCount)
# getABDF([lrDF, kfDF])

In [86]:
# Largest finite float32 value; used as the default cap.
MAX_FLOAT = np.finfo(np.float32).max


def limitVal(df, ignore=None, N=MAX_FLOAT):
    """Return a deep copy of ``df`` whose column values are capped at ``N``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to cap. It is not modified.
    ignore : list of column labels, optional
        Columns to leave untouched. They are still present in the result.
        Defaults to no columns.
    N : scalar, optional
        Upper bound. Defaults to the largest finite float32 value.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every entry of a non-ignored column that does
        not satisfy ``value <= N`` has been replaced by ``N``.

    Notes
    -----
    NaN never satisfies ``value <= N``, so missing values are replaced by
    ``N`` as well. Drop or fill them beforehand if that is not wanted.
    """
    # A None default avoids sharing one mutable list between calls.
    skipped = [] if ignore is None else ignore
    capped = df.copy(deep=True)
    for c in capped.drop(columns=skipped).columns:
        capped[c] = capped[c].where(capped[c] <= N, N)
    return capped

### Raw

In [87]:
#UNSCALED
lrDF = pd.read_csv('data/unscaled/LR_Preds.csv').set_index('ContestId').drop(columns=['Duration', 'Unnamed: 0'])
lrDF = renamePredColumns(lrDF, 'LR')
lrDF = renameABColumns(lrDF, 'LR')

kfDF = pd.read_csv('data/unscaled/KF_Preds.csv').set_index('ContestId').drop(columns=['Duration'])
kfDF = renamePredColumns(kfDF, 'KF')
kfDF = renameABColumns(kfDF, 'KF')

In [88]:
# Attach both models' predictions to the contest data. Left joins keep every
# contest in rawDF, including those without predictions.
predDF = (
    rawDF
    .merge(kfDF, on='ContestId', how='left')
    .merge(lrDF, on='ContestId', how='left')
    .set_index('ContestId')
)

In [89]:
# For every 'Pred*' column add a '<name>_Scaled' column holding the prediction
# divided by MaxNumberPlayers; all other model columns are passed through as-is.
# NOTE(review): rows are not dropna()'d here, so limitVal replaces any NaN
# (e.g. contests with no prediction after the left merge) with N -- confirm
# that this is intended.

kfColumns = [col + "_Scaled" if col[:4] == 'Pred' else col for col in kfDF.columns]
for col in kfDF.columns:
    if col[:4] == 'Pred':
        predDF[col + "_Scaled"] = predDF[col] / predDF["MaxNumberPlayers"]

kfPredsDF = limitVal(predDF[kfColumns], N=np.finfo(np.float32).max)

lrColumns = [col + "_Scaled" if col[:4] == 'Pred' else col for col in lrDF.columns]
for col in lrDF.columns:
    if col[:4] == 'Pred':
        predDF[col + "_Scaled"] = predDF[col] / predDF["MaxNumberPlayers"]

lrPredsDF = limitVal(predDF[lrColumns], N=np.finfo(np.float32).max)

In [90]:
# Persist the per-model result tables built from the unscaled predictions.
kfPredsDF.to_csv('data/unscaled/KF_Results.csv')
lrPredsDF.to_csv('data/unscaled/LR_Results.csv')

In [91]:
# Combined table: Success first, then the metadata features, then the KF and
# LR result columns. All joins are left joins on ContestId.
workedData = successDF.merge(
    metaDF
    .merge(kfPredsDF, on='ContestId', how='left')
    .merge(lrPredsDF, on='ContestId', how='left'),
    on='ContestId',
    how='left',
)

In [92]:
# Persist the combined table for the unscaled predictions.
workedData.to_csv('data/unscaled/WorkedData.csv')

### Scaled

In [93]:
#SCALED
lrDF = pd.read_csv('data/scaled/LR_Preds.csv').set_index('ContestId')#.drop(columns=['Duration', 'Unnamed: 0'])
lrDF = renamePredColumns(lrDF, 'LR')
lrDF = renameABColumns(lrDF, 'LR')

kfDF = pd.read_csv('data/scaled/KF_Preds.csv').set_index('ContestId')
kfDF = renamePredColumns(kfDF, 'KF')
kfDF = renameABColumns(kfDF, 'KF')

In [94]:
# Attach both models' predictions to the contest data. Left joins keep every
# contest in rawDF, including those without predictions.
predDF = (
    rawDF
    .merge(lrDF, on='ContestId', how='left')
    .merge(kfDF, on='ContestId', how='left')
    .set_index('ContestId')
)

In [95]:
# For every 'Pred*' column add a '<name>_Scaled' column holding the prediction
# divided by 100 (presumably the predictions are percentages -- TODO confirm);
# all other model columns are passed through as-is. Rows with any NaN in the
# selected columns are dropped before the values are capped.

kfColumns = [col + "_Scaled" if col[:4] == 'Pred' else col for col in kfDF.columns]
for col in kfDF.columns:
    if col[:4] == 'Pred':
        predDF[col + "_Scaled"] = predDF[col] / 100

kfPredsDF = limitVal(predDF[kfColumns].dropna())

lrColumns = [col + "_Scaled" if col[:4] == 'Pred' else col for col in lrDF.columns]
for col in lrDF.columns:
    if col[:4] == 'Pred':
        predDF[col + "_Scaled"] = predDF[col] / 100

lrPredsDF = limitVal(predDF[lrColumns].dropna())

In [96]:
# Persist the per-model result tables built from the scaled predictions.
kfPredsDF.to_csv('data/scaled/KF_Results.csv')
lrPredsDF.to_csv('data/scaled/LR_Results.csv')

In [97]:
# Combined table: Success first, then the metadata features, then the KF and
# LR result columns. All joins are left joins on ContestId, so contests whose
# prediction rows were dropped above are kept with NaN in those columns.
workedData = successDF.merge(
    metaDF
    .merge(kfPredsDF, on='ContestId', how='left')
    .merge(lrPredsDF, on='ContestId', how='left'),
    on='ContestId',
    how='left',
)

In [98]:
# Persist the combined table for the scaled predictions.
workedData.to_csv('data/scaled/WorkedData.csv')