In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the working contest data, drop every row that has a missing value,
# and index the frame by ContestId.
contests = (
    pd.read_csv('data/WorkingData.csv')
    .dropna()
    .set_index('ContestId')
)

In [4]:
def processCategories(df, columns, verbose=False):
    """One-hot encode the given categorical columns.

    For every distinct non-null value ``v`` found in a column ``c`` the result
    gets a 0/1 integer column named ``c_v``. Indicator columns are emitted in
    the order of ``columns`` and, within one column, in order of first
    appearance of each value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns listed in ``columns``.
    columns : list of str
        Names of the categorical columns to encode.
    verbose : bool, default False
        Print one progress line per encoded column.

    Returns
    -------
    pandas.DataFrame
        The indicator columns only (the source columns are not included),
        on the same index as ``df``.
    """
    localDF = df[columns].copy(deep=True)
    indicators = {}
    for col in columns:
        if verbose:
            print("Splitting", col)
        # Missing values get no indicator of their own; such rows are simply
        # 0 in every indicator of this column (same as pd.get_dummies' default).
        for val in localDF[col].dropna().unique():
            # str() so numeric / boolean category values can form a column name.
            indicators[col + '_' + str(val)] = 1 * (localDF[col] == val)
    # Build the frame once instead of copying it for every category value;
    # the explicit index keeps the row labels even when nothing was encoded.
    return pd.DataFrame(indicators, index=localDF.index)

In [5]:
def timeSubstring(myStr):
    """Strip the 'Contest' prefix and 'DatetimeEST' suffix from a column name.

    The slice is purely positional: it removes len('Contest') leading and
    len('DatetimeEST') trailing characters without checking their content,
    e.g. 'ContestStartDatetimeEST' -> 'Start'.
    """
    return myStr[len('Contest'):-len('DatetimeEST')]


def processTimes(df, columns, verbose=False, formatString = '%Y-%m-%d %H:%M:%S'):
    """Expand datetime columns into scaled calendar features.

    Each column in ``columns`` is parsed with ``formatString`` and replaced by
    eight float columns named ``<Part>_<Feature>``, where ``<Part>`` is
    ``timeSubstring(column)``:

    ============  ==========================================  =========
    Feature       Raw value                                   Divisor
    ============  ==========================================  =========
    Minute        minute of the hour (0-59)                   60
    Hour          hour of the day (0-23)                      24
    Day           day of the month (1-31)                     31
    Month         month (1-12)                                12
    DayOfWeek     strftime ``%w`` (Sunday=0 ... Saturday=6)   7
    DayOfYear     day of the year (1-366)                     365
    WeekOfYear    strftime ``%U`` (Sunday-first week, 0-53)   52
    Year          year - 2000                                 100
    ============  ==========================================  =========

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns listed in ``columns``.
    columns : list of str
        Names of the datetime columns to expand.
    verbose : bool, default False
        Print one progress line per processed column.
    formatString : str
        ``strftime``-style format passed to ``pandas.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        Only the derived feature columns, grouped per source column in the
        order of ``columns``, on the same index as ``df``.
    """
    localDF = df[columns].copy(deep=True)
    for col in columns:
        newColName = timeSubstring(col)
        if verbose:
            print("Processing", newColName, "("+ col +")")
        stamps = pd.to_datetime(localDF[col], format=formatString)

        localDF[newColName+'_Minute'] = stamps.dt.minute / 60.0
        localDF[newColName+'_Hour'] = stamps.dt.hour / 24.0
        # Day of month must come from the day itself, not from the scaled hour.
        localDF[newColName+'_Day'] = stamps.dt.day / 31.0
        localDF[newColName+'_Month'] = stamps.dt.month / 12.0
        # %w / %U are kept via strftime: the .dt attributes use Monday-first /
        # ISO conventions, which would change the encoded values.
        localDF[newColName+'_DayOfWeek'] = stamps.dt.strftime('%w').astype(float) / 7.0
        # .dt.dayofyear instead of strftime('%-j'): the '-' flag is a
        # platform-specific extension and is not available everywhere.
        localDF[newColName+'_DayOfYear'] = stamps.dt.dayofyear / 365.0
        localDF[newColName+'_WeekOfYear'] = stamps.dt.strftime('%U').astype(float) / 52.0
        # Assuming contests are between 2000 and 2100
        localDF[newColName+'_Year'] = (stamps.dt.year - 2000.0) / 100.

        localDF = localDF.drop(columns=[col])
    return localDF

In [6]:
def scaleColumns(df, columns, verbose=False, zScore=False):
    """Rescale numeric columns, returning them as ``<column>_Scaled``.

    With ``zScore=False`` each column is min-max scaled to [0, 1]:
    ``(x - min) / (max - min)``. With ``zScore=True`` it is standardised:
    ``(x - mean) / std`` (pandas' default sample standard deviation).

    A column whose spread (range or std) is zero or undefined - e.g. a
    constant column - is scaled to 0.0 instead of dividing by zero and
    filling the column with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns listed in ``columns``.
    columns : list of str
        Names of the numeric columns to scale.
    verbose : bool, default False
        Print one progress line per scaled column.
    zScore : bool, default False
        Standardise instead of min-max scaling.

    Returns
    -------
    pandas.DataFrame
        Only the ``*_Scaled`` columns, in the order of ``columns``, on the
        same index as ``df``.
    """
    localDF = df[columns].copy(deep=True)
    scaled = {}
    for col in columns:
        if verbose:
            print("Scaling", col)
        values = localDF[col]
        if zScore:
            center = values.mean()
            spread = values.std()
        else:
            center = values.min()
            spread = values.max() - center
        # Zero spread (constant column) or NaN spread (e.g. std of one row):
        # divide by 1 so the column becomes 0.0 rather than all-NaN, which a
        # later dropna() would turn into an empty dataset.
        if not spread > 0:
            spread = 1.0
        scaled[col + "_Scaled"] = (values - center) / spread
    return pd.DataFrame(scaled, index=localDF.index)

In [7]:
# Input columns read by successAndFillGapDollars; none of them appear in its result.
successColumns = ['MaxNumberPlayers', 'EntryFeeAmount', 'Entries', 'TotalPrizeAmount']
def successAndFillGapDollars(df, verbose=False):
    """Derive the Success, FillGapDollars and TrueLoss columns for each row.

    * ``Success``        - True where Entries equals MaxNumberPlayers.
    * ``FillGapDollars`` - (MaxNumberPlayers - Entries) * EntryFeeAmount.
    * ``TrueLoss``       - TotalPrizeAmount - Entries * EntryFeeAmount.

    ``df`` must contain every column in ``successColumns``. When ``verbose``
    is true a progress line is printed before each computation. Returns a new
    frame holding only the three derived columns, on ``df``'s index.
    """
    work = df[successColumns].copy(deep=True)
    capacity = work['MaxNumberPlayers']
    entered = work['Entries']
    fee = work['EntryFeeAmount']
    prize = work['TotalPrizeAmount']

    if verbose:
        print("Calculating Success")
    work['Success'] = capacity == entered

    if verbose:
        print("Calculating FillGapDollar")
    work['FillGapDollars'] = (capacity - entered) * fee

    if verbose:
        print("Calculating TrueLoss")
    work['TrueLoss'] = prize - (entered * fee)

    return work.drop(columns=successColumns)

In [25]:
# Build each feature group from `contests`; every helper returns a frame on the same index.
successDF = successAndFillGapDollars(contests, verbose=True)

scaleFeatures = ['EntryFeeAmount', 'TotalPrizeAmount', 'MaxNumberPlayers', 'MaxEntriesPerUser', 'NumGames', 'DraftablePlayersInSet', 'PaidUsersInDraftGroup', 'TopPrize', 'MaxPayoutPosition']
scaledDF = scaleColumns(contests, scaleFeatures, verbose=True, zScore=False)

timeDF = processTimes(contests, ['ContestStartDatetimeEST', 'ContestEndDatetimeEST'], verbose=True)

categoryDF = processCategories(contests, ['SportName', 'VariantName', 'Contest_Group'], verbose=True)

# Frames to be merged, in merge order.
toCombine = [successDF, scaledDF, timeDF, categoryDF]

Calculating Success
Calculating FillGapDollar
Calculating TrueLoss
Scaling EntryFeeAmount
Scaling TotalPrizeAmount
Scaling MaxNumberPlayers
Scaling MaxEntriesPerUser
Scaling NumGames
Scaling DraftablePlayersInSet
Scaling PaidUsersInDraftGroup
Scaling TopPrize
Scaling MaxPayoutPosition
Processing Start (ContestStartDatetimeEST)
Processing End (ContestEndDatetimeEST)
Splitting SportName
Splitting VariantName
Splitting Contest_Group


# Saving the combined features to CSV

In [27]:
# Left-join every feature frame onto the first one by ContestId, then drop
# any row that ended up with a missing value.
# None is the "nothing merged yet" sentinel: comparing against a string
# literal with `is` depends on interning and raises SyntaxWarning on Python 3.8+.
combinedDF = None
for df in toCombine:
    if combinedDF is None:
        combinedDF = df
    else:
        combinedDF = pd.merge(combinedDF, df, on='ContestId', how='left')
combinedDF = combinedDF.dropna()

In [33]:
import datetime
# Captured once when this cell runs; newName() reuses the same timestamp on every call.
now = datetime.datetime.now()
def newName():
    """Return the file stem 'Contests_<year>-<month>-<day>_<hour>-<minute>' built from ``now``.

    The date and time components are not zero-padded.
    """
    datePart = "-".join(str(part) for part in (now.year, now.month, now.day))
    timePart = "-".join(str(part) for part in (now.hour, now.minute))
    return "Contests_" + datePart + "_" + timePart

In [34]:
# Write the combined feature frame to data/New/<newName()>.csv.
combinedDF.to_csv("".join(["data/New/", newName(), ".csv"]))

In [35]:
# Number of rows in the combined frame once rows with missing values are excluded.
combinedDF.dropna().shape[0]

373115

### Viz Data