In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
# Load the contest table; the path is relative to the notebook's working directory.
CONTESTS_CSV = 'data/ContestsWithPaths.csv'
contests = pd.read_csv(CONTESTS_CSV)

In [3]:
# Preview the first five rows of the raw contest table.
contests.head(n=5)

Unnamed: 0.1,Unnamed: 0,ContestId,SportName,VariantName,GameSet,ContestName,ContestStartDatetimeEST,ContestEndDatetimeEST,ContestPayoutDatetimeEST,EntryFeeAmount,...,MaxEntriesPerUser,Entries,DistinctUsers,Contest_Group,NumGames,DraftablePlayersInSet,PaidUsersInDraftGroup,TopPrize,MaxPayoutPosition,fileName
0,0,7963004,NFL,Classic,(Main),NFL $25K Quarter Arcade [Just $0.25!],2015-09-13 13:00:00.000,2015-09-14 22:20:00.000,2015-09-15 02:27:24.000,0.25,...,100,117500.0,38925,Headliner,15,910.0,668396,2000.0,23665,2015-09
1,1,7963437,NFL,Classic,(Main),"NFL GIANT $2 Booster [Top 1,250 Win $20]",2015-09-13 13:00:00.000,2015-09-14 22:20:00.000,2015-09-15 02:27:24.000,2.0,...,100,14375.0,9663,Booster,15,910.0,668396,20.0,1250,2015-09
2,2,8296224,NFL,Classic,(Main),Beginner NFL $5K Spy [Single Entry],2015-09-13 13:00:00.000,2015-09-14 22:20:00.000,2015-09-15 02:27:24.000,100.0,...,1,55.0,55,SingleEntry,15,910.0,668396,1100.0,11,2015-09
3,3,8296543,NFL,Classic,(Main),NFL $50 Double Up [$5K Gtd] (Multi-Entry),2015-09-13 13:00:00.000,2015-09-14 22:20:00.000,2015-09-15 02:27:24.000,50.0,...,11,111.0,79,FeaturedDoubleUp,15,910.0,668396,100.0,50,2015-09
4,4,8296544,NFL,Classic,(Main),NFL $50 Double Up [$5K Gtd] (Multi-Entry),2015-09-13 13:00:00.000,2015-09-14 22:20:00.000,2015-09-15 02:27:24.000,50.0,...,11,111.0,91,FeaturedDoubleUp,15,910.0,668396,100.0,50,2015-09


In [4]:
def contestThreshold(id, data=None):
    """Return TopPrize / EntryFeeAmount for a single contest as a float.

    Parameters
    ----------
    id : value compared against the 'ContestId' column.
    data : DataFrame, optional
        Table to search; defaults to the notebook-level ``contests`` frame.

    Raises
    ------
    KeyError
        If no row has the requested ContestId.
    ValueError
        If more than one row has the requested ContestId.
    """
    aContest = getContestData(id, data)
    if len(aContest) == 0:
        raise KeyError("no contest with ContestId %r" % (id,))
    if len(aContest) > 1:
        raise ValueError(
            "ContestId %r matches %d rows; expected exactly one" % (id, len(aContest))
        )
    # Divide as Series first (same arithmetic as before), then pull out the
    # scalar explicitly: calling float() directly on a Series is deprecated.
    ratio = aContest['TopPrize'] / aContest['EntryFeeAmount']
    return float(ratio.iloc[0])


def getContestData(id, data=None):
    """Return the rows whose 'ContestId' equals ``id``.

    Parameters
    ----------
    id : value compared against the 'ContestId' column.
    data : DataFrame, optional
        Table to search; defaults to the notebook-level ``contests`` frame.

    Returns
    -------
    DataFrame holding the matching rows (empty when nothing matches).
    """
    frame = contests if data is None else data
    return frame.loc[frame['ContestId'] == id]

In [5]:
# Threshold: the top prize expressed as a multiple of the entry fee.
contests = contests.assign(
    Threshold=contests['TopPrize'].div(contests['EntryFeeAmount'])
)

In [6]:
# Success label: gap / |gap| is +1 when Entries exceeds Threshold and -1 when
# it falls short, and (1 + sign) / 2 maps that onto 1.0 / 0.0.
# NOTE(review): a zero gap (Entries == Threshold) evaluates to 0/0 = NaN, and so
# does a missing or infinite gap; confirm that leaving those rows unlabeled
# (they are removed by the later dropna) is intended.
entry_gap = contests['Entries'] - contests['Threshold']
contests = contests.assign(Success=(1 + entry_gap / entry_gap.abs()) / 2)

In [7]:
# Columns carried into the modelling table: numeric contest attributes plus
# the derived 'Threshold' and the 'Success' target.
CLF_Features = [
    'EntryFeeAmount',
    'TotalPrizeAmount',
    'MaxNumberPlayers',
    'MaxEntriesPerUser',
    'NumGames',
    'DraftablePlayersInSet',
    'PaidUsersInDraftGroup',
    'TopPrize',
    'MaxPayoutPosition',
    'Threshold',
    'Success',
]
# Keep only those columns, then discard every row that has a missing value.
CLF_contests = contests.filter(items=CLF_Features, axis=1).dropna(axis=0, how='any')

In [8]:
# Preview the first five rows of the modelling table.
CLF_contests.head(n=5)

Unnamed: 0,EntryFeeAmount,TotalPrizeAmount,MaxNumberPlayers,MaxEntriesPerUser,NumGames,DraftablePlayersInSet,PaidUsersInDraftGroup,TopPrize,MaxPayoutPosition,Threshold,Success
0,0.25,25000.0,117500,100,15,910.0,668396,2000.0,23665,8000.0,1.0
1,2.0,25000.0,14375,100,15,910.0,668396,20.0,1250,10.0,1.0
2,100.0,5000.0,55,1,15,910.0,668396,1100.0,11,11.0,1.0
3,50.0,5000.0,111,11,15,910.0,668396,100.0,50,2.0,1.0
4,50.0,5000.0,111,11,15,910.0,668396,100.0,50,2.0,1.0


In [9]:
# Snapshot of CLF_contests' column labels at this point in the notebook.
# NOTE(review): this includes the 'Success' target (selected via CLF_Features),
# so it is not a pure predictor list; confirm before using it as one.
feature_names = CLF_contests.columns

In [10]:
# Randomly flag roughly 75% of the rows for training. The draw comes from a
# seeded generator so the split, and every metric computed from it, is the
# same on each Restart & Run All (the classifier is already seeded).
split_rng = np.random.RandomState(0)
CLF_contests['is_train'] = split_rng.uniform(0, 1, len(CLF_contests)) <= .75

# 'is_train' is already boolean, so it is used directly as the row mask.
train = CLF_contests[CLF_contests['is_train']]
test = CLF_contests[~CLF_contests['is_train']]
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 348607
Number of observations in the test data: 116097


In [11]:
# Target vectors for the two partitions, plus the predictor column labels
# (every column except the split flag and the target itself).
y = train['Success']
true = test['Success']
features = CLF_contests.columns.drop(['is_train', 'Success'])

In [12]:
# Fit a random forest on the training rows; random_state pins the forest's
# own randomness and n_jobs=2 trains on two workers.
clf = RandomForestClassifier(random_state=0, n_jobs=2)
clf.fit(X=train[features], y=y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [13]:
# Predict the Success label (1.0 / 0.0) for every held-out contest.
preds = clf.predict(test[features])
true = test['Success']

# Compare the predicted and actual labels for the first five test rows.
print('Predicted Success of first five observations', preds[0:5])
print('Actual Success of first five observations', true.head())

Predicted price of first five observations [1. 1. 1. 1. 1.]
Actual Price of first five observations 2     1.0
3     1.0
7     1.0
8     1.0
13    1.0
Name: Success, dtype: float64


In [14]:
# Pair each predictor name with the importance the fitted forest assigned it.
importance_pairs = list(zip(features, clf.feature_importances_))
print("Feature importance:", importance_pairs)


Feature importance: [('EntryFeeAmount', 0.043786151048240726), ('TotalPrizeAmount', 0.043791116446200144), ('MaxNumberPlayers', 0.05000678778168279), ('MaxEntriesPerUser', 0.0165945516311569), ('NumGames', 0.0940672847859099), ('DraftablePlayersInSet', 0.24367273964707875), ('PaidUsersInDraftGroup', 0.3532601792056217), ('TopPrize', 0.046846788453726605), ('MaxPayoutPosition', 0.05529105107598782), ('Threshold', 0.05268334992439466)]


In [15]:
# Share of held-out rows whose predicted label matches the actual label.
# NOTE(review): in the recorded run the test split is ~98.9% successes
# (114817 of 116097), so always predicting 1 would score about the same;
# read this number alongside the confusion matrix.
accuracy_score(y_true=true, y_pred=preds)

0.9887421724936907

In [16]:
# Side-by-side table of predicted vs. actual labels, indexed like the test rows.
confuse = pd.DataFrame({'Predict': preds, 'Real': true}, index=true.index)

In [17]:
# Class counts in the held-out split, keyed on the Success label.
test_labels = test['Success']
successes = int(test_labels.eq(1.0).sum())
fails = int(test_labels.eq(0.0).sum())

In [18]:
# One-line class-balance summary for the test split.
class_balance = ("fails:", fails, "---", "successes:", successes)
print(*class_balance)

fails: 1280 --- successes: 114817


In [19]:
# Confusion matrix for the test split: rows are the actual labels and columns
# the predicted labels, each in sorted label order (0.0 first, then 1.0).
test_confusion = confusion_matrix(y_true=true, y_pred=preds)
print(test_confusion)

[[   513    767]
 [   540 114277]]
