In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
from ipywidgets import FloatProgress
from IPython.display import display
import time

In [3]:
# Raw contest table, re-indexed by ContestId.
rawContests = pd.read_csv('data/WorkingData.csv')
rawContests = rawContests.set_index('ContestId')

# Index-only frame (zero columns) of the contests whose TotalPrizeAmount is
# above 100000. NOTE(review): not referenced by any later cell visible here.
expensiveContests = rawContests[rawContests['TotalPrizeAmount'].gt(100000)][[]]

In [4]:
# PCA-reduced contest table, re-indexed by ContestId.
contests = pd.read_csv('data/PCA_EVERYTHING.csv').set_index('ContestId')
# Second contest table (one-hot sport flags and *_Scaled columns), same index.
workedData = pd.read_csv('data/WorkedData.csv').set_index('ContestId')

# Show a single row to check the column layout.
workedData.head(1)

Unnamed: 0_level_0,Success,SportName_NFL,SportName_PGA,SportName_SOCC,SportName_MLB,SportName_NAS,SportName_CFB,SportName_MMA,SportName_LOL,SportName_NHL,...,EntryFeeAmount_Scaled,TotalPrizeAmount_Scaled,MaxNumberPlayers_Scaled,MaxEntriesPerUser_Scaled,NumGames_Scaled,DraftablePlayersInSet_Scaled,PaidUsersInDraftGroup_Scaled,TopPrize_Scaled,MaxPayoutPosition_Scaled,SuccessThreshold_Scaled
ContestId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7963004,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5e-06,0.0025,0.14981,9.9e-08,0.222222,0.582524,0.891705,0.000999,0.122835,0.149999


In [5]:
# Peek at the PCA table: the Success label plus three principal components.
contests.head()

Unnamed: 0_level_0,Success,principal component 1,principal component 2,principal component 3
ContestId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24567084,0.0,20718230.0,-1.235748,0.545033
24567089,1.0,20718220.0,-1.280345,0.976134
24567090,0.0,20718220.0,-1.223384,0.403065
24567091,0.0,20718220.0,-1.234668,0.548913
24567093,0.0,20718220.0,-1.248919,0.562719


In [6]:
# Attach the workedData columns to the PCA table by ContestId. workedData's own
# 'Success' column is dropped first so the label is not duplicated, and rows
# left with any missing value after the left join are discarded.
# NOTE: this overwrites `contests`, so the cell is meant to run once per load;
# re-running it would merge the workedData columns in a second time.
contests = (
    contests
    .merge(workedData.drop(columns=['Success']), on="ContestId", how='left')
    .dropna()
)

In [7]:
# contests = workedData

In [8]:
# Number of contests left after the merge and dropna.
len(contests)

58462

In [9]:
# Class balance. Note that `percent` is the fraction of FAILED contests
# (Success == 0); the value displayed by this cell is the success rate.
numFail = int((contests['Success'] == 0).sum())
percent = numFail / len(contests)
1 - percent

0.9467517361705039

In [10]:
# Working frame for the classifier: a fresh frame holding only the complete
# rows of `contests`, so columns added to it later do not touch `contests`.
CLF_contests = contests.dropna()
CLF_contests.shape[0]

58462

In [11]:
# CLF_contests['USE_FOR_BALANCE'] = np.random.uniform(0, 1, len(CLF_contests)) <= percent
# CLF_contests = CLF_contests.loc[(CLF_contests["USE_FOR_BALANCE"] == True) | (CLF_contests["Success"] == 0)]
# CLF_Features = CLF_contests.columns

In [12]:
# Feature columns = every column except the label.
# 'is_train' is removed as well when it is present: the split cell adds that
# column to CLF_contests, so re-running this cell after the split would
# otherwise turn the train/test flag into a model feature.
CLF_Features = CLF_contests.columns.drop('Success').drop('is_train', errors='ignore')
# CLF_Features = CLF_Features.drop('USE_FOR_BALANCE')

In [13]:
# Random ~75/25 train/test split. A dedicated, seeded generator keeps the
# partition identical across kernel restarts instead of depending on whatever
# state the global NumPy RNG happens to be in.
splitRng = np.random.RandomState(42)
CLF_contests['is_train'] = splitRng.uniform(0, 1, len(CLF_contests)) <= .75

# 'is_train' is already boolean, so it is used directly as the row mask.
train = CLF_contests[CLF_contests['is_train']]
test = CLF_contests[~CLF_contests['is_train']]

# Every row must land in exactly one of the two partitions.
assert len(train) + len(test) == len(CLF_contests)

print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 43869
Number of observations in the test data: 14593


In [26]:
def getSample(df, numSamples, random_state=None):
    """Return a class-balanced random subset of ``df``.

    Draws ``numSamples`` rows without replacement from the rows where
    ``Success == 1`` and the same number from the rows where ``Success == 0``,
    stacks them (successes first) and drops every row that contains a missing
    value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Success`` column; its index is carried through to the
        result unchanged.
    numSamples : int
        Number of rows to draw from each class.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for both draws. The default ``None``
        keeps the previous behaviour (pandas' default random source).

    Returns
    -------
    pandas.DataFrame
        Up to ``2 * numSamples`` rows with the same columns as ``df``.

    Raises
    ------
    ValueError
        If either class has fewer than ``numSamples`` rows.
    """
    successes = df.loc[df['Success'] == 1]
    fails = df.loc[df['Success'] == 0]

    # Fail early with a message that names the class that is too small.
    for label, group in (('Success == 1', successes), ('Success == 0', fails)):
        if numSamples > len(group):
            raise ValueError(
                'Cannot draw %d rows from the %d rows with %s'
                % (numSamples, len(group), label)
            )

    # Sample the full rows directly and stack them with pd.concat.
    # DataFrame.append, used previously, no longer exists in pandas 2.x, and
    # sampling whole rows makes the index-only merge round trip unnecessary.
    balanced = pd.concat([
        successes.sample(numSamples, random_state=random_state),
        fails.sample(numSamples, random_state=random_state),
    ])
    return balanced.dropna()

In [27]:
# Ensemble settings: number of forests to train, and the number of rows drawn
# from EACH class for every forest's balanced training sample.
numForests, numTrains = 200, 800

# Accumulators: fitted forests, each forest's test-set predictions, and the
# final majority-vote labels.
clfs, clfPreds, preds = [], [], []

In [28]:
# True labels of the held-out contests, used later for scoring.
actual = test['Success']

# Start from empty lists so that re-running this cell on its own does not stack
# a second batch of models and predictions on top of the previous run (the old
# code indexed clfs[i], which on a re-run pointed at the stale first batch).
clfs = []
clfPreds = []

# getSample draws through pandas' default random source, which is the global
# NumPy RNG; seeding it here makes the balanced sub-samples reproducible.
np.random.seed(0)

f = FloatProgress(min=0, max=numForests)
display(f)
for i in range(numForests):
    # Each forest is trained on its own class-balanced sample of the training set.
    subTrain = getSample(train, numTrains)
    y = subTrain['Success']
    clf = RandomForestClassifier(n_jobs=2, n_estimators=30, random_state=i)
    clf.fit(subTrain[CLF_Features], y)
    clfs.append(clf)
    # Keep this forest's vote for every held-out contest.
    clfPreds.append(clf.predict(test[CLF_Features]))
    f.value += 1


FloatProgress(value=0.0, max=200.0)

In [29]:
# Majority vote across the forests: a contest is predicted successful when at
# least half of the forests voted 1 (ties count as success).
# `preds` is rebuilt from scratch rather than appended to, so re-running this
# cell cannot leave it longer than the test set.
voteTotals = np.asarray(clfPreds[:numForests]).sum(axis=0)
preds = (voteTotals >= (.5*numForests)).astype(int).tolist()

In [30]:
# Single-column frame holding the true Success label of each held-out contest.
trueIds = test.loc[:, ['Success']]


# Majority-vote predictions for the first five held-out contests.
# NOTE(review): the printed labels say "price", but the values are 0/1 Success
# labels; the wording is kept unchanged here.
print('Predicted price of first five observations', preds[:5])
# True Success labels for the same five contests.
print('Actual Price of first five observations', test['Success'].iloc[:5])

Predicted price of first five observations [0, 0, 0, 0, 1]
Actual Price of first five observations ContestId
24567084    0.0
24567093    0.0
24567085    0.0
24567092    1.0
24878883    1.0
Name: Success, dtype: float64


In [31]:
# Fraction of held-out contests whose majority-vote prediction matches the label.
# NOTE(review): the classes are heavily imbalanced, so compare this figure with
# the majority-class rate rather than with 50%; the confusion matrix further
# down gives the per-class picture.
accuracy_score(actual, preds)

0.7326115260741451

In [32]:
# Side-by-side table of predicted and true labels for the held-out contests.
confuse = pd.DataFrame({'Predict':preds, 'Real':actual})

In [33]:
# Class counts in the held-out set, taken by summing the boolean masks.
fails = int((test['Success'] == False).sum())
successes = int((test['Success'] == True).sum())

In [34]:
# Report how many held-out contests fall in each class.
print("fails:", fails, "---", "successes:", successes)

fails: 784 --- successes: 13809


In [35]:
# Confusion matrix for the majority vote: rows are the actual class
# (0 = fail, then 1 = success) and columns are the predicted class, same order.
print(confusion_matrix(actual, preds))

[[  684   100]
 [ 3802 10007]]
