In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier

from feature_functions import *

races = pd.read_csv("Data/races.csv")
runs = pd.read_csv("Data/runs_1.csv")

  from numpy.core.umath_tests import inner1d


In [2]:
FEATURES = ["horse_race_count", "avg_distance_time", "win_percent", "declared_weight_scaled", 
            "actual_scaled_weight", "win_odds", "draw", "horse_no", "going_type_record", "jockey_record"]
#FEATURES = ["finish_time"]
TARGET = 'won'

# Split into train and test data
X = runs[FEATURES]
y = runs[TARGET]

testPct = 0.2
trainIndex = int(len(races) * (1-testPct))
max_race_id = races.iloc[trainIndex, :]["race_id"]
X_train = X.loc[runs["race_id"]<=max_race_id]
y_train = y.loc[runs["race_id"]<=max_race_id]
X_test = X.loc[runs["race_id"]>max_race_id]
y_test = y.loc[runs["race_id"]>max_race_id]
race_sizes_for_eval = [len(runs.loc[runs["race_id"]==race_id]["race_id"]) for race_id in np.unique(runs.loc[runs["race_id"]>max_race_id]["race_id"])]


In [3]:
# Random Forest Model
print("Training Random Forest model ...")
RFmodel = RandomForestClassifier()
RFmodel.fit(X_train, y_train)
# XGBoost model
print("Training XGBoost model ...")
XGBmodel = XGBClassifier()
XGBmodel.fit(X_train, y_train)
# LR model 
print("Training Logistic Regression model ...")
LRmodel = LogisticRegression()
LRmodel.fit(X_train, y_train)
# Gradient boosted tree
print("Training Gradient Boosting model ... ")
GBmodel = GradientBoostingClassifier()
GBmodel.fit(X_train, y_train)
# SVM model
#print("Training SVM model ...")
#SVMmodel = svm.SVC(probability=True, max_iter=300)
#SVMmodel.fit(X_train, y_train)
# SGD model
#print("Training SGD model ...")
#SGDmodel = SGDClassifier(max_iter=100, loss='log')
#SGDmodel.fit(X_train, y_train)

Training Random Forest model ...
Training XGBoost model ...
Training Logistic Regression model ...
Training Gradient Boosting model ... 


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [4]:
# Create our evaluate function
def winnerEval(model, x_test, y_test, race_sizes):
    # convert preds into an actual win choice
    winPreds = model.predict_proba(x_test)[:, 1]
    winCount = 0
    temp = 0 
    for i, s in enumerate(race_sizes):
        low_index = temp
        high_index = temp + s
        
        racePreds = winPreds[low_index:high_index]
        raceVals = y_test[low_index:high_index]
        
        predWinner = np.argmax(racePreds, axis=0)
        actWinner = np.argmax(raceVals, axis=0)
        #print(predWinner, actWinner)
        
        if predWinner == actWinner:
            winCount += 1
            
        temp += s
        
    return winCount/float(len(race_sizes))

# DO ANOTHER FUNCTION THAT IS JUST FINDING THE BEST ODDS HORSE
def bestOddsEval(x_test, y_test, race_sizes):
    winCount = 0
    temp = 0 
    for i, s in enumerate(race_sizes):
        low_index = temp
        high_index = temp + s
        
        oddsPreds = x_test[low_index:high_index]
        raceVals = y_test[low_index:high_index]
        
        predWinner = np.argmin(oddsPreds["win_odds"], axis=0)
        actWinner = np.argmax(raceVals, axis=0)
        
        if predWinner == actWinner:
            winCount += 1
            
        temp += s
        
    return winCount/float(len(race_sizes))

def randEval(race_sizes):
    randCount = 0 
    for s in race_sizes:
        rand_a = np.random.randint(s)
        rand_b = np.random.randint(s)
        if rand_a==rand_b:
            randCount+=1
    return randCount/float(len(race_sizes))

randAcc = randEval(race_sizes_for_eval)
oddsAcc = bestOddsEval(X_test, y_test, race_sizes_for_eval)
        
print("Random Forest accuracy: {:.3f}".format(winnerEval(RFmodel, X_test, y_test, race_sizes_for_eval)))
print("Logistic Regression accuracy: {:.3f}".format(winnerEval(LRmodel, X_test, y_test, race_sizes_for_eval)))
print("Gradient Boosting accuracy: {:.3f}".format(winnerEval(GBmodel, X_test, y_test, race_sizes_for_eval)))
print("XGBoost accuracy: {:.3f}".format(winnerEval(XGBmodel, X_test, y_test, race_sizes_for_eval)))
#print("Support Vector Machine accuracy: {:.3f}".format(winnerEval(SVMmodel, X_test, y_test, race_sizes_for_eval)))
#print("Stochastic Gradient Descent accuracy: {:.3f}".format(winnerEval(SGDmodel, X_test, y_test, race_sizes_for_eval)))
print("Random guessing accuracy: {:.3f}".format(randAcc))
print("Betting best odds accuracy: {:.3f}".format(oddsAcc))

Random Forest accuracy: 0.260
Logistic Regression accuracy: 0.292
Gradient Boosting accuracy: 0.301
XGBoost accuracy: 0.274
Random guessing accuracy: 0.103
Betting best odds accuracy: 0.313


In [7]:
GBmodel.score(X_test, y_test)

0.920254279959718

In [8]:
RFmodel.score(X_test, y_test)

0.9159743202416919