In [1]:
# imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif, chi2

runs = pd.read_pickle("Data/main_1.df")
runs = runs.iloc[int(len(runs)*.1):,:]
num_races = len(np.unique(runs["race_id"]))
FEATURES = ["horse_no", "horse_age", "horse_rating", "declared_weight", "actual_weight", 
            "win_odds", "draw", #"race_size", "distance", "race_class", 
            'last_race_result','win_percent', 
            'avg_distance_time', 'normal_avg_distance_time',
            'going_type_record', 'actual_weight_scaled',
            'declared_weight_scaled', 'horse_race_count', "jockey_record",
            'trainer_record', 'horse_record', 'surface_record', 'place_odds',
            'weight_change', 'weight_change_over_time','weight_change_from_average', 'weight_change_increase',
            'venue_change','venue_record', 'days_since_last_race', 'new_horse',
            'best_odds', 'best_win_percent', 'best_distance_time', 'best_going_record', 
            'best_horse_record', 'best_jockey_record','best_trainer_record', 'highest_actual_weight', 
            'lowest_actual_weight', 'start_speed', 'rode_before']


#TARGET = 'won'
TARGET = 'placed'

X = runs[FEATURES]
y = runs[TARGET]

testPct = 0.2
trainIndex = int(num_races * (1-testPct))
max_race_id = np.unique(runs["race_id"])[trainIndex]
X_train = X.loc[runs["race_id"]<=max_race_id]
y_train = y.loc[runs["race_id"]<=max_race_id]
X_test = X.loc[runs["race_id"]>max_race_id]
y_test = y.loc[runs["race_id"]>max_race_id]
race_sizes_for_eval = [len(runs.loc[runs["race_id"]==race_id]["race_id"]) for race_id in np.unique(runs.loc[runs["race_id"]>max_race_id]["race_id"])]

In [2]:
k = 17
rf_pipe = Pipeline([('skb', SelectKBest(chi2, k = k)),
                    ('model', RandomForestClassifier())])
lr_pipe = Pipeline([('skb', SelectKBest(chi2, k = k)),
                    ('model', LogisticRegression(max_iter=10000))])
xgb_pipe = Pipeline([('skb', SelectKBest(chi2, k = k)),
                    ('model', XGBClassifier())])
gb_pipe = Pipeline([('skb', SelectKBest(chi2, k = k)),
                    ('model', GradientBoostingClassifier())])

estimators = estimators = [('rf', rf_pipe),
                           ('lr', lr_pipe),
                           ('xgb', xgb_pipe),
                           ('gb', gb_pipe)]
lr_stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
rf_stack = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())
gb_stack = StackingClassifier(estimators=estimators, final_estimator=GradientBoostingClassifier())
xgb_stack = StackingClassifier(estimators=estimators, final_estimator=XGBClassifier())

soft_voter = VotingClassifier(estimators=estimators, voting='soft')

In [3]:
# Fit the models
print("Training RF ... ")
rf_pipe.fit(X_train, y_train)
print("Training LR ... ")
lr_pipe.fit(X_train, y_train)
print("Training XGB ... ")
xgb_pipe.fit(X_train, y_train)
print("Training GB ... ")
gb_pipe.fit(X_train, y_train)
print("Training lr_stack ... ")
lr_stack.fit(X_train, y_train)
print("Training rf_stack ... ")
rf_stack.fit(X_train, y_train)
print("Training gb_stack ... ")
gb_stack.fit(X_train, y_train)
print("Training xgb_stack ... ")
xgb_stack.fit(X_train, y_train)
print("Training soft voter ... ")
soft_voter.fit(X_train, y_train)

Training RF ... 
Training LR ... 
Training XGB ... 
Training GB ... 
Training lr_stack ... 
Training rf_stack ... 
Training gb_stack ... 
Training xgb_stack ... 
Training soft voter ... 


VotingClassifier(estimators=[('rf',
                              Pipeline(memory=None,
                                       steps=[('skb',
                                               SelectKBest(k=17,
                                                           score_func=<function chi2 at 0x11a0a66a8>)),
                                              ('model',
                                               RandomForestClassifier(bootstrap=True,
                                                                      ccp_alpha=0.0,
                                                                      class_weight=None,
                                                                      criterion='gini',
                                                                      max_depth=None,
                                                                      max_features='auto',
                                                                      max_leaf_nodes=None,
                 

In [4]:
# Create our evaluate function
def winnerEval(model, x_test, y_test, race_sizes):
    # convert preds into an actual win choice
    winPreds = model.predict_proba(x_test)[:, 1]
    winCount = 0
    temp = 0 
    for i, s in enumerate(race_sizes):
        low_index = temp
        high_index = temp + s
        
        racePreds = winPreds[low_index:high_index]
        raceVals = y_test[low_index:high_index]
        
        if TARGET=='won':
            predWinner = np.argmax(racePreds, axis=0)
            actWinner = np.argmax(raceVals, axis=0)
        
            if predWinner == actWinner:
                winCount += 1
                
        elif TARGET=='placed':
            predPlacers = racePreds.argsort()[-3:]
            actPlacers = raceVals.argsort()[-3:]
            
            for val in actPlacers:
                if val in predPlacers:
                    winCount+=1
            
        temp += s
        

    if TARGET=='won':
        return winCount/float(len(race_sizes))
    else:
        return winCount / float(len(race_sizes)*3)

# DO ANOTHER FUNCTION THAT IS JUST FINDING THE BEST ODDS HORSE
def bestOddsEval(x_test, y_test, race_sizes):
    winCount = 0
    temp = 0 
    for i, s in enumerate(race_sizes):
        low_index = temp
        high_index = temp + s
        
        oddsPreds = x_test[low_index:high_index]
        raceVals = y_test[low_index:high_index]
        
        if TARGET=='won':
            predWinner = np.argmin(oddsPreds["win_odds"], axis=0)
            actWinner = np.argmax(raceVals, axis=0)

            if predWinner == actWinner:
                winCount += 1
            
        elif TARGET=='placed':
            predPlacers = oddsPreds["place_odds"].argsort()[:3].to_list()
            actPlacers = raceVals.argsort()[-3:].to_list()
            
            for val in actPlacers:
                if val in predPlacers:
                    winCount+=1
                    
        temp += s
        
    if TARGET=='won':
        return winCount/float(len(race_sizes))
    else:
        return winCount / float(len(race_sizes)*3)

def randEval(race_sizes):
    randCount = 0 
    for s in race_sizes:
        rand_a = np.random.randint(s)
        rand_b = np.random.randint(s)
        if rand_a==rand_b:
            randCount+=1
    return randCount/float(len(race_sizes))

randAcc = randEval(race_sizes_for_eval)
oddsAcc = bestOddsEval(X_test, y_test, race_sizes_for_eval)
        
print("Random Forest accuracy: {:.3f}".format(winnerEval(rf_pipe, X_test, y_test, race_sizes_for_eval)))
print("Logistic Regression accuracy: {:.3f}".format(winnerEval(lr_pipe, X_test, y_test, race_sizes_for_eval)))
print("Gradient Boosting accuracy: {:.3f}".format(winnerEval(gb_pipe, X_test, y_test, race_sizes_for_eval)))
print("XGBoost accuracy: {:.3f}".format(winnerEval(xgb_pipe, X_test, y_test, race_sizes_for_eval)))

print("LR Stacking accuracy: {:.3f}".format(winnerEval(lr_stack, X_test, y_test, race_sizes_for_eval)))
print("RFStacking accuracy: {:.3f}".format(winnerEval(rf_stack, X_test, y_test, race_sizes_for_eval)))
print("GB Stacking accuracy: {:.3f}".format(winnerEval(gb_stack, X_test, y_test, race_sizes_for_eval)))
print("XGB Stacking accuracy: {:.3f}".format(winnerEval(xgb_stack, X_test, y_test, race_sizes_for_eval)))

print("Soft voter accuracy: {:.3f}".format(winnerEval(soft_voter, X_test, y_test, race_sizes_for_eval)))

print("Random guessing accuracy: {:.3f}".format(randAcc))
print("Betting best odds accuracy: {:.3f}".format(oddsAcc))

Random Forest accuracy: 0.491
Logistic Regression accuracy: 0.490
Gradient Boosting accuracy: 0.500
XGBoost accuracy: 0.488
LR Stacking accuracy: 0.502
RFStacking accuracy: 0.462
GB Stacking accuracy: 0.496
XGB Stacking accuracy: 0.488
Soft voter accuracy: 0.499
Random guessing accuracy: 0.070
Betting best odds accuracy: 0.495
