In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
import matplotlib.pyplot as plt

import sys
sys.path.append('../')
from helper import *

# --- Load the runs table and build a race-level train / evaluation split -----
# NOTE(review): unpickling can execute arbitrary code; only load pickles that
# were produced by this project.
runs = pd.read_pickle("../Data/main_1.df")
FEATURES = ['actual_weight','declared_weight','draw','win_odds',"jockey_record",
'horse_record','distance', 'trainer_record']
TARGET = "won"

# Sorted unique race ids, computed once and reused for every cutoff below.
race_ids = np.unique(runs["race_id"])
num_races = len(race_ids)
X = runs[FEATURES]
y = runs[TARGET]

# Fractions of races reserved for the test and validation slices.
testPct = 0.2
valPct = 0.2
# Positions (into the sorted race ids) where the train and validation slices end.
trainIndex = int(num_races * (1-(testPct+valPct)))
# The validation slice ends where the test slice begins, i.e. at 1 - testPct.
valIndex =  int(num_races * (1-testPct))
train_race_id = race_ids[trainIndex]
# Previously indexed with trainIndex, which made this identical to train_race_id.
val_race_id = race_ids[valIndex]

# Split by race id so that all runners of one race land on the same side.
# presumably race_id increases chronologically — TODO confirm.
in_train = runs["race_id"] <= train_race_id
after_train = runs["race_id"] > train_race_id
X_train = X.loc[in_train]
y_train = y.loc[in_train]
# NOTE(review): X_test / y_test cover every race after the training cutoff, so
# they still include the validation races; val_race_id is not used for slicing
# here. Confirm whether a separate validation slice is wanted.
X_test = X.loc[after_train]
y_test = y.loc[after_train]
# Number of runners per evaluation race, ordered by race id (one pass over the
# frame instead of one full scan per race).
race_sizes_for_eval = runs.loc[after_train].groupby("race_id", sort=True).size().tolist()

In [2]:
# Column names used as model inputs.
#   feat_1: every candidate column, including the three odds columns.
#   feat_2: the same columns in the same order, without the odds columns.

# Suffixes of the one-hot indicator columns, in the order they appear in feat_1.
_horse_type_levels = ["Brown", "Colt", "Filly", "Gelding", "Grey", "Horse",
                      "Mare", "Rig", "Roan"]
_horse_country_levels = ["ARG", "AUS", "BRZ", "CAN", "FR", "GB", "GER", "GR",
                         "IRE", "ITY", "JPN", "NZ", "SAF", "SPA", "USA", "ZIM"]
_going_type_levels = ["fast", "slow", "wet"]

# Columns present in feat_1 but excluded from feat_2.
_odds_columns = ("win_odds", "place_odds", "best_odds")

_plain_columns = [
    "horse_no", "horse_age", "horse_rating", "declared_weight", "actual_weight",
    "win_odds", "draw", "race_size", "distance", "race_class",
    "last_race_result", "win_percent",
    "avg_distance_time", "normal_avg_distance_time",
    "going_type_record", "actual_weight_scaled",
    "declared_weight_scaled", "horse_race_count", "jockey_record",
    "trainer_record", "horse_record", "surface_record", "place_odds",
    "weight_change", "weight_change_over_time", "weight_change_from_average",
    "weight_change_increase",
    "venue_change", "venue_record", "days_since_last_race", "new_horse",
    "best_odds", "best_win_percent", "best_distance_time", "best_going_record",
    "best_horse_record", "best_jockey_record", "best_trainer_record",
    "highest_actual_weight",
    "lowest_actual_weight", "start_speed", "rode_before",
]

feat_1 = (
    _plain_columns
    + ["horse_type_" + level for level in _horse_type_levels]
    + ["horse_country_" + level for level in _horse_country_levels]
    + ["going_type_" + level for level in _going_type_levels]
)

feat_2 = [column for column in feat_1 if column not in _odds_columns]

# Define the base models
# One shared seed so that repeated runs of the notebook give the same fits.
RANDOM_STATE = 42

# Linear-activation MLP with a single hidden layer half as wide as feat_1.
nn_1 = MLPClassifier(
        solver="adam",
        activation="identity",
        # Explicit 1-tuple: the previous "(int(...))" was just a parenthesised int.
        hidden_layer_sizes=(len(feat_1) // 2,),
        # Per the scikit-learn docs this option only applies to solver="sgd";
        # kept so the configuration is unchanged.
        learning_rate="constant",
        random_state=RANDOM_STATE,
    )
# The loss is left at its default: the old name "deviance" is rejected by newer
# scikit-learn releases, and the default is the same loss under either name.
gb_1 = GradientBoostingClassifier(
        min_samples_split=5,
        min_samples_leaf=3,
        max_depth=7,
        random_state=RANDOM_STATE,
    )
rf_1 = RandomForestClassifier(
        min_samples_split=4,
        min_samples_leaf=3,
        max_depth=5,
        random_state=RANDOM_STATE,
    )
lr_1 = LogisticRegression(
        solver="liblinear",
        random_state=RANDOM_STATE,
    )
xgb_1 = XGBClassifier(
    booster='gblinear',
    random_state=RANDOM_STATE,
)

In [3]:
# Soft-voting ensemble over the five base models.
soft_1 = VotingClassifier(
    estimators=[
        ("lr", lr_1), ("rf", rf_1), ("gb", gb_1), ("xgb", xgb_1), ("nn", nn_1),
    ],
    voting="soft",
)

# Cross-validate on both feature sets and keep each (mean, std) pair; the
# second call used to overwrite the first result. meanScore / stdScore still
# end up holding the feat_2 values, as before.
# crossVal is not defined in this notebook — presumably it comes from
# `from helper import *`; verify its return value is (mean, std).
soft_1_cv_scores = {}
for feature_set_name, feature_set in (("feat_1", feat_1), ("feat_2", feat_2)):
    meanScore, stdScore = crossVal(runs, feature_set, TARGET, soft_1)
    soft_1_cv_scores[feature_set_name] = (meanScore, stdScore)

CV 1/4
CV 2/4
CV 3/4
CV 4/4
Mean score: 0.297 +/- 0.006
CV 1/4
CV 2/4


KeyboardInterrupt: 

In [None]:
# Pipelines that select features for you
# Each *_2 estimator is the matching *_1 base model preceded by a SelectKBest
# step that keeps the k highest chi2-scored features.
# NOTE(review): scikit-learn's chi2 expects non-negative feature values —
# confirm that columns such as weight_change satisfy this before fitting.
k = 15


def _select_k_then(model):
    """Return a Pipeline: SelectKBest(chi2, k=k) followed by `model`."""
    selector = SelectKBest(chi2, k=k)
    return Pipeline(steps=[('feat', selector), ('model', model)])


nn_2 = _select_k_then(nn_1)
rf_2 = _select_k_then(rf_1)
gb_2 = _select_k_then(gb_1)
xgb_2 = _select_k_then(xgb_1)
lr_2 = _select_k_then(lr_1)

In [None]:
# Soft-voting ensemble over the five feature-selecting pipelines.
soft_2 = VotingClassifier(
    estimators=[
        ("lr", lr_2), ("rf", rf_2), ("gb", gb_2), ("xgb", xgb_2), ("nn", nn_2),
    ],
    voting="soft",
)

# Cross-validate on both feature sets and keep each (mean, std) pair; the
# second call used to overwrite the first result. meanScore / stdScore still
# end up holding the feat_2 values, as before.
# crossVal is not defined in this notebook — presumably it comes from
# `from helper import *`; verify its return value is (mean, std).
soft_2_cv_scores = {}
for feature_set_name, feature_set in (("feat_1", feat_1), ("feat_2", feat_2)):
    meanScore, stdScore = crossVal(runs, feature_set, TARGET, soft_2)
    soft_2_cv_scores[feature_set_name] = (meanScore, stdScore)

In [10]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [11]:
# 5-fold cross-validation of the soft-voting ensemble on the training rows.
# NOTE(review): KFold splits individual rows, so runners from one race can end
# up in different folds — confirm that is intended.
kfold = KFold(n_splits=5)
score = cross_val_score(
    estimator=soft_1,
    X=X_train,
    y=y_train,
    cv=kfold,
)