In [2]:
# FOR TUNING PARAMS
# https://www.youtube.com/watch?v=dMulLZKm_pg

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import xgboost as xgb
from xgboost import DMatrix

import itertools

runs = pd.read_pickle("Data/main_1.df")
runs = runs.iloc[int(len(runs)*.1):,:]
num_races = len(np.unique(runs["race_id"]))
TARGET = "result"
FEATURES = ["horse_no", "horse_age", "horse_rating", "declared_weight", "actual_weight", 
            "win_odds", "draw", #"race_size", "distance", "race_class", 
            'last_race_result','win_percent', 
            'avg_distance_time', 'normal_avg_distance_time',
            'going_type_record', 'actual_weight_scaled',
            'declared_weight_scaled', 'horse_race_count', "jockey_record",
            'trainer_record', 'horse_record', 'surface_record', 'place_odds',
            'weight_change', 'weight_change_over_time','weight_change_from_average', 'weight_change_increase',
            'venue_change','venue_record', 'days_since_last_race', 'new_horse',
            'best_odds', 'best_win_percent', 'best_distance_time', 'best_going_record', 
            'best_horse_record', 'best_jockey_record','best_trainer_record', 'highest_actual_weight', 
            'lowest_actual_weight', 'start_speed', 'rode_before']

X = runs[FEATURES]
y = runs[TARGET]

testPct = 0.2
trainIndex = int(num_races * (1-testPct))
max_race_id = np.unique(runs["race_id"])[trainIndex]
X_train = X.loc[runs["race_id"]<=max_race_id]
y_train = y.loc[runs["race_id"]<=max_race_id]
X_test = X.loc[runs["race_id"]>max_race_id]
y_test = y.loc[runs["race_id"]>max_race_id]

groups_train = [len(runs.loc[runs["race_id"]==race_id]["race_id"]) for race_id in np.unique(runs.loc[runs["race_id"]<=max_race_id]["race_id"])]
groups_test = [len(runs.loc[runs["race_id"]==race_id]["race_id"]) for race_id in np.unique(runs.loc[runs["race_id"]>max_race_id]["race_id"])]

#n_samples = len(FEATURES) 
#n_groups = len(groups)

In [33]:
train_dmatrix = DMatrix(X_train, label=y_train)

valid_dmatrix = DMatrix(X_test, label=y_test)

train_dmatrix.set_group(groups_train)
valid_dmatrix.set_group(groups_test)

params = {'objective': 'rank:pairwise', 'eta': .2, 'gamma': 1,
          'min_child_weight': 0.1, 'eval_metric':'ndcg',
          'seed':1212}
xgb_model = xgb.train(params, train_dmatrix, num_boost_round=500,
                      evals=[(valid_dmatrix, 'validation')]
                     )

[0]	validation-ndcg:0.71390
[1]	validation-ndcg:0.71210
[2]	validation-ndcg:0.71664
[3]	validation-ndcg:0.71725
[4]	validation-ndcg:0.71951
[5]	validation-ndcg:0.71987
[6]	validation-ndcg:0.71986
[7]	validation-ndcg:0.71965
[8]	validation-ndcg:0.72084
[9]	validation-ndcg:0.72116
[10]	validation-ndcg:0.72227
[11]	validation-ndcg:0.72016
[12]	validation-ndcg:0.72174
[13]	validation-ndcg:0.72109
[14]	validation-ndcg:0.72093
[15]	validation-ndcg:0.72195
[16]	validation-ndcg:0.71875
[17]	validation-ndcg:0.71868
[18]	validation-ndcg:0.71798
[19]	validation-ndcg:0.71892
[20]	validation-ndcg:0.71919
[21]	validation-ndcg:0.71923
[22]	validation-ndcg:0.71963
[23]	validation-ndcg:0.71927
[24]	validation-ndcg:0.71935
[25]	validation-ndcg:0.71925
[26]	validation-ndcg:0.71830
[27]	validation-ndcg:0.71915
[28]	validation-ndcg:0.71843
[29]	validation-ndcg:0.71883
[30]	validation-ndcg:0.71806
[31]	validation-ndcg:0.71657
[32]	validation-ndcg:0.71708
[33]	validation-ndcg:0.71755
[34]	validation-ndcg:0.7

[277]	validation-ndcg:0.70793
[278]	validation-ndcg:0.70827
[279]	validation-ndcg:0.70819
[280]	validation-ndcg:0.70885
[281]	validation-ndcg:0.70795
[282]	validation-ndcg:0.70833
[283]	validation-ndcg:0.70783
[284]	validation-ndcg:0.70764
[285]	validation-ndcg:0.70900
[286]	validation-ndcg:0.70906
[287]	validation-ndcg:0.70913
[288]	validation-ndcg:0.70897
[289]	validation-ndcg:0.70889
[290]	validation-ndcg:0.70847
[291]	validation-ndcg:0.70849
[292]	validation-ndcg:0.70836
[293]	validation-ndcg:0.70879
[294]	validation-ndcg:0.70890
[295]	validation-ndcg:0.70845
[296]	validation-ndcg:0.70871
[297]	validation-ndcg:0.70804
[298]	validation-ndcg:0.70773
[299]	validation-ndcg:0.70796
[300]	validation-ndcg:0.70787
[301]	validation-ndcg:0.70789
[302]	validation-ndcg:0.70786
[303]	validation-ndcg:0.70803
[304]	validation-ndcg:0.70753
[305]	validation-ndcg:0.70753
[306]	validation-ndcg:0.70769
[307]	validation-ndcg:0.70742
[308]	validation-ndcg:0.70808
[309]	validation-ndcg:0.70799
[310]	vali

In [36]:
def RankEval(xgb_model, x_test, y_test, groups_test):
    pred = xgb_model.predict(DMatrix(x_test))
    s = 0
    wCount = 0
    for i, groupNum in enumerate(groups_test):
        realVals = y_train[s:s+groupNum].to_list()
        predVals = pred[s:s+groupNum]
        
        output = [0] * groupNum
        for j, x in enumerate(sorted(range(len(predVals)), key=lambda y: predVals[y])):
            output[x] = j+1
        s += groupNum

        if realVals[0]==output[0]:
            wCount += 1
            
    return wCount/float(len(groups_test))

In [37]:
print(RankEval(xgb_model, X_test, y_test, groups_test))

0.08238387379491674
