In [1]:
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import MinMaxScaler

import random

random.seed(0)

In [None]:
df = pd.read_pickle("./clean_df.pkl")

## Preparing the dataset for training

### Split the dataset to train/validation/test sets

In [None]:
train_df = df[(df['RaceStartTime'].dt.date<datetime.date(2020,11,1))]  
validation_df = df[(df['RaceStartTime'].dt.date>=datetime.date(2020,11,1)) & (df['RaceStartTime'].dt.date<datetime.date(2021,11,1))]
test_df = df[df['RaceStartTime'].dt.date>=datetime.date(2021,11,1)]

### Creating the target variable

In [None]:
train_df['finish_position_num'] = 20
temp_index = train_df['FinishPosition'].isin(['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18'])
train_df['finish_position_num'][temp_index] = train_df['FinishPosition'][temp_index].astype(int)

validation_df['finish_position_num'] = 20
temp_index = validation_df['FinishPosition'].isin(['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18'])
validation_df['finish_position_num'][temp_index] = validation_df['FinishPosition'][temp_index].astype(int)

test_df['finish_position_num'] = 20
temp_index = test_df['FinishPosition'].isin(['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18'])
test_df['finish_position_num'][temp_index] = test_df['FinishPosition'][temp_index].astype(int)

In [None]:
# scale performance metrics to have the same scale
transform_columns = ['finish_position_num', 'BeatenMargin', 'PriceSP', 'PIRPosition']

target_mmscaler = MinMaxScaler()
target_mmscaler.fit(train_df[transform_columns])
train_df[transform_columns] = target_mmscaler.transform(train_df[transform_columns])
validation_df[transform_columns] = target_mmscaler.transform(validation_df[transform_columns])
test_df[transform_columns] = target_mmscaler.transform(test_df[transform_columns])

In [None]:
# we created two target variable but in the end only used target_1
train_df['target_1'] = 1 / (1 + train_df['BeatenMargin']+train_df['finish_position_num']+train_df['PIRPosition']+train_df['PriceSP'])
train_df['target_2'] = 1 / (train_df['BeatenMargin']+train_df['finish_position_num']+train_df['PIRPosition']+train_df['PriceSP'])

validation_df['target_1'] = 1 / (1 + validation_df['BeatenMargin']+validation_df['finish_position_num']+validation_df['PIRPosition']+validation_df['PriceSP'])
validation_df['target_2'] = 1 / (validation_df['BeatenMargin']+validation_df['finish_position_num']+validation_df['PIRPosition']+validation_df['PriceSP'])

test_df['target_1'] = 1 / (1 + test_df['BeatenMargin']+test_df['finish_position_num']+test_df['PIRPosition']+test_df['PriceSP'])
test_df['target_2'] = 1 / (test_df['BeatenMargin']+test_df['finish_position_num']+test_df['PIRPosition']+test_df['PriceSP'])

### Scaling features using MinMaxScaler

In [15]:
transform_columns = [col for col in train_df.columns if col not in ['HorseID', 'Prizemoney', 'RaceID','target', 'RaceStartTime', "PIRPosition", "PriceSP", "finish_position_num", "BeatenMargin", "target_1", "target_2", "FinishPosition"]]

mmscaler = MinMaxScaler()
mmscaler.fit(train_df[transform_columns])
train_df[transform_columns] = mmscaler.transform(train_df[transform_columns])
validation_df[transform_columns] = mmscaler.transform(validation_df[transform_columns])
test_df[transform_columns] = mmscaler.transform(test_df[transform_columns])

### Creating data instances. each instance is a 2d snapshot of extraced features of all contestants of each race

In [None]:
train = []
validation = []
test = []
train_target = []
validation_target = []
test_target = []
filter_columns = ['HorseID', 'Prizemoney', 'RaceID','target', 'RaceStartTime', "PIRPosition", "PriceSP", "finish_position_num", "BeatenMargin", "target_1", "target_2", "FinishPosition"]
test_dic = {}

counter = 0
for gp in train_df.groupby("RaceID"):
    counter+=1
    if counter % 1000 == 0:
        print(counter)
    if gp[1].shape[0] < 20:
        d = pd.DataFrame(0, index=np.arange(20-gp[1].shape[0]), columns=gp[1].columns)
        t = pd.concat([gp[1],d]).sample(frac=1)
    train.append(t.drop(columns=filter_columns).to_numpy())
    train_target.append(t['target_1'].to_numpy())

counter = 0
for gp in validation_df.groupby("RaceID"):
    counter+=1
    if counter % 1000 == 0:
        print(counter)
    if gp[1].shape[0] < 20:
        d = pd.DataFrame(0, index=np.arange(20-gp[1].shape[0]), columns=gp[1].columns)
        t = pd.concat([gp[1],d]).sample(frac=1)
    validation.append(t.drop(columns=filter_columns).to_numpy())
    validation_target.append(t['target_1'].to_numpy())

counter = 0
for gp in test_df.groupby("RaceID"):
    if counter % 1000 == 0:
        print(counter)
    if gp[1].shape[0] < 20:
        d = pd.DataFrame(0, index=np.arange(20-gp[1].shape[0]), columns=gp[1].columns)
        t = pd.concat([gp[1],d]).sample(frac=1)
    test_dic[counter] = {"RaceID": gp[0], "HorseIDS" : t['HorseID'].values}
    counter+=1
    test.append(t.drop(columns=filter_columns).to_numpy())
    test_target.append(t['target_1'].to_numpy())



train = np.array(train)
validation = np.array(validation)
test = np.array(test)
train_target = np.array(train_target)
validation_target = np.array(validation_target)
test_target = np.array(test_target)

In [None]:
# scale scores to be in [0,1] and add up to 1

train_target = [t/t.max() for t in train_target]
validation_target = [t/t.max() for t in validation_target]
test_target = [t/t.max() for t in test_target]

train_target = [t/t.sum() for t in train_target]
validation_target = [t/t.sum() for t in validation_target]
test_target = [t/t.sum() for t in test_target]

In [21]:
# saving the results
data_dic = {
    "train": train,
    "validation": validation,
    "test": test,
    "train_target": train_target,
    "validation_target": validation_target,
    "test_target": test_target
}

import pickle
with open("data_dic.pkl", "wb") as f:
    pickle.dump(data_dic, f)

## Modeling

In [None]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from datetime import datetime
import random

random.seed(0)

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv2D, AveragePooling2D, GlobalAveragePooling2D
from tensorflow.keras.layers import Reshape, multiply
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping

In [1]:
import pickle
with open("data_dic.pkl", "rb") as f:
    data = pickle.load(f)
    
train = data['train']
train_target = data['train_target']
validation = data['validation']
validation_target = data['validation_target']
test = data['test']
test_target = data['test_target']

In [8]:
def squeeze_excite_block(filters,input):                      
    se = GlobalAveragePooling2D()(input)
    se = Reshape((1, filters))(se) 
    se = Dense(filters//16, activation='relu')(se)
    se = Dense(filters, activation='sigmoid')(se)
    se = multiply([input, se])
    return se

n_filters = 32
input_x = Input(shape=(20, 210,1))

x = Conv2D(filters=n_filters,
           kernel_size=(1,10),
           kernel_initializer = 'lecun_normal',
           padding='same')(input_x)
x = BatchNormalization()(x)
x = squeeze_excite_block(n_filters,x)
x = Activation('relu')(x)
x = Dropout(0.5)(x)

x = Conv2D(filters=n_filters*2,
           kernel_size=(1,10),
           kernel_initializer = 'lecun_normal',
           padding='same')(x)

x = BatchNormalization()(x)
x = squeeze_excite_block(n_filters*2,x)
x = Activation('relu')(x)
x = AveragePooling2D(pool_size=4)(x)
x = Dropout(0.5)(x)


x = Conv2D(filters=n_filters*4,
           kernel_size=(1,10),
           kernel_initializer = 'lecun_normal',
           padding='same')(x)

x = BatchNormalization()(x)
x = squeeze_excite_block(n_filters*4,x)
x = Activation('relu')(x)
x = Dropout(0.5)(x)

x = Conv2D(filters=n_filters*4,
           kernel_size=(20,1),
           kernel_initializer = 'lecun_normal',
           padding='same')(input_x)

x = BatchNormalization()(x)
x = squeeze_excite_block(n_filters*4,x)
x = Activation('relu')(x)
x = AveragePooling2D(pool_size=4)(x)
x = Dropout(0.5)(x)


x = Conv2D(filters=n_filters*4,
           kernel_size=(20,1),
           kernel_initializer = 'lecun_normal',
           padding='same')(input_x)

x = BatchNormalization()(x)
x = squeeze_excite_block(n_filters*4,x)
x = Activation('relu')(x)
x = Dropout(0.5)(x)

x = GlobalAveragePooling2D()(x)
# x = Flatten()(x)
x = Dense(100, kernel_initializer = 'lecun_normal')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dropout(0.5)(x)
x = Dense(20, kernel_initializer = 'lecun_normal')(x)
x = BatchNormalization()(x)
x = Activation('sigmoid')(x)

Y_HAT = Dense(20, activation="softmax")(x)
model = Model(inputs=input_x, outputs=Y_HAT)
model.compile(
    optimizer=Adam(learning_rate=0.001,clipnorm=1.0, clipvalue=5.0),
    loss='categorical_crossentropy',
)
model.summary()


chk = ModelCheckpoint("model_script_4", monitor='val_loss', save_best_only=True, mode='min', verbose=1)
model.fit(train,
    np.array(train_target),
    validation_data=(validation,np.array(validation_target)),
    epochs=500,
    batch_size=64,
    verbose=1,
    callbacks=[EarlyStopping(patience=50), chk])

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 20, 210, 1)  0           []                               
                                ]                                                                 
                                                                                                  
 conv2d_9 (Conv2D)              (None, 20, 210, 512  10752       ['input_2[0][0]']                
                                )                                                                 
                                                                                                  
 batch_normalization_11 (BatchN  (None, 20, 210, 512  2048       ['conv2d_9[0][0]']               
 ormalization)                  )                                                           

In [None]:
chk = ModelCheckpoint("model_jup", monitor='val_loss', save_best_only=True, mode='min', verbose=1)
model.fit(train,
    np.array(train_target),
    validation_data=(validation,np.array(validation_target)),
    epochs=500,
    batch_size=64,
    verbose=1,
    callbacks=[EarlyStopping(patience=50), chk])

In [None]:
from tensorflow.keras.models import load_model
model = load_model("model_script_4")

pred = model.predict(test)

test_set_prediction = {
    "test_target": test_target,
    "test_prediction": pred,
    "test_race_horse_id": test_dic
}

with open("test_set_prediction.pkl", "wb") as f:
    pickle.dump(test_set_prediction, f)