In [1]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras_tqdm import TQDMNotebookCallback
import keras.callbacks

from sqlalchemy import create_engine
import json
from operator import itemgetter

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
# Print NumPy floats with 6 decimal places and without scientific notation.
np.set_printoptions(precision=6, suppress=True)
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook. NOTE(review): this also hides genuine chained-assignment mistakes.
pd.options.mode.chained_assignment = None

Using TensorFlow backend.


In [19]:
# Load every row of the `race` table from the local SQLite database.
engine = create_engine('sqlite:///../../data/race.db')

# Use the connection as a context manager so it is closed as soon as the table
# has been read, instead of staying open for the lifetime of the kernel.
with engine.connect() as conn:
    dfoo = pd.read_sql_table('race', conn)
print('{} races loaded!'.format(len(dfoo)))

69652 races loaded!


In [20]:
# Parse the JSON text columns into Python objects; each decoded column is
# stored next to its raw source column.
json_columns = (
    ('results', 'results_data'),
    ('runners', 'runners_data'),
)
for decoded_column, raw_column in json_columns:
    dfoo[decoded_column] = dfoo[raw_column].apply(json.loads)
print('results and runners decoded')

results and runners decoded


In [21]:
# Order the races by start time, then discard the earliest 15% of them.
# Both sizes are printed so the effect of the cut is visible.
races_by_start = dfoo.sort_values('race_start_time')
total_races = len(races_by_start)
print(total_races)

first_kept = int(total_races * 0.15)
dfos = races_by_start.iloc[first_kept:]
print(len(dfos))

69652
59205


In [22]:
# Run configuration. `version`, `race_type` and `bet_type` are all embedded in
# the path of the saved model and of the TensorBoard summary directory.
version = 'v3'

# NOTE(review): the grid below appears to be a hand-kept tally per
# race_type / bet_type combination (R, G, H x W, P); nothing in this notebook
# establishes what the marks count -- confirm or remove.
# R W  111111111 11111111 1111
# R P  111111111 11111111 111
# G W  111111111 11111111 1111
# G P  111111111 11111111 111
# H W  111111111 11111111 1111
# H P  111111111 11111111 111

# Race type to keep; compared against the `race_type` column of the races.
# race_type = 'R'
# race_type = 'G'
race_type = 'H'

# Bet type selecting the label definition: 'W' marks only first place as
# positive, 'P' also accepts the minor placings.
bet_type = 'W'
# bet_type = 'P'

In [23]:
# Keep only the races whose type matches the configured `race_type`.
is_selected_type = dfos['race_type'] == race_type
dfo = dfos[is_selected_type]
print('{} {} races!'.format(len(dfo), race_type))

15467 H races!


In [24]:
# Flatten the per-race runner lists into a single table with one row per
# runner, preserving race order and the order of runners within each race.
runner_records = [
    runner
    for race_runners in dfo['runners']
    for runner in race_runners
]
data_all = pd.DataFrame(runner_records)

In [25]:
# Drop scratched runners: any row missing `has_odds` or `sigma_scaled`.
required_columns = ['has_odds', 'sigma_scaled']
data = data_all.dropna(subset=required_columns)
data.tail(3)

Unnamed: 0,P_pred,P_prob,W_pred,W_prob,barrierNumber,claimAmount,cnt,finishingPosition,fixedOdds,fpo,...,tps,trainerName,two,twp,twr,tws,win_odds,win_perc,win_rank,win_scaled
156840,0.287855,0.157825,0.15397,0.148552,0,0,5.0,0,"{'returnWin': 5.5, 'returnWinOpen': 8, 'return...",1.8,...,0.083019,I P BLANCHON,9.8,0.102041,0.8125,0.08594,9.8,,,
156841,0.201489,0.110472,0.111659,0.10773,0,0,6.0,0,"{'returnWin': 9.5, 'returnWinOpen': 10, 'retur...",2.51,...,0.07428,MLLE V CHATELAIN,10.7,0.093458,0.75,0.078712,10.7,,,
156842,0.572163,0.313705,0.359641,0.346986,0,0,4.0,2,"{'returnWin': 1.9, 'returnWinOpen': 1.7, 'retu...",1.16,...,0.141132,A LAURENT,3.0,0.333333,1.0,0.280739,3.0,,,


In [26]:
# get label data
# Build the binary target from each runner's finishing position:
#   'W' -> 1 when the runner finished first
#   'P' -> 1 when the runner finished first or second, or third in a field of
#          at least eight runners
finishing_position = data['finishingPosition']

if bet_type == 'W':
    Y = (finishing_position == 1)
elif bet_type == 'P':
    # `num_runners` is stored scaled, not as a raw count: the feature preview
    # in this notebook shows it as 0.0625 (= 1/16), i.e. 1 / field size.
    # Comparing that value with `>= 8` can never be true, so third places were
    # never labelled as placed. "At least 8 runners" is `<= 0.125` (1/8) in
    # the scaled units.
    # NOTE(review): confirm the 1/n scaling against the feature pipeline.
    big_field = data['num_runners'] <= 0.125
    Y = (
        (finishing_position == 1)
        | (finishing_position == 2)
        | ((finishing_position == 3) & big_field)
    )
else:
    # Without this guard an unrecognised bet type would silently use the raw
    # finishing positions as training labels.
    raise ValueError('unknown bet_type: {!r}'.format(bet_type))

Y = Y.astype(int)
# Summary of the labels; the mean is the share of positive examples.
Y.describe()

count    149042.000000
mean          0.103904
std           0.305136
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: finishingPosition, dtype: float64

In [27]:
# Model inputs: twelve columns of `data`, in the order they are fed to the
# network.
feature_columns = [
    'num_runners',
    'fws', 'fps', 'tws', 'tps',
    'fwr', 'fpr', 'twr', 'tpr',
    'mu_scaled', 'sigma_scaled', 'rating_prob',
]
feature_series = [data[column] for column in feature_columns]

# Keep the individual per-feature names bound as well.
(xn,
 xfws, xfps, xtws, xtps,
 xfwr, xfpr, xtwr, xtpr,
 xrm, xrs, xrp) = feature_series

X = pd.concat(feature_series, axis=1)
X.tail()

Unnamed: 0,num_runners,fws,fps,tws,tps,fwr,fpr,twr,tpr,mu_scaled,sigma_scaled,rating_prob
156838,0.0625,0.008038,0.014646,0.018112,0.053257,0.25,0.25,0.5,0.5,0.934312,0.239489,0.028494
156839,0.0625,0.026189,0.043428,0.029042,0.094088,0.625,0.625,0.5625,0.875,0.980571,0.217118,0.062679
156840,0.0625,0.147608,0.152723,0.08594,0.083019,0.9375,0.9375,0.8125,0.8125,1.16671,0.288271,0.134715
156841,0.0625,0.085457,0.109523,0.078712,0.07428,0.8125,0.8125,0.75,0.6875,1.02473,0.242466,0.097634
156842,0.0625,0.427287,0.236984,0.280739,0.141132,1.0,1.0,1.0,1.0,1.095073,0.339114,0.127882


In [28]:
def _build_classifier(input_dim, hidden_1, hidden_2, dropout_rate):
    """Return a compiled binary classifier with two hidden layers.

    Architecture: Dense(relu) -> Dropout -> Dense(relu) -> Dropout ->
    Dense(1, sigmoid), compiled with the Adam optimizer, binary cross-entropy
    loss and the accuracy metric.
    """
    net = Sequential([
        Dense(hidden_1, activation='relu', input_dim=input_dim),
        Dropout(dropout_rate),
        Dense(hidden_2, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid'),
    ])
    net.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
    return net


# Single-input binary classifier: hyper-parameters and model file location.
n = len(X.columns)
epochs = 500
layer_1 = 64
layer_2 = 64
dropout = 0.2

# The tag identifies the race type, hidden layer sizes and bet type, and names
# the model file.
tag = '{}{}x{}{}'.format(race_type, layer_1, layer_2, bet_type)
file_name = '/Users/jaco/code/tabby/each_way/{}/models/{}.h5'.format(version, tag)

print('version {}'.format(version))
print('input dimension = {}'.format(n))
print('epochs = {}'.format(epochs))
print('tag = {}'.format(tag))

# Reuse a previously saved model when one exists; otherwise start from a
# freshly initialised network.
try:
    model = load_model(file_name)
except OSError:
    model = _build_classifier(n, layer_1, layer_2, dropout)
    print('model created')
else:
    print('model loaded')

version v3
input dimension = 12
epochs = 500
tag = H64x64W
model loaded


In [29]:
# Train the model in mini-batches of 64 samples, then save it.
tqdm = TQDMNotebookCallback()

tbCallBack = keras.callbacks.TensorBoard(
    log_dir='/Users/jaco/code/tabby/each_way/{}/summary/{}'.format(version, tag),
    histogram_freq=0,
    write_graph=True,
    write_images=True)

# `.values` replaces `DataFrame.as_matrix()` / `Series.as_matrix()`, which were
# deprecated in pandas 0.23 and removed in pandas 1.0; it returns the same
# NumPy array.
model.fit(
    X.values,
    Y.values,
    # Keras takes the validation rows from the end of the arrays, before
    # shuffling; `shuffle` only reorders the remaining training rows per epoch.
    validation_split=0.2,
    shuffle=True,
    epochs=epochs,
    batch_size=64,
    # Progress is reported through the tqdm callback instead of Keras' own log.
    verbose=0,
    callbacks=[tqdm, tbCallBack])

# creates a HDF5 file
model.save(file_name)
print('model saved')


model saved
