In [4]:
import json
import os

import keras.callbacks
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential, load_model
from keras_tqdm import TQDMNotebookCallback
from sqlalchemy import create_engine
%matplotlib inline

# Session-wide display/runtime options.
# NumPy: print floats with six decimals and never in scientific notation.
np.set_printoptions(suppress=True, precision=6)
# pandas: turn off the chained-assignment (SettingWithCopy) warning.
pd.set_option('mode.chained_assignment', None)

In [15]:
# Run configuration: these three values select which model the notebook
# trains and where its files are written (they are used to build the model
# tag and the output paths further down).
#
# Supported (race_type, bet_type) combinations:
#   R W | R P | G W | G P | H W | H P
#
# race_type: 'R', 'G' or 'H'
# bet_type:  'W' (label = finished first) or 'P' (label = finished in a place)
# NOTE(review): the meaning of the race-type codes is not shown in this
# notebook -- presumably different racing codes; confirm against the data source.
version = 'v2'
race_type, bet_type = 'R', 'W'

In [6]:
# Load the complete `race` table from the local SQLite database.
engine = create_engine('sqlite:///../../data/race.db')

# Use the connection as a context manager so it is closed as soon as the
# table has been read, instead of staying open for the rest of the session.
with engine.connect() as conn:
    dfoo = pd.read_sql_table('race', conn)

print('{} races loaded!'.format(len(dfoo)))

41674 races loaded!


In [8]:
# Keep only the races of the configured type.
# The explicit .copy() makes `dfo` an independent frame, so the columns that
# are added to it afterwards never write into (or warn about) a selection of
# the full `dfoo` table.
dfo = dfoo.loc[dfoo['race_type'].isin([race_type])].copy()
print('{} {} races!'.format(len(dfo), race_type))

13525 R races!


In [9]:
# Decode the JSON text columns into Python objects:
#   results_data -> results,  runners_data -> runners
for decoded in ('results', 'runners'):
    dfo[decoded] = dfo[decoded + '_data'].map(json.loads)
dfo.head(3)

Unnamed: 0,id,meeting_name,location,venue_mnemonic,race_type,meeting_date,race_number,race_name,race_start_time,race_status,race_distance,results_data,num_runners,runners_data,quinella,exacta,trifecta,first_four,results,runners
0,1,BALLINA,NSW,C,R,2017-10-01,1,COATES HIRE MAIDEN PLATE,2017-10-01 01:40:00,Paying,1600,"[[4], [9], [7], [6]]",11.0,"[{""runnerName"": ""GLOWING FEELING"", ""runnerNumb...",35.2,27.4,170.9,401.1,"[[4], [9], [7], [6]]","[{'runnerName': 'GLOWING FEELING', 'runnerNumb..."
1,3,BALLINA,NSW,C,R,2017-10-01,3,CNW ELECTRICAL WHOLESALE CG&E BM55,2017-10-01 02:55:00,Paying,1000,"[[3], [4], [5], [1]]",7.0,"[{""runnerName"": ""ALL THE WAY"", ""runnerNumber"":...",14.1,23.1,100.3,710.5,"[[3], [4], [5], [1]]","[{'runnerName': 'ALL THE WAY', 'runnerNumber':..."
2,4,BALLINA,NSW,C,R,2017-10-01,4,LEND LEASE MAIDEN HANDICAP,2017-10-01 03:35:00,Paying,1300,"[[14], [5], [1], [13]]",12.0,"[{""runnerName"": ""CASSINI COMET"", ""runnerNumber...",8.5,22.6,49.6,1158.9,"[[14], [5], [1], [13]]","[{'runnerName': 'CASSINI COMET', 'runnerNumber..."


In [10]:
# Flatten the per-race runner lists into a single frame, one row per runner.
# Reading the `runners` column directly avoids building a Series for every
# race (as iterrows() does) and keeps `data_all` bound to a DataFrame only.
runner_records = [runner for runners in dfo['runners'] for runner in runners]
data_all = pd.DataFrame(runner_records)
data_all.tail(3)

Unnamed: 0,P_pred,P_prob,W_pred,W_prob,barrierNumber,claimAmount,cnt,finishingPosition,fixedOdds,fpo,...,sigma_scaled,tpo,tpp,trainerName,two,twp,win_odds,win_perc,win_rank,win_scaled
151624,0.209246,0.166667,0.209246,0.166667,2,-1.0,1,2,"{'returnWin': 3.9, 'returnWinOpen': 5.5, 'retu...",1.95,...,1.0,2.1,0.47619,B ELLISON,3.8,0.263158,3.8,0.263158,0.571429,0.221744
151625,0.209246,0.166667,0.209246,0.166667,3,-1.0,1,0,"{'returnWin': 3.2, 'returnWinOpen': 3.2, 'retu...",1.7,...,1.0,2.7,0.37037,ED WALKER,3.5,0.285714,3.5,0.285714,1.0,0.24075
151626,0.209246,0.166667,0.209246,0.166667,1,-1.0,1,0,"{'returnWin': 31, 'returnWinOpen': 18, 'return...",10.7,...,1.0,4.2,0.238095,M H TOMPKINS,19.5,0.051282,19.5,0.051282,0.428571,0.043212


In [21]:
# Drop scratched runners: keep only the rows that have a value in both
# `has_odds` and `sigma_scaled`.
required_columns = ['has_odds', 'sigma_scaled']
has_required = data_all[required_columns].notna().all(axis=1)
data = data_all[has_required]
data.tail(3)

Unnamed: 0,P_pred,P_prob,W_pred,W_prob,barrierNumber,claimAmount,cnt,finishingPosition,fixedOdds,fpo,...,sigma_scaled,tpo,tpp,trainerName,two,twp,win_odds,win_perc,win_rank,win_scaled
151624,0.209246,0.166667,0.209246,0.166667,2,-1.0,1,2,"{'returnWin': 3.9, 'returnWinOpen': 5.5, 'retu...",1.95,...,1.0,2.1,0.47619,B ELLISON,3.8,0.263158,3.8,0.263158,0.571429,0.221744
151625,0.209246,0.166667,0.209246,0.166667,3,-1.0,1,0,"{'returnWin': 3.2, 'returnWinOpen': 3.2, 'retu...",1.7,...,1.0,2.7,0.37037,ED WALKER,3.5,0.285714,3.5,0.285714,1.0,0.24075
151626,0.209246,0.166667,0.209246,0.166667,1,-1.0,1,0,"{'returnWin': 31, 'returnWinOpen': 18, 'return...",10.7,...,1.0,4.2,0.238095,M H TOMPKINS,19.5,0.051282,19.5,0.051282,0.428571,0.043212


In [22]:
# Build the binary training label from each runner's finishing position.
#   'W': 1 only for first place.
#   'P': 1 for first or second place, and for third place only when the
#        field has eight or more runners.
position = data['finishingPosition']

if bet_type == 'W':
    Y = position == 1
elif bet_type == 'P':
    # The feature summary of this data shows `num_runners` between ~0.04 and
    # ~0.33, i.e. it is stored as 1 / field size, so comparing it with 8
    # directly can never be true. Recover the field size from fractional
    # values; values that are already counts (>= 1) are left unchanged.
    raw_runners = data['num_runners']
    is_fraction = (raw_runners > 0) & (raw_runners < 1)
    field_size = raw_runners.mask(is_fraction, 1.0 / raw_runners).round()
    Y = (position == 1) | (position == 2) | ((position == 3) & (field_size >= 8))
else:
    # Fail loudly instead of training on raw finishing positions.
    raise ValueError("bet_type must be 'W' or 'P', got {!r}".format(bet_type))

Y = Y.astype(int)
Y.describe()

count    133995.000000
mean          0.101011
std           0.301345
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: finishingPosition, dtype: float64

In [23]:
# Assemble the model input: one column per feature, in this fixed order
# (the order defines the layout of the network's input vector).
feature_columns = [
    'num_runners',
    'win_perc', 'win_scaled', 'win_rank',
    'place_perc', 'place_scaled', 'place_rank',
    'mu_scaled', 'sigma_scaled',
]
X = pd.concat([data[name] for name in feature_columns], axis=1)
X.describe()

Unnamed: 0,num_runners,win_perc,win_scaled,win_rank,place_perc,place_scaled,place_rank,mu_scaled,sigma_scaled
count,133995.0,133995.0,133995.0,133995.0,133995.0,133995.0,133995.0,133995.0,133995.0
mean,0.100931,0.119376,0.100932,0.597391,0.331869,0.100932,0.602346,1.005312,0.672123
std,0.031243,0.117119,0.098762,0.265054,0.215763,0.071156,0.265957,0.121019,0.289203
min,0.041667,0.000651,0.000549,0.041667,0.002563,0.00151,0.041667,0.591656,0.154243
25%,0.076923,0.037736,0.031975,0.375,0.16129,0.047165,0.384615,0.935464,0.402703
50%,0.090909,0.081301,0.068588,0.6,0.285714,0.084943,0.611111,1.0,0.558987
75%,0.111111,0.16129,0.136557,0.833333,0.454545,0.138608,0.833333,1.0,1.0
max,0.333333,0.961538,0.894509,1.0,1.0,0.946292,1.0,1.724258,1.0


In [24]:
# For a single-input model with 2 classes (binary classification):
# report the run settings, then either resume from the saved network for
# this tag or build a new one.
print('version {}'.format(version))

# Width of the input layer = number of feature columns.
n = len(X.columns)
print('input dimension = {}'.format(n))

epochs = 500
print('epochs = {}'.format(epochs))

# Sizes of the two hidden layers; they are also encoded in the model tag.
layer_1 = 64
layer_2 = 64

tag = '{}{}x{}{}'.format(race_type, layer_1, layer_2, bet_type)
print('tag = {}'.format(tag))
# NOTE(review): absolute, machine-specific path -- consider deriving it from
# a configurable base directory.
file_name = '/Users/jaco/code/tabby/each_way/{}/models/{}.h5'.format(version, tag)

# Decide by checking for the file rather than by catching OSError: a saved
# model that exists but cannot be read now raises, instead of being silently
# replaced by a fresh network (and overwritten when training saves).
if os.path.isfile(file_name):
    model = load_model(file_name)
    print('model loaded')
else:
    model = Sequential()
    model.add(Dense(layer_1, activation='relu', input_dim=n))
    model.add(Dense(layer_2, activation='relu'))
    # Single sigmoid unit: output is the probability of the positive label.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    print('model created')

version v3
input dimension = 9
epochs = 500
tag = R64x64W
model created


In [26]:
# Train the model, iterating on the data in batches of 64 samples, then
# save it to `file_name`.
tqdm = TQDMNotebookCallback()

tbCallBack = keras.callbacks.TensorBoard(
    log_dir='/Users/jaco/code/tabby/each_way/{}/summary/{}'.format(version, tag), 
    histogram_freq=0,
    write_graph=True,
    write_images=True)

# Create the output directory up front so a long training run cannot be lost
# to a missing folder when the model is saved.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

# `.values` replaces the deprecated `as_matrix()`, which newer pandas
# releases no longer provide; both return the underlying NumPy array.
# NOTE(review): Keras is understood to take the validation split from the
# tail of the arrays before shuffling -- confirm for the installed version.
model.fit(
    X.values,
    Y.values,
    validation_split=0.2,
    shuffle=True,
    epochs=epochs,
    batch_size=64,
    verbose=0,
    callbacks=[tqdm, tbCallBack])

# creates a HDF5 file
model.save(file_name)
print('model saved')




          33856/|/[loss: 0.312, acc: 0.897]  32%|| 33856/107196 [00:20<00:02, 28544.41it/s]

model saved
