In [1]:
# TODO delete this.
# Ensures that modules are reloaded on re-running
%load_ext autoreload
%autoreload 2

from datetime import datetime
import pandas
from tqdm import tqdm
import matplotlib.pyplot as plt
from src.logger import log
from src.helpers import generate_new_combination_indices
import numpy as np
import tensorflow as tf
import os
import json

if len(tf.config.list_physical_devices('GPU')) == 0:
    raise Exception('No GPU found')

# TODO
# This is a good tutorial: https://www.tensorflow.org/tutorials/structured_data/time_series

In [2]:
def get_training_data(file_path: str) -> pandas.DataFrame:
    df = pandas.read_csv(file_path, dtype={
        '_id': str,
        'refid': str,
        'bet_amount_horse_nb_1': float,
        'bet_amount_horse_nb_2': float,
        'bet_amount_horse_nb_3': float,
        'bet_amount_horse_nb_4': float,
        'bet_amount_horse_nb_5': float,
        'bet_amount_horse_nb_6': float,
        'bet_amount_horse_nb_7': float,
        'bet_amount_horse_nb_8': float,
        'bet_amount_horse_nb_9': float,
        'bet_amount_horse_nb_10': float,
        'bet_amount_horse_nb_11': float,
        'bet_amount_horse_nb_12': float,
        'bet_amount_horse_nb_13': float,
        'bet_amount_horse_nb_14': float,
    }, index_col=0)
    log(f"Read in training data of {df['refid'].nunique()} races, comprising {len(df.index)} rows")
    return df


def get_training_config(file_path: str) -> dict:
    with open(file_path) as f:
        return json.load(f)


path = '../data/2_prepared/2021-07-13T23-25-28'
data = get_training_data(f'{path}/data.csv')
config = get_training_config(f'{path}/config.json')

17:35:15: Read in training data of 3106 races, comprising 310600 rows


In [3]:
def convert_to_training_data(df: pandas.DataFrame) -> np.ndarray:
    race_ids = df['refid'].unique()
    races = []
    for refid in tqdm(race_ids):
        race = df.loc[df['refid'] == refid]
        cleaned = race.drop(columns=['_id', 'refid'])
        races.append(cleaned.to_numpy())

    as_numpy = np.asarray(races)
    log(f"Converted data into numpy array of shape {as_numpy.shape}")
    return as_numpy


training_data = convert_to_training_data(data)

100%|██████████| 3106/3106 [01:14<00:00, 41.42it/s]

17:36:30: Converted data into numpy array of shape (3106, 100, 14)





In [4]:
# Split into training, validation and testing set
n = len(training_data)
training_set = training_data[0:int(n * 0.7)]
validation_set = training_data[int(n * 0.7):int(n * 0.9)]
test_set = training_data[int(n * 0.9):]

In [5]:
train_mean = training_set.mean()
train_std = training_set.std()

# FIXME I don't think normalisation is useful to this dataset.
norm_training = training_set / training_set.max()  # (training_set - train_mean) / train_std
norm_validation = validation_set / training_set.max()  # (validation_set - train_mean) / train_std
norm_test = test_set / training_set.max()  # (test_set - train_mean) / train_std

In [6]:
# Convert data into an input (x) and label (y) set for evaluation.
# Will also omit intermediate steps based on how many it should predict into the future
def split_into_inputs_and_labels(to_split: np.ndarray, steps: int) -> [np.ndarray, np.ndarray]:
    inputs = to_split[:, :-steps, :]
    labels = np.expand_dims(to_split[:, -1, :], axis=1)
    return [inputs, labels]


# Inflate the dataset to a given factor with randomised indices
# As we have up to 14 horses, this means we can scale the dataset by a factor of up to
# 14! = 87178291200
def inflate_data(to_inflate: np.ndarray, indices: np.ndarray) -> np.ndarray:
    new_data = []
    for d in to_inflate:
        for i in indices:
            new_data.append(d.T[i].T)

    return np.asarray(new_data)


config['data_inflation_factor'] = 1
combination_indices = np.asarray(
    generate_new_combination_indices(config['random_seed'], 14, config['data_inflation_factor']))

result = inflate_data(norm_test, combination_indices)

config['cycles_into_the_future'] = 2

train_inputs, train_labels = split_into_inputs_and_labels(inflate_data(norm_training, combination_indices),
                                                          config['cycles_into_the_future'])
validation_inputs, validation_labels = split_into_inputs_and_labels(inflate_data(norm_validation, combination_indices),
                                                                    config['cycles_into_the_future'])
test_inputs, test_labels = split_into_inputs_and_labels(norm_test, config['cycles_into_the_future'])

17:36:30: Generated 1 position combinations for 14 elements in 1 iterations


In [7]:
training_dataset = tf.data.Dataset.from_tensor_slices((train_inputs, train_labels))
validation_dataset = tf.data.Dataset.from_tensor_slices((validation_inputs, validation_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_inputs, test_labels))

In [8]:
# TODO intellij gets memory leaks from long training.
# I should move this to a regular python file
# TODO save model after training
from src.models.baseline import NaiveNoChange
from src.models.stateless import LinearSingleStep
from src.models.multistep_dense import MultistepDense
from src.models.single_layer import SingleLayerDense
from src.models.univariate_linear import UnivariateLinear
from src.models.univariate_single_mlp import UnivariateSingleMLP

all_models = [NaiveNoChange(), LinearSingleStep(), UnivariateLinear(), SingleLayerDense(), MultistepDense(),
              UnivariateSingleMLP()]
models = [UnivariateSingleMLP(), NaiveNoChange(), UnivariateLinear()]

date_string = datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
output_folder = f'../data/3_trained/{date_string}'

if not os.path.exists(output_folder):
    os.mkdir(output_folder)

training_performances = {}
validation_performances = {}
test_performances = {}


def compile_and_fit(model, patience=5, max_epochs=20):
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=patience,
                                                      mode='min')

    model.compile(loss=tf.losses.MeanSquaredError(),
                  optimizer=tf.optimizers.Adam(),
                  metrics=[tf.metrics.MeanAbsoluteError()])

    if train_labels[0].shape != model(train_inputs[0]).shape:
        raise Exception('The training inputs and model outputs have different shapes.'
                        f'Training label:{train_labels[0].shape}'
                        f'Model output: {model(train_inputs[0]).shape}')

    model.summary()

    trainable_params = np.sum([np.prod(v.get_shape()) for v in model.trainable_weights])

    # Allows us to skip models that don't actually have anything to train
    if trainable_params > 0:
        history = model.fit(training_dataset, epochs=max_epochs,
                            validation_data=validation_dataset,
                            callbacks=[early_stopping]
                            )
    else:
        history = None

    return history


for m in models:
    config['training_epochs'] = 100
    config['patience'] = 10
    name = m.__class__.__name__

    training_history = compile_and_fit(m, max_epochs=config['training_epochs'], patience=config['patience'])

    training_performances[name] = m.evaluate(training_dataset)
    validation_performances[name] = m.evaluate(validation_dataset)
    test_performances[name] = m.evaluate(test_dataset)
    log(f'{name}: validation performance:{validation_performances[name]}')
    log(f'{name}: test performance:{test_performances[name]}')

    description_path = f'{output_folder}/{name}_description.txt'

    if training_history is not None:
        history_path = f'{output_folder}/{name}_training.json'
        with open(history_path, 'w') as outfile:
            json.dump(training_history.history, outfile, indent=4)

updated_config_path = f'{output_folder}/config.json'
with open(updated_config_path, 'w') as outfile:
    json.dump(config, outfile, indent=4)

performances = {'training_performances': training_performances, 'validation_performances': validation_performances,
                'testing_performances': test_performances}
performances_path = f'{output_folder}/performances.json'
with open(performances_path, 'w') as outfile:
    json.dump(performances, outfile, indent=4)

Model: "univariate_single_mlp_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              multiple                  99        
Total params: 99
Trainable params: 99
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100

KeyboardInterrupt: 

In [None]:
# Makes plots appear as separate windows for easier zoom
% matplotlib qt

def get_race_id(full_race: np.ndarray) -> str:
    norm_data = np.vstack((norm_training, norm_validation, norm_test))
    index = np.where(np.all(norm_data == full_race, axis=2))[0][0]
    print(index)
    return data.iloc[index * config['target_race_length']]['refid']


markers = ['P', 's', 'p', 'X', '*', 'D']
colours = ['black', 'aqua', 'blue', 'brown', 'coral', 'goldenrod', 'green', 'indigo', 'lime', 'magenta', 'red',
           'purple', 'turquoise', 'yellow', 'sienna', 'khaki']


def plot_predictions(test_series, test_input, test_label):
    [input_cycles, input_pools] = test_input.shape

    # Plot the existing time series and future
    for input_column, series_column, colour in (zip(test_input.T, test_series.T, colours)):
        plt.plot(range(0, input_cycles), input_column, marker='.', color=colour)
        future_values = np.insert(series_column[len(input_column):], 0, input_column[-1])
        plt.plot(range(input_cycles - 1, input_cycles - 1 + len(future_values)), future_values, marker='.',
                 color=colour,
                 alpha=0.3, linewidth=2)

    # Plot model predictions
    for model, marker in zip(models, markers):
        predictions = np.squeeze(model(test_input))
        model_name = model.__class__.__name__
        label_set = False
        for p, colour in zip(predictions, colours):
            label = None if label_set else f'{model_name}'
            plt.plot(len(test_series) - 1, p, marker, linestyle='', color=colour,
                     label=label)
            label_set = True

    plt.ylabel('Bet Pool - normalised')
    plt.xlabel(f'Cycles (30 seconds each)')
    refid = get_race_id(test_series)
    plt.title(f"Race {refid} - Model Predictions {config['cycles_into_the_future']} into the future")
    plt.legend()
    plt.show()


plot_predictions(norm_test[15], test_inputs[15], test_labels[15])