In [1]:
# Imports & Config
from datetime import datetime
import pandas
from tqdm import tqdm
from src.logger import log
import hashlib
import json
import os

# Settings for this preparation run. The dict is written out as config.json next to the
# prepared data at the end of the notebook, so it also records how that data was produced.
config = {
    # seed string for "random" behaviour when preparing training or training itself.
    # NOTE(review): not read by any cell visible in this notebook - confirm where it is consumed.
    'random_seed': 'bananarama',
    # input file name, resolved relative to ../data/1_cleaned/
    'prepared_file_name': 'win_only_data-2021-07-06T21-57-03.csv',
    # placeholder only: overwritten with the MD5 hex digest of the input file after it is loaded
    'prepared_file_hash': 'replace me with real value',
    # how many of the last bet cycles (rows) to keep per race; races with fewer rows are dropped
    'target_race_length': 100,
}

In [2]:
def get_whole_file(file_path: str) -> pandas.DataFrame:
    """Read the CSV at ``file_path`` into a DataFrame using explicit column dtypes.

    After loading, logs the number of distinct ``refid`` values (reported as races)
    and the total number of rows.

    :param file_path: path of the CSV file to read.
    :return: the loaded DataFrame.
    """
    column_types = {
        '_id': str,
        '_seconds': float,
        '_suspicious': str,
        'asattimestr': str,  # bet cycle time
        # These ids probably form a unique composite of some sort
        # Seems to be combo of asatdate+meetid+performanceid+raceid
        'refid': str,
        'type': str,  # FINAL or INTER, but what does it mean?
        # bet type: place, win, quinella and omni.
        # Different types allow different bet combinations and accordingly pool dimensions.
        'ccy': str,  # currency: 83% HKD and 17% GBP
    }
    # horse number and bet amount: one float column per horse number, 1 through 14.
    # is the number of participating horses fixed or variable?
    for horse_nb in range(1, 15):
        column_types[f'bet_amount_horse_nb_{horse_nb}'] = float

    frame = pandas.read_csv(file_path, dtype=column_types)

    race_count = frame['refid'].nunique()
    row_count = len(frame.index)
    log(f"Read in file of {race_count} races, comprising {row_count} rows")
    return frame

def md5(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and fed to the hasher in reads of up to
    4096 bytes, so the whole file is never loaded at once.
    """
    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:  # an empty bytes object means end of file
                break
            digest.update(block)
    return digest.hexdigest()

# Load the input file named in the config from the cleaned-data folder (relative path).
path = f"../data/1_cleaned/{config['prepared_file_name']}"
data = get_whole_file(path)
# Replace the placeholder hash in the config with the MD5 of the file that was just read.
config['prepared_file_hash'] = md5(path)

16:49:55: Read in file of 3106 races, comprising 634679 rows


In [3]:
def pick_relevant_rows(df: pandas.DataFrame) -> pandas.DataFrame:
    """Return ``df`` restricted to the columns kept for training.

    Despite the name, this selects columns, not rows: ``_id``, ``refid`` and the
    fourteen ``bet_amount_horse_nb_<n>`` columns (n = 1..14), in that order.

    :param df: frame containing at least the columns listed above.
    :return: the column subset of ``df``.
    """
    bet_amount_columns = [f'bet_amount_horse_nb_{horse_nb}' for horse_nb in range(1, 15)]
    wanted = ['_id', 'refid'] + bet_amount_columns

    subset = df[wanted]
    log('Reduced columns to training-relevant data')
    return subset

# Rebinds `data`: from here on it holds only the id and bet-amount columns.
data = pick_relevant_rows(data)

16:49:55: Reduced columns to training-relevant data


In [4]:
def trim_races_to_uniform_duration(df: pandas.DataFrame, target_race_length: int = None) -> pandas.DataFrame:
    """Keep only the last ``target_race_length`` rows of every sufficiently long race.

    Rows are grouped by ``refid``. A race with fewer rows than the target is dropped
    entirely; every other race is cut down to its final ``target_race_length`` rows.
    Races appear in the result in order of first appearance in ``df``, and rows
    inside a race keep their original order and index labels.

    :param df: frame with a ``refid`` column identifying the race of each row.
    :param target_race_length: number of trailing rows to keep per race. When omitted,
        ``config['target_race_length']`` is used.
    :return: concatenation of the trimmed races, or an empty frame with the same
        columns as ``df`` when no race is long enough.
    """
    if target_race_length is None:
        target_race_length = config['target_race_length']

    # A single groupby pass replaces building a full-length boolean mask per race,
    # which cost O(races * rows). sort=False keeps races in order of first appearance.
    races = df.groupby('refid', sort=False)

    trimmed_races = []
    for _, race in tqdm(races, total=races.ngroups):
        if len(race.index) >= target_race_length:
            trimmed_races.append(race.tail(target_race_length))

    log(f"Trimmed race cycles to size {target_race_length}, keeping {len(trimmed_races)} races")

    if not trimmed_races:
        # pandas.concat raises ValueError on an empty list; return an empty frame instead.
        return df.iloc[0:0]

    return pandas.concat(trimmed_races)

# Rebinds `data` again: every race that remains has config['target_race_length'] rows.
data = trim_races_to_uniform_duration(data)

100%|██████████| 3106/3106 [05:00<00:00, 10.34it/s]


16:54:56: Trimmed race cycles to size 25, keeping 3106 races


In [5]:
# Output goes into a folder named after the current local time (second resolution),
# holding the prepared data and the config that produced it.
date_string = datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
output_folder = f'../data/2_prepared/{date_string}'
output_path = f'{output_folder}/data.csv'
output_config_path = f'{output_folder}/config.json'

# makedirs also creates a missing ../data/2_prepared parent (os.mkdir would raise),
# and exist_ok=True makes a separate existence check unnecessary.
os.makedirs(output_folder, exist_ok=True)

log(f'Writing training data of {len(data.index)} rows to {output_path}')
# NOTE(review): the DataFrame index is written as the first CSV column - confirm that is intended.
data.to_csv(output_path)

# The config (including the input file name and its MD5) is stored next to the data.
log(f'Writing config of generated data to {output_config_path}')
with open(output_config_path, 'w') as outfile:
    json.dump(config, outfile, indent=4)

log('Finished')

16:54:57: Writing training data of 77650 rows to ../data/2_prepared/2021-08-13T16-54-57/data.csv
16:54:58: Writing config of generated data to ../data/2_prepared/2021-08-13T16-54-57/config.json
16:54:58: Finished
