In [6]:
import pandas as pd
import numpy as np
import requests
import os
import re
import tarfile
import zipfile
import bz2
import glob
import logging
import yaml

from datetime import date, timedelta
from unittest.mock import patch
from typing import List, Set, Dict, Tuple, Optional
from itertools import zip_longest
import betfairlightweight
from betfairlightweight import StreamListener
from betfairlightweight.resources.bettingresources import (
    PriceSize,
    MarketBook
)

# Utility Functions
# _________________________________

def as_str(v) -> str:
    """Convert a value to its string form.

    Floats are formatted with two decimal places, strings are returned
    unchanged, and any other type (``None``, ``int``, ``bool``, ...) yields
    an empty string.
    """
    # Exact type identity on purpose: subclasses of float/str count as "other".
    kind = type(v)
    if kind is float:
        return '%.2f' % v
    if kind is str:
        return v
    return ''

def split_anz_horse_market_name(market_name: str) -> Tuple[str, str, str]:
    """Split an ANZ horse-racing market name into its first three tokens.

    Example: ``'R6 1400m Grp1'`` -> ``('R6', '1400m', 'grp1')``.

    Args:
        market_name: Market name whose first three whitespace-separated
            tokens are the race number, the race length and the race type.

    Returns:
        ``(race_no, race_len, race_type)`` with ``race_type`` lower-cased
        (e.g. ``grp1``, ``trot``, ``pace``).

    Raises:
        IndexError: If the name contains fewer than three tokens.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so a stray double space cannot shift the
    # tokens one position to the right (split(' ') would yield '' entries).
    parts = market_name.split()
    race_no = parts[0]            # e.g. R6
    race_len = parts[1]           # e.g. 1400m
    race_type = parts[2].lower()  # e.g. grp1, trot, pace
    return (race_no, race_len, race_type)


def load_markets(file_paths):
    """Yield bz2-decompressing binary file objects for every market file found.

    Each entry of ``file_paths`` is printed and then handled by kind:

    * a directory  - searched recursively for ``*.bz2`` files;
    * a ``.tar`` archive - every member that can be extracted as a file;
    * a ``.zip`` archive - every non-directory member.

    Paths that are neither a directory nor a ``.tar``/``.zip`` file yield
    nothing.

    Each yielded object is closed as soon as the generator is advanced (or
    closed), so the caller must finish reading it before asking for the next.

    Args:
        file_paths: Iterable of directory or archive paths.

    Yields:
        File objects that produce the decompressed bytes of one market file.
    """
    for file_path in file_paths:
        print(file_path)
        if os.path.isdir(file_path):
            # Join with a real path separator: plain string concatenation
            # turned '/data/x' into '/data/x**/...', which also matched
            # sibling directories such as '/data/x_old'. A single recursive
            # '**' component is enough to reach files at any depth.
            pattern = os.path.join(file_path, '**', '*.bz2')
            for path in glob.iglob(pattern, recursive=True):
                with bz2.BZ2File(path, 'rb') as f:
                    yield f
        elif os.path.isfile(file_path):
            ext = os.path.splitext(file_path)[1]
            # iterate through a tar archive
            if ext == '.tar':
                with tarfile.TarFile(file_path) as archive:
                    for member in archive:
                        raw = archive.extractfile(member)
                        # extractfile() returns None for directories and
                        # other non-file members; bz2.open(None) would raise.
                        if raw is None:
                            continue
                        with raw, bz2.open(raw) as f:
                            yield f
            # or a zip archive
            elif ext == '.zip':
                with zipfile.ZipFile(file_path) as archive:
                    for info in archive.infolist():
                        # Directory entries carry no market data.
                        if info.is_dir():
                            continue
                        with archive.open(info) as raw, bz2.open(raw) as f:
                            yield f

def slicePrice(l, n):
    """Return the ``price`` of entry ``n`` of ladder ``l``, or ``""`` if unavailable.

    Args:
        l: Indexable collection of objects exposing a ``price`` attribute.
        n: Index of the wanted entry.

    Returns:
        ``l[n].price``, or an empty string when the entry cannot be read
        (index out of range, ``l`` not indexable, entry without ``price``).
    """
    try:
        return l[n].price
    except (LookupError, TypeError, AttributeError):
        # Best-effort lookup: a missing rung simply becomes a blank field.
        # Only lookup-style failures are caught so that KeyboardInterrupt /
        # SystemExit still propagate when a long run is interrupted.
        return ""

def sliceSize(l, n):
    """Return the ``size`` of entry ``n`` of ladder ``l``, or ``""`` if unavailable.

    Args:
        l: Indexable collection of objects exposing a ``size`` attribute.
        n: Index of the wanted entry.

    Returns:
        ``l[n].size``, or an empty string when the entry cannot be read
        (index out of range, ``l`` not indexable, entry without ``size``).
    """
    try:
        return l[n].size
    except (LookupError, TypeError, AttributeError):
        # Best-effort lookup: a missing rung simply becomes a blank field.
        # Only lookup-style failures are caught so that KeyboardInterrupt /
        # SystemExit still propagate when a long run is interrupted.
        return ""

In [7]:
# Credentials are read from a YAML file outside the notebook directory so
# that no secret is stored in the notebook itself.
with open("../../secrets.yaml", 'r') as stream:
    creds = yaml.safe_load(stream)

# API client; it is only used further down to build historical stream generators.
trading = betfairlightweight.APIClient(
    username=creds['uid'],
    password=creds['pwd'],
    app_key=creds["api_key"],
)

# Listener shared by every historical stream; max_latency=None is passed
# through unchanged.
listener = StreamListener(max_latency=None)

# Extraction Notes


## Scope

- We want to extract predominantly preplay price data
- We also want to extract multiple price points per runner
- We also want final results and BSPs
- We probably want the time granularity to vary with time to the off: start 30 minutes before the off, sample every minute until 10 minutes before the off, then every second until the off
- The market components I can think of extracting right now are:
    + Probably take the whole available to back ladder, available to lay ladder, and traded volume ladder up to a certain amount

## Conclusion
So we might want to filter for a few tracks because this is going to be a lot of data. Maybe filter on big Victorian tracks or something.
We might also want to split the extraction into two components (preplay prices and a runner summary) so I can keep the code pattern below, which I originally wrote for the angles piece.

In a separate script I figured out that the top 5 tracks by volume per market are: Flemington, Caulfield, Moonee Valley, Bendigo, Sandown. We need to be careful about the venue names in the stream files, which will inevitably differ from these.

In [8]:
def filter_market(market: MarketBook) -> bool:
    """Decide whether a market book belongs to a race we want to extract.

    A market passes only when all of the following hold for its market
    definition: the country code is ``'AU'``, the venue is one of the
    selected tracks, the market type is ``'WIN'`` and the race-type token of
    the market name is neither ``'trot'`` nor ``'pace'``.

    Args:
        market: Market book whose ``market_definition`` is inspected.

    Returns:
        ``True`` if the market should be processed, ``False`` otherwise.
    """
    definition = market.market_definition
    selected_tracks = ['Bendigo', 'Sandown', 'Flemington', 'Caulfield', 'Moonee Valley']

    # Checks run in the same order as a short-circuiting `and` chain, so the
    # market name is only parsed once every cheaper test has passed.
    if definition.country_code != 'AU':
        return False
    if definition.venue not in selected_tracks:
        return False
    if definition.market_type != 'WIN':
        return False

    race_type = split_anz_horse_market_name(definition.name)[2]
    return race_type != 'trot' and race_type != 'pace'




In [10]:
# Logging schedule, expressed in seconds relative to the scheduled off.
# Segment one: coarse sampling, far from the off.
log1_Start = 30 * 60  # start recording this many seconds before the scheduled off
log1_Step = 60        # seconds between log steps in segment one
# Segment two: fine sampling, close to the off.
log2_Start = 10 * 60  # switch to segment two this many seconds before the scheduled off
log2_Step = 1         # seconds between log steps in segment two

def loop_stream_markets(s):
    """Walk a historical stream and pick out the market books due for logging.

    For every market book produced by the stream's generator:

    * a batch is abandoned at the first book rejected by ``filter_market``;
    * books published more than ``log1_Start`` seconds before the scheduled
      off are skipped;
    * otherwise a book is selected when it belongs to a new market or when
      more than the current step (``log2_Step`` inside the last
      ``log2_Start`` seconds, else ``log1_Step``) has elapsed since the
      previously selected book.

    The step that would write the selected book out is currently commented
    out, so the function only iterates and returns ``None``.

    Args:
        s: Stream object exposing ``get_generator()``.
    """
    # While iterating, builtins.open is replaced by a function that returns
    # its first argument unchanged - presumably so that the stream generator,
    # which is created with an already-open file object as file_path, reads
    # from that object instead of trying to open a path (confirm against
    # the stream implementation).
    with patch("builtins.open", lambda f, _: f):

        gen = s.get_generator()

        current_market_id = None
        last_logged = None

        for market_books in gen():

            for market_book in market_books:

                # Only Evaluate Thoroughbred Races
                # ________________________________

                if not filter_market(market_book):
                    break

                # Time Step Management
                # _____________________

                if current_market_id is None:
                    # NOTE(review): the very first accepted book falls through
                    # without the pre-off window check applied below.
                    current_market_id = market_book.market_id
                    last_logged = market_book.publish_time
                else:
                    seconds_to_start = (
                        market_book.market_definition.market_time - market_book.publish_time
                    ).total_seconds()

                    # Too far from the off: nothing is recorded yet.
                    if seconds_to_start > log1_Start:
                        continue

                    # Step size depends on how close the off is. A plain
                    # conditional keeps this a Python scalar (np.where would
                    # return a 0-d array here).
                    wait = log2_Step if seconds_to_start < log2_Start else log1_Step

                    if market_book.market_id != current_market_id:
                        # New market
                        current_market_id = market_book.market_id
                        last_logged = market_book.publish_time
                    elif (market_book.publish_time - last_logged).total_seconds() > wait:
                        # More than (wait) seconds elapsed since the last selected book
                        last_logged = market_book.publish_time
                    else:
                        # Not enough time elapsed: move on to the next book
                        continue

                # Reaching this point means the book is due for logging.

                                
                # for runner in market_book.runners:

                #     o.write(
                #         "{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}\n".format(
                #             market_book.market_id,
                #             runner.selection_id,
                #             market_book.publish_time,
                #             market_book.status,
                #             market_book.inplay,
                #             sum([rung.size for rung in runner.ex.traded_volume]),
                #             runner.last_price_traded or "",
                #             slicePrice(runner.ex.available_to_back, 0),
                #             slicePrice(runner.ex.available_to_lay, 0),
                #             sliceSize(runner.ex.available_to_back, 0),
                #             sliceSize(runner.ex.available_to_lay, 0)
                #         )
                #     )

                    # print(
                    #     "{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}\n".format(
                    #         market_book.market_id,
                    #         runner.selection_id,
                    #         market_book.publish_time,
                    #         market_book.status,
                    #         market_book.inplay,
                    #         sum([rung.size for rung in runner.ex.traded_volume]),
                    #         runner.last_price_traded or "",
                    #         slicePrice(runner.ex.available_to_back, 0),
                    #         slicePrice(runner.ex.available_to_lay, 0),
                    #         sliceSize(runner.ex.available_to_back, 0),
                    #         sliceSize(runner.ex.available_to_lay, 0)
                    #     )
                    # )   

def parse_stream(dir):
    """Run the market loop over every market file reachable from ``dir``.

    Args:
        dir: Iterable of directory or archive paths, handed to
            ``load_markets`` as-is.
    """
    # Output is not wired up yet; the intended CSV layout was:
    # with open("outputs/sample.csv", "w+") as output:
    #     output.write("market_id,selection_id,time,market_status,inplay_status,traded_volume,ltp,best_back,best_lay,best_back_volume,best_lay_volume\n")

    for market_file in load_markets(dir):
        # The already-open file object is passed where a path is expected.
        historical_stream = trading.streaming.create_historical_generator_stream(
            file_path=market_file,
            listener=listener,
        )
        loop_stream_markets(historical_stream)


# Run the extraction over a single tar archive.
# NOTE(review): this is an absolute, machine-specific path - consider moving
# it to a configurable data directory.
parse_stream(["/media/hdd/data/betfair-stream/thoroughbred/2021_06_JunRacingAUPro.tar"])

/media/hdd/data/betfair-stream/thoroughbred/2021_06_JunRacingAUPro.tar
Hamilton
Hamilton
Sapphire Coast
Scone
Rockhampton
Hamilton
Sapphire Coast
Scone
Rockhampton
Ballarat
None
None
None
None
Ballarat
Hamilton
None
None
None
None
None
None
None
Sapphire Coast
Scone
Rockhampton
Hamilton
Sapphire Coast
Scone
Rockhampton
Hamilton
Sapphire Coast
Scone
Rockhampton
Hamilton
Sapphire Coast
Scone
Rockhampton
Hamilton
Sapphire Coast
Hamilton
Scone
Rockhampton
Townsville
Ballarat
Townsville
Gosford
Ballarat
Quirindi
Townsville
Gosford
Ballarat
Quirindi
Townsville
Gosford
Ballarat
Quirindi
Townsville
Ballarat
Gosford
Quirindi
Ballarat
Townsville
Quirindi
Gosford
Sandown
Ballarat
Doomben
Gawler
Gosford
Warwick Farm
Ballarat
Sandown
Quirindi
Doomben
Gosford
Quirindi
Gawler
Warwick Farm
Sandown
Doomben
Gawler
Warwick Farm
Sandown
Doomben
Pakenham
Gawler
Taree
Warwick Farm
Belmont
Sandown
Pakenham
Doomben
Taree
Gawler
Warwick Farm
Belmont
Pakenham
Sandown
Doomben
Taree
Gawler
Warwick Farm
Pakenham
B

KeyboardInterrupt: 

In [4]:
# stream_file_dir = glob.glob("/media/hdd/data/betfair-stream/thoroughbred/*.tar")
# stream_file_dir

In [5]:
# file_name =  '/media/hdd/data/betfair-stream/thoroughbred/2021_03_MarRacingAUPro.tar'
# file_name

# re.search('.+(?=.tar)', file_name)

<re.Match object; span=(0, 66), match='/media/hdd/data/betfair-stream/thoroughbred/2021_>