In [2]:
import pandas as pd
import numpy as np
import requests
import os
import re
import tarfile
import zipfile
import bz2
import glob
import logging
import yaml

from datetime import date, timedelta
from unittest.mock import patch
from typing import List, Set, Dict, Tuple, Optional
from itertools import zip_longest
import betfairlightweight
from betfairlightweight import StreamListener
from betfairlightweight.resources.bettingresources import (
    PriceSize,
    MarketBook
)

# Utility Functions
# _________________________________

def as_str(v) -> str:
    """Render a value as text.

    Floats are formatted with two decimal places, strings are returned
    unchanged, and any other type (ints, None, ...) becomes an empty string.
    The check is on the exact type, so subclasses of float/str also map to ''.
    """
    kind = type(v)
    if kind is float:
        return '%.2f' % v
    if kind is str:
        return v
    return ''

def split_anz_horse_market_name(market_name: str) -> (str, str, str):
    """Split a market name into its first three space-separated tokens.

    For example "R6 1400m Grp1" gives ("R6", "1400m", "grp1"): race number,
    race length, and race type. Only the race type is lower-cased.

    Raises:
        IndexError: if the name has fewer than three tokens.
    """
    tokens = market_name.split(' ')
    # token 0 -> race number (e.g. R6), token 1 -> length (e.g. 1400m),
    # token 2 -> race type, lower-cased (e.g. grp1, trot, pace)
    return (tokens[0], tokens[1], tokens[2].lower())


def load_markets(file_paths):
    """Yield bz2-decompressing binary file objects for every market file found.

    Args:
        file_paths: iterable of paths. Each path may be
            - a directory, searched recursively for ``*.bz2`` files,
            - a ``.tar`` archive, or
            - a ``.zip`` archive.
            Paths of any other kind are ignored. Each path is printed as it
            is visited.

    Yields:
        A readable file object per market file. The object is closed as soon
        as the generator is advanced (or closed), so it must be consumed
        before the next item is requested.
    """
    for file_path in file_paths:
        print(file_path)
        if os.path.isdir(file_path):
            # Join with a real path separator: concatenating '**' straight onto
            # the directory name also matched sibling directories that merely
            # share the same prefix.
            pattern = os.path.join(file_path, '**', '*.bz2')
            for path in glob.iglob(pattern, recursive=True):
                with bz2.BZ2File(path, 'rb') as f:
                    yield f
        elif os.path.isfile(file_path):
            ext = os.path.splitext(file_path)[1]
            # iterate through a tar archive
            if ext == '.tar':
                with tarfile.TarFile(file_path) as archive:
                    for member in archive:
                        # extractfile() returns None for directories/links
                        if not member.isfile():
                            continue
                        with archive.extractfile(member) as raw, bz2.open(raw) as f:
                            yield f
            # or a zip archive
            elif ext == '.zip':
                with zipfile.ZipFile(file_path) as archive:
                    for info in archive.infolist():
                        # directory entries carry no market data
                        if info.is_dir():
                            continue
                        with archive.open(info) as raw, bz2.open(raw) as f:
                            yield f

def slicePrice(l, n):
    """Return the ``price`` of rung ``n`` of ladder ``l``, or "" when unavailable.

    Any ordinary failure (index out of range, ``l`` is None, the rung has no
    ``price`` attribute, ...) yields an empty string. KeyboardInterrupt and
    SystemExit are deliberately not swallowed.
    """
    try:
        return l[n].price
    except Exception:
        return ""

def sliceSize(l, n):
    """Return the ``size`` of rung ``n`` of ladder ``l``, or "" when unavailable.

    Any ordinary failure (index out of range, ``l`` is None, the rung has no
    ``size`` attribute, ...) yields an empty string. KeyboardInterrupt and
    SystemExit are deliberately not swallowed.
    """
    try:
        return l[n].size
    except Exception:
        return ""

def pull_ladder(availableLadder, n = 5):
    """Summarise the first ``n`` rungs of a price ladder.

    Returns an empty dict for an empty ladder; otherwise a dict with
    "p" (list of rung prices) and "v" (list of rung sizes), both in ladder
    order and truncated to at most ``n`` entries.
    """
    if len(availableLadder) == 0:
        return {}

    top_rungs = availableLadder[0:n]
    return {
        "p": [level.price for level in top_rungs],
        "v": [level.size for level in top_rungs],
    }

In [3]:
# Credentials are read from a YAML file two directories above the notebook
# rather than being written into a cell.
with open("../../secrets.yaml", 'r') as stream:
    creds = yaml.safe_load(stream)

# API client built from the 'uid', 'pwd' and 'api_key' entries of that file.
trading = betfairlightweight.APIClient(creds['uid'], creds['pwd'],  app_key=creds["api_key"])

# One listener shared by every historical stream created later in the notebook.
# NOTE(review): max_latency=None presumably turns off latency checking, which
# would be meaningless when replaying files — confirm in the betfairlightweight docs.
listener = StreamListener(max_latency=None)

# Extraction Notes


## Scope

- We want to extract predominantly preplay price data
- We also want to extract multiple price points per runner
- We also want final results and BSPs
- We probably want the time granularities to be split into chunks (start at 30 mins before off, every minute until 10 mins, then every second until off)
- The market components I can currently think of extracting are:
    + Probably take the whole available to back ladder, available to lay ladder, and traded volume ladder up to a certain amount

## Conclusion
So we might want to filter for a few tracks because this is going to be a lot of data. Maybe filter on big Victorian tracks or something.
Also might want to split the extraction into 2 components: preplay, and runner summary so I can keep the below code pattern that I coded up for the angles piece initially.

In a separate script I figured out that the top 5 tracks by volume per market are: Flemington, Caulfield, Moonee Valley, Bendigo, Sandown. We need to be careful about the venue names in the stream files, which will invariably differ from these.

In [4]:
def filter_market(market: MarketBook) -> bool:
    """Decide whether a market book belongs in the extract.

    A market qualifies when its definition has country code 'AU', a venue in
    the hard-coded track list, market type 'WIN', and a race type (third token
    of the market name) that is neither 'trot' nor 'pace'. The market name is
    only parsed once the first three checks have passed.
    """
    definition = market.market_definition
    wanted_tracks = ['Bendigo', 'Sandown', 'Flemington', 'Caulfield', 'Moonee Valley']

    if not definition.country_code == 'AU':
        return False
    if not definition.venue in wanted_tracks:
        return False
    if not definition.market_type == 'WIN':
        return False

    race_type = split_anz_horse_market_name(definition.name)[2]
    # Exclude harness racing
    return race_type != 'trot' and race_type != 'pace'

# Preplay Prices

In [5]:
# Sampling schedule for the pre-play price log, in seconds relative to the
# scheduled off. These names are read as globals by loop_preplay_prices.
log1_Start = 60 * 30 # Seconds before scheduled off to start recording data for data segment one
log1_Step = 60       # Seconds between log steps for first data segment
log2_Start = 60 * 10  # Seconds before scheduled off to start recording data for data segment two
log2_Step = 1        # Seconds between log steps for second data segment

def loop_preplay_prices(s, o):
    """Write pre-play price snapshots from one historical stream to ``o``.

    Args:
        s: stream object exposing ``get_generator()``; calling the returned
            generator yields batches (lists) of market books.
        o: writable text handle. One comma-separated row is written per runner
            per logged snapshot, in the column order
            market_id, selection_id, time, traded_volume, wap, ltp,
            atb_ladder, atl_ladder. The two ladder columns are double-quoted.

    Sampling uses the module-level ``log1_Start``/``log1_Step`` and
    ``log2_Start``/``log2_Step`` settings: nothing is logged while the market
    is more than ``log1_Start`` seconds from its scheduled off; after that a
    snapshot is written whenever more than ``log1_Step`` seconds have passed
    since the previous one, tightening to ``log2_Step`` once inside the final
    ``log2_Start`` seconds. The very first book seen on the stream is always
    logged, and inplay books are never logged.

    Processing stops at the first batch whose leading book fails
    ``filter_market``.
    """
    # builtins.open is replaced by a function that returns its first argument,
    # so a stream created with an already-open file object as its file_path
    # gets that object back. NOTE(review): presumably the stream calls
    # open(file_path, mode) internally — confirm against betfairlightweight.
    with patch("builtins.open", lambda f, _: f):

        gen = s.get_generator()

        marketID = None
        time = None

        for market_books in gen():

            # Check if this market book meets our market filter
            # ________________________________

            if not filter_market(market_books[0]):
                break

            for market_book in market_books:

                # Time Step Management
                # _____________________

                if marketID is None:

                    # No market initialised
                    marketID = market_book.market_id
                    time = market_book.publish_time

                elif market_book.inplay:

                    # Stop once market goes inplay (leaves the current batch only)
                    break

                else:

                    seconds_to_start = (
                        market_book.market_definition.market_time - market_book.publish_time
                    ).total_seconds()

                    if seconds_to_start > log1_Start:

                        # Too early before off to start logging prices
                        continue

                    # Update data at different time steps depending on seconds to off
                    wait = log2_Step if seconds_to_start <= log2_Start else log1_Step

                    # New Market
                    if market_book.market_id != marketID:
                        marketID = market_book.market_id
                        time = market_book.publish_time
                    # (wait) seconds elapsed since last write
                    elif (market_book.publish_time - time).total_seconds() > wait:
                        time = market_book.publish_time
                    # fewer than (wait) seconds elapsed continue to next loop
                    else:
                        continue

                # Execute Data Logging
                # _____________________

                for runner in market_book.runners:

                    # Best effort: fall back to empty ladders on any ordinary
                    # error, but let KeyboardInterrupt/SystemExit through.
                    try:
                        atb_ladder = pull_ladder(runner.ex.available_to_back, n = 10)
                        atl_ladder = pull_ladder(runner.ex.available_to_lay, n = 10)
                    except Exception:
                        atb_ladder = {}
                        atl_ladder = {}

                    # Calculate Current Traded Volume + Traded WAP
                    limitTradedVol = sum(rung.size for rung in runner.ex.traded_volume)
                    if limitTradedVol == 0:
                        limitWAP = ""
                    else:
                        limitWAP = sum(rung.size * rung.price for rung in runner.ex.traded_volume) / limitTradedVol
                        limitWAP = round(limitWAP, 2)

                    # No space after the commas: a leading space would stop CSV
                    # readers from recognising the quoted ladder fields, whose
                    # contents contain commas, and would not match the header.
                    o.write(
                        "{},{},{},{},{},{},{},{}\n".format(
                            market_book.market_id,
                            runner.selection_id,
                            market_book.publish_time,
                            limitTradedVol,
                            limitWAP,
                            runner.last_price_traded or "",
                            '"' + str(atb_ladder).replace(' ','') + '"',
                            '"' + str(atl_ladder).replace(' ','') + '"'
                        )
                    )

def parse_preplay_prices(dir, out_file):
    """Extract pre-play price snapshots for every market file into one CSV.

    Args:
        dir: iterable of paths (directories, .tar or .zip archives) passed to
            ``load_markets``.
        out_file: path of the CSV to create; an existing file is truncated.

    A header row is written first, then ``loop_preplay_prices`` appends the
    rows for each market file in turn. Uses the module-level ``trading``
    client and ``listener``.
    """
    # Write to the path the caller asked for, not a module-level global.
    with open(out_file, "w+") as output:

        output.write("market_id,selection_id,time,traded_volume,wap,ltp,atb_ladder,atl_ladder\n")

        for file_obj in load_markets(dir):

            stream = trading.streaming.create_historical_generator_stream(
                file_path=file_obj,
                listener=listener,
            )

            loop_preplay_prices(stream, output)


# Destination CSV for the pre-play price extract. This is an absolute path on
# the author's machine and needs changing to run elsewhere.
output_file = "/media/hdd/tmp/thoroughbred-parsed/vic-tho-preplay-price-movements.csv"

# The extraction call below is commented out, so running this cell only
# defines the functions and output_file.
#parse_preplay_prices(["/media/hdd/data/betfair-stream/thoroughbred/2021_06_JunRacingAUPro.tar"], output_file)

# Market Meta

In [13]:
def final_market_book(s):
    """Return the last market book produced by a historical stream.

    Args:
        s: stream object exposing ``get_generator()``; calling the returned
            generator yields batches (lists) of market books.

    Returns:
        The final book of the final batch, or None when the stream yields no
        batches at all or when the leading book of any batch fails
        ``filter_market``.
    """
    # builtins.open is replaced by a function that returns its first argument,
    # so a stream created with an already-open file object as its file_path
    # gets that object back. NOTE(review): presumably the stream calls
    # open(file_path, mode) internally — confirm against betfairlightweight.
    with patch("builtins.open", lambda f, _: f):

        gen = s.get_generator()

        # Stays None if the generator never yields, instead of being unbound.
        last_market_book = None

        for market_books in gen():

            # Check if this market book meets our market filter
            # ________________________________

            if not filter_market(market_books[0]):
                return None

            last_market_book = market_books[-1]

        return last_market_book


def parse_final_selection_meta(dir):
    """Print a summary row for every non-removed runner of each qualifying market.

    Args:
        dir: iterable of paths (directories, .tar or .zip archives) passed to
            ``load_markets``.

    For each market file the final market book is obtained through
    ``final_market_book``; files for which it returns None are skipped. Each
    printed row is comma-separated:
    market_id, venue, market_time, selection_id, selection_name, win, sp
    where ``win`` is 1 when the runner's status is "WINNER" and 0 otherwise.
    Uses the module-level ``trading`` client and ``listener``.
    """
    for file_obj in load_markets(dir):

        stream = trading.streaming.create_historical_generator_stream(
            file_path=file_obj,
            listener=listener,
        )

        last_market_book = final_market_book(stream)

        if last_market_book is None:
            continue

        # Extract Info
        # _____________________________________

        definition = last_market_book.market_definition

        # Runner names are looked up on the market definition by selection id.
        # Built once per market; the first entry wins for a repeated id.
        selection_names = {}
        for runner_def in definition.runners:
            selection_names.setdefault(runner_def.selection_id, runner_def.name)

        # Return Info
        # _____________________________________

        for runner in last_market_book.runners:

            if runner.status == 'REMOVED':
                continue

            # print() already ends the line, so the row carries no "\n" of its
            # own (that produced a blank line after every row).
            print(
                "{},{},{},{},{},{},{}".format(
                    str(last_market_book.market_id),
                    definition.venue,
                    definition.market_time,
                    runner.selection_id,
                    selection_names.get(runner.selection_id),
                    int(runner.status == "WINNER"),
                    # NOTE(review): guards against a runner with no sp object —
                    # confirm whether betfairlightweight can leave sp as None.
                    getattr(runner.sp, "actual_sp", None)
                )
            )

# Run the runner-summary extraction over the 2021_06 archive; rows are printed
# to the cell output. The path is absolute and specific to the author's machine.
parse_final_selection_meta(["/media/hdd/data/betfair-stream/thoroughbred/2021_06_JunRacingAUPro.tar"])





/media/hdd/data/betfair-stream/thoroughbred/2021_06_JunRacingAUPro.tar
1.184024305,Sandown,2021-06-02 02:25:00,39648870,1. Diamond Dagger,0,12.27

1.184024305,Sandown,2021-06-02 02:25:00,39648871,3. Finch N Chips,0,13.5

1.184024305,Sandown,2021-06-02 02:25:00,39648873,5. Nuriya,0,86.05

1.184024305,Sandown,2021-06-02 02:25:00,39308845,6. Oceans Jen,1,7.8

1.184024305,Sandown,2021-06-02 02:25:00,39509292,7. Red Sista,0,8.3

1.184024305,Sandown,2021-06-02 02:25:00,39648874,8. Surangani,0,69.79

1.184024305,Sandown,2021-06-02 02:25:00,31554076,9. Write The Score,0,2.19

1.184024305,Sandown,2021-06-02 02:25:00,39089152,10. Wyld Savanna,0,8.38

1.184024307,Sandown,2021-06-02 03:00:00,39648875,1. Aktolgali,0,188.85

1.184024307,Sandown,2021-06-02 03:00:00,39648876,2. Jungle Magnate,1,10.95

1.184024307,Sandown,2021-06-02 03:00:00,2241826,3. Royal Fox,0,3.45

1.184024307,Sandown,2021-06-02 03:00:00,39648877,4. Arohaboy,0,41.29

1.184024307,Sandown,2021-06-02 03:00:00,8949957,5. Back In The D

KeyboardInterrupt: 