In [2]:
import pandas as pd
import json


# Path to the raw race-data JSON file that this notebook loads.
filename = "race_data_168096594.json"

In [5]:
# Accumulator for the flattened rows: one dict per `rc` entry found in the file.
# Re-created on every run of this cell so re-execution does not duplicate rows.
records = []


def try_parse_json(string):
    """Attempt to decode ``string`` as JSON without raising on bad syntax.

    Returns a ``(value, error)`` pair:

    * ``(decoded_value, None)`` when ``string`` is valid JSON;
    * ``(None, exc)`` when decoding fails, where ``exc`` is the
      ``json.JSONDecodeError`` that was raised.

    Any other exception type is not caught and propagates to the caller.
    """
    failure = None
    try:
        decoded = json.loads(string)
    except json.JSONDecodeError as exc:
        # Hand the error back to the caller instead of raising it.
        decoded, failure = None, exc
    return decoded, failure


def _records_from_message(message):
    """Flatten one decoded message into a list of per-runner record dicts.

    Only dict messages whose ``"op"`` value is ``"mcm"`` produce records; any
    other value yields an empty list. One record is produced for every ``rc``
    entry of every ``mc`` item, carrying the message's ``pt`` value and the
    ``mc`` item's ``id``.

    Raises ``KeyError`` if an ``"mcm"`` message lacks ``"pt"``, an ``mc`` item
    lacks ``"id"``, or an ``rc`` entry lacks ``"id"``.
    """
    if not isinstance(message, dict) or message.get("op") != "mcm":
        return []

    pt = message["pt"]
    rows = []
    for mc_item in message.get("mc", []):
        market_id = mc_item["id"]
        for rc in mc_item.get("rc", []):
            batb = rc.get("batb") or []
            # NOTE(review): this takes field [1] of the SECOND batb entry and
            # yields None when fewer than two entries are present. Presumably
            # each entry is [level, price, size]; if so, the best price would
            # be the entry whose level is 0, not necessarily index 1.
            # Behaviour is kept as-is; confirm against the feed specification.
            best_available_to_back = batb[1][1] if len(batb) > 1 else None
            rows.append(
                {
                    "timestamp_unix": pt,
                    "market_id": market_id,
                    "horse_id": rc["id"],
                    "best_available_to_back": best_available_to_back,
                    "last_traded_price": rc.get("ltp"),
                    "traded_volume": rc.get("tv"),
                }
            )
    return rows


# raw_decode() parses ONE JSON value from the start of a string and reports
# where it ended, so back-to-back objects ("{...}{...}") can be consumed one
# after another without guessing at split points.
_decoder = json.JSONDecoder()

with open(filename, "r") as file:
    buffer = ""
    for line in file:
        buffer += line.strip()
        # Drain every complete JSON value currently sitting in the buffer.
        while buffer:
            try:
                message, end = _decoder.raw_decode(buffer)
            except json.JSONDecodeError:
                # Incomplete value: keep the buffer and wait for more lines.
                break
            records.extend(_records_from_message(message))
            # raw_decode() rejects leading whitespace, so trim the remainder.
            buffer = buffer[end:].lstrip()

# Anything still buffered at end-of-file never formed a complete JSON value;
# report it instead of dropping it silently.
if buffer:
    print(f"Warning: {len(buffer)} trailing characters could not be parsed as JSON")

In [6]:
# Tabulate the collected records: one row per dict, one column per key
df = pd.DataFrame(data=records)


In [9]:
# Inspect the last 1000 rows; pandas truncates the rendered table, so only the
# first and last few of those rows are actually shown.
df.tail(1000)

Unnamed: 0,timestamp_unix,market_id,horse_id,best_available_to_back,last_traded_price,traded_volume
34336,1580573868503,1.168096594,22972635,9.80,10.50,20585.65
34337,1580573868503,1.168096594,16927442,,22.00,12517.58
34338,1580573868503,1.168096594,22101589,3.95,4.00,229729.74
34339,1580573868503,1.168096594,23405261,29.00,30.00,7551.54
34340,1580573868503,1.168096594,21848159,9.80,13.50,76375.33
...,...,...,...,...,...,...
35331,1580573985099,1.168096594,16927442,75.00,1000.00,13256.69
35332,1580573985099,1.168096594,22101589,,1.01,320188.44
35333,1580573985099,1.168096594,23405261,8.00,1000.00,9617.63
35334,1580573985099,1.168096594,21848159,10.00,1000.00,78481.91


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): market_id is missing from the summary, presumably because it is held as strings; confirm.

df.describe()


Unnamed: 0,timestamp_unix,horse_id,best_available_to_back,last_traded_price,traded_volume
count,35336.0,35336.0,12138.0,35336.0,35336.0
mean,1580556000000.0,20155430.0,11.731981,16.002382,14500.115202
std,23999940.0,2969125.0,18.251229,59.155546,30019.736814
min,1580469000000.0,14765350.0,1.01,0.0,0.0
25%,1580549000000.0,17596200.0,5.0,5.7,1262.59
50%,1580568000000.0,21848160.0,8.4,8.8,4210.81
75%,1580573000000.0,22101590.0,13.5,15.5,14681.7375
max,1580574000000.0,23405260.0,650.0,1000.0,320188.44


In [11]:
# (number of rows, number of columns) of the parsed frame
df.shape

(35336, 6)