In [1]:
import json
from pathlib import Path

import pandas as pd


# Path of the raw race-data JSON file that the parsing cell opens for reading.
filename = "race_data_168096594.json"

In [2]:
# Accumulator for the flattened rows: one dict is appended per runner-change
# ("rc") entry found while parsing the input file.
records = []


def try_parse_json(string):
    """Attempt to decode ``string`` as a single JSON document.

    Returns a ``(value, error)`` pair:

    * ``(decoded_value, None)`` when decoding succeeds;
    * ``(None, exc)`` when ``json.loads`` raises ``json.JSONDecodeError``,
      where ``exc`` is the raised exception (its message tells the caller
      what went wrong, e.g. "Extra data" or an unterminated value).
    """
    try:
        decoded = json.loads(string)
    except json.JSONDecodeError as exc:
        return None, exc
    else:
        return decoded, None


# Read and process the JSON file
def _records_from_message(message):
    """Flatten one decoded stream message into a list of record dicts.

    Only dict messages whose "op" is "mcm" produce rows; any other message
    yields an empty list. One row is produced for every "rc" entry of every
    "mc" item in the message.

    Raises:
        KeyError: if an "mcm" message lacks "pt", an "mc" item lacks "id",
            or an "rc" entry lacks "id".
    """
    if not isinstance(message, dict) or message.get("op") != "mcm":
        return []

    pt = message["pt"]
    rows = []
    for mc_item in message.get("mc", []):
        market_id = mc_item["id"]
        # NOTE(review): an item without "marketDefinition" is recorded as
        # in_play=False, and a missing "ltp"/"tv" as 0. If the feed only sends
        # these fields when they change, the defaults do not reflect the real
        # state -- confirm against the feed specification.
        in_play = mc_item.get("marketDefinition", {}).get("inPlay", False)
        for rc in mc_item.get("rc", []):
            rows.append(
                {
                    "timestamp_unix": pt,
                    "market_id": market_id,
                    "horse_id": rc["id"],
                    "last_traded_price": rc.get("ltp", 0),
                    "traded_volume": rc.get("tv", 0),
                    "in_play": in_play,
                    "trades": rc.get("trd", []),  # Trade data as a list
                }
            )
    return rows


def _split_complete_json(buffer, decoder):
    """Decode every complete JSON value at the start of ``buffer``.

    Returns ``(values, remainder)`` where ``values`` is the list of decoded
    values in order and ``remainder`` is the undecoded tail (an incomplete
    value still waiting for more input, or an empty string).
    """
    values = []
    pos = 0
    while pos < len(buffer):
        try:
            value, pos = decoder.raw_decode(buffer, pos)
        except json.JSONDecodeError:
            # Incomplete (or invalid) value: keep it for the next read.
            break
        values.append(value)
        # raw_decode does not accept leading whitespace, so skip any that
        # separates two concatenated values.
        while pos < len(buffer) and buffer[pos].isspace():
            pos += 1
    return values, buffer[pos:]


_decoder = json.JSONDecoder()

with open(filename, "r") as file:
    buffer = ""
    for line in file:
        # A JSON object may span several lines, and one line may hold several
        # concatenated objects; accumulate text and drain whatever is complete.
        buffer += line.strip()
        messages, buffer = _split_complete_json(buffer, _decoder)
        for message in messages:
            records.extend(_records_from_message(message))

# Surface truncated/corrupt input instead of silently dropping it.
if buffer:
    print(
        f"Warning: {len(buffer)} unparsed characters left at end of {filename}"
    )

In [3]:
# Convert records to a DataFrame in a single call: each dict in `records`
# becomes one row and the dict keys become the columns.
df = pd.DataFrame(records)


In [7]:
# Display the DataFrame: print its (rows, columns) shape, then render the
# last 50 rows as the cell's output.
print(df.shape)
df.tail(50)

(35336, 7)


Unnamed: 0,timestamp_unix,market_id,horse_id,last_traded_price,traded_volume,in_play,trades
35286,1580573976488,1.168096594,22101589,1.02,304502.59,False,"[[1.02, 5881.06], [1.03, 9507.59], [1.04, 1367..."
35287,1580573976488,1.168096594,21848159,1000.0,78481.61,False,[]
35288,1580573976488,1.168096594,17596196,27.0,90785.35,False,"[[21, 8.98], [22, 102.78], [27, 5.82], [30, 0...."
35289,1580573977518,1.168096594,22023487,1000.0,28654.87,False,[]
35290,1580573977518,1.168096594,22972635,1000.0,23618.82,False,[]
35291,1580573977518,1.168096594,16927442,1000.0,13255.95,False,[]
35292,1580573977518,1.168096594,22101589,1.01,309231.75,False,"[[1.01, 100.32], [1.02, 9939.81], [1.03, 9841...."
35293,1580573977518,1.168096594,23405261,1000.0,9617.32,False,[]
35294,1580573977518,1.168096594,17596196,50.0,90791.33,False,"[[27, 9.22], [34, 0.39], [50, 2.27]]"
35295,1580573978505,1.168096594,22023487,1000.0,28654.96,False,"[[1000, 9.68]]"


In [10]:
# Summary statistics of the DataFrame (for a mixed-dtype frame, pandas
# describes only the numeric columns by default).

df.describe()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35336 entries, 0 to 35335
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   timestamp_unix     35336 non-null  int64  
 1   market_id          35336 non-null  object 
 2   horse_id           35336 non-null  int64  
 3   last_traded_price  35336 non-null  float64
 4   traded_volume      35336 non-null  float64
 5   in_play            35336 non-null  bool   
 6   trades             35336 non-null  object 
dtypes: bool(1), float64(2), int64(2), object(2)
memory usage: 1.7+ MB


In [11]:
# Print the column dtypes, non-null counts and memory usage of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35336 entries, 0 to 35335
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   timestamp_unix     35336 non-null  int64  
 1   market_id          35336 non-null  object 
 2   horse_id           35336 non-null  int64  
 3   last_traded_price  35336 non-null  float64
 4   traded_volume      35336 non-null  float64
 5   in_play            35336 non-null  bool   
 6   trades             35336 non-null  object 
dtypes: bool(1), float64(2), int64(2), object(2)
memory usage: 1.7+ MB


In [9]:
# Save the DataFrame to a CSV file

# Create the output directory first: to_csv does not create missing parent
# directories, so the cell would otherwise fail on a fresh checkout.
output_path = Path("csv_files") / "race_data.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: the "trades" column holds Python lists; CSV stores them as their text
# representation, so they must be parsed again after reading the file back.
df.to_csv(output_path, index=False)