In [8]:
import json
from pathlib import Path

import pandas as pd


# Input file read by the parsing cell below; it is opened in text mode and
# decoded as a sequence of JSON messages.
filename = "118771360_WIN_PRO.json"

In [9]:
# Accumulator for the flattened rows (one dict per runner change per message).
# Re-initialised here so re-running this cell does not duplicate rows.
records = []


def try_parse_json(string):
    """Attempt to decode ``string`` as a single JSON document.

    Args:
        string: Text to hand to ``json.loads``.

    Returns:
        tuple: ``(value, None)`` when decoding succeeds, otherwise
        ``(None, error)`` where ``error`` is the ``json.JSONDecodeError``
        that was raised.
    """
    try:
        decoded = json.loads(string)
    except json.JSONDecodeError as exc:
        # Hand the exception back instead of raising so the caller can
        # inspect the failure reason (e.g. "Extra data").
        return None, exc
    return decoded, None


def _records_from_message(message):
    """Flatten one decoded message into a list of per-runner record dicts.

    Only dict messages whose "op" is "mcm" produce rows; any other value
    yields an empty list. One row is built for every entry of every
    market's "rc" list.

    Args:
        message: A value decoded from the input file.

    Returns:
        list[dict]: Rows with the keys "timestamp_unix", "market_id",
        "horse_id", "last_traded_price", "traded_volume", "in_play" and
        "trades".

    Raises:
        KeyError: If an "mcm" message has no "pt", or a market / runner
            entry has no "id".
    """
    if not isinstance(message, dict) or message.get("op") != "mcm":
        return []

    pt = message["pt"]
    rows = []
    for mc_item in message.get("mc", []):
        market_id = mc_item["id"]
        # NOTE(review): in_play falls back to False whenever this message
        # carries no "marketDefinition". If the feed only sends the definition
        # when it changes, rows in between will read False -- confirm, and
        # carry the last seen value forward per market if so.
        in_play = mc_item.get("marketDefinition", {}).get("inPlay", False)
        for rc in mc_item.get("rc", []):
            rows.append(
                {
                    "timestamp_unix": pt,
                    "market_id": market_id,
                    "horse_id": rc["id"],
                    # 0 stands in for "field absent from this update"; it is
                    # a placeholder, not an observed price or volume.
                    "last_traded_price": rc.get("ltp", 0),
                    "traded_volume": rc.get("tv", 0),
                    "in_play": in_play,
                    # Trade data is kept as the raw list.
                    "trades": rc.get("trd", []),
                }
            )
    return rows


def _drain_buffer(buffer, decoder, sink):
    """Decode every complete JSON value at the front of ``buffer``.

    Rows extracted from each decoded value are appended to ``sink``.

    Args:
        buffer: Text accumulated so far from the input file.
        decoder: A ``json.JSONDecoder`` used for ``raw_decode``.
        sink: List that receives the extracted rows.

    Returns:
        str: The undecoded remainder (empty, or the start of a value that
        is not complete yet).
    """
    while True:
        # raw_decode does not skip leading whitespace itself.
        buffer = buffer.lstrip()
        if not buffer:
            return buffer
        try:
            message, end = decoder.raw_decode(buffer)
        except json.JSONDecodeError:
            # Incomplete (or malformed) value: keep it and wait for more input.
            return buffer
        sink.extend(_records_from_message(message))
        buffer = buffer[end:]


# Read and process the JSON file
with open(filename, "r", encoding="utf-8") as file:
    decoder = json.JSONDecoder()
    buffer = ""
    for line in file:
        # A JSON object may span several lines, and one line may hold several
        # concatenated objects; the buffer plus drain loop handles both.
        buffer += line.strip()
        buffer = _drain_buffer(buffer, decoder, records)

if buffer:
    # Surface leftover input instead of dropping it silently.
    print(f"Warning: {len(buffer)} unparsed characters left at end of {filename}")

In [10]:
# Convert records to a DataFrame: one row per dict in `records`, with the
# dict keys becoming the columns.
df = pd.DataFrame(records)

In [11]:
# Display the DataFrame: print its (rows, columns) shape, then show the
# last 50 rows as the cell output.
print(df.shape)
df.tail(50)

(99545, 7)


Unnamed: 0,timestamp_unix,market_id,horse_id,last_traded_price,traded_volume,in_play,trades
99495,1431699557120,1.11877136,5465145,0.0,0.0,False,[]
99496,1431699557143,1.11877136,7401388,0.0,0.0,False,[]
99497,1431699557143,1.11877136,3415981,0.0,0.0,False,[]
99498,1431699557143,1.11877136,5105924,0.0,0.0,False,[]
99499,1431699557143,1.11877136,8421889,0.0,0.0,False,[]
99500,1431699557143,1.11877136,5465145,0.0,0.0,False,[]
99501,1431699557221,1.11877136,7401388,0.0,0.0,False,[]
99502,1431699557221,1.11877136,3415981,0.0,0.0,False,[]
99503,1431699557221,1.11877136,5105924,0.0,0.0,False,[]
99504,1431699557221,1.11877136,7560122,0.0,0.0,False,[]


In [12]:
# Summary statistics for the DataFrame.
# Caveat: the parsing cell fills a missing "ltp" / "tv" with 0, so the zeros in
# last_traded_price and traded_volume are placeholders and pull the mean and
# quantiles down.

df.describe()

Unnamed: 0,timestamp_unix,horse_id,last_traded_price,traded_volume
count,99545.0,99545.0,99545.0,99545.0
mean,1431691000000.0,6367231.0,5.493305,4299.136051
std,19741630.0,1642249.0,52.71113,13884.807337
min,1431517000000.0,3415981.0,0.0,0.0
25%,1431692000000.0,5105924.0,0.0,0.0
50%,1431699000000.0,5465145.0,0.0,0.0
75%,1431699000000.0,7560122.0,0.0,0.0
max,1431700000000.0,8421889.0,1000.0,449236.93


In [13]:
# Column dtypes, non-null counts and memory usage of the parsed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99545 entries, 0 to 99544
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   timestamp_unix     99545 non-null  int64  
 1   market_id          99545 non-null  object 
 2   horse_id           99545 non-null  int64  
 3   last_traded_price  99545 non-null  float64
 4   traded_volume      99545 non-null  float64
 5   in_play            99545 non-null  bool   
 6   trades             99545 non-null  object 
dtypes: bool(1), float64(2), int64(2), object(2)
memory usage: 4.7+ MB


In [14]:
# Save the DataFrame to a CSV file
output_path = Path("csv_files/win_data_pro.csv")

# to_csv does not create missing directories, so make sure the target folder
# exists; otherwise this cell fails on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: the list-valued "trades" column is written in its string form, so it
# has to be parsed back into lists when the CSV is reloaded.
df.to_csv(output_path, index=False)