In [1]:
import json
from pathlib import Path

import pandas as pd


# Input file that the parsing cell opens and decodes as JSON messages.
filename = "118771361_PLACE_PRO.json"

In [2]:
# Accumulator of per-runner record dicts; converted to a DataFrame once parsing is done.
records = []


def try_parse_json(string):
    """Decode ``string`` as JSON without letting a decode error propagate.

    Returns a ``(value, error)`` pair:

    * ``(decoded_value, None)`` when ``string`` is a valid JSON document;
    * ``(None, exc)`` when decoding fails, where ``exc`` is the
      ``json.JSONDecodeError`` that was raised.

    Only ``json.JSONDecodeError`` is intercepted; any other exception raised
    by ``json.loads`` still propagates to the caller.
    """
    try:
        decoded = json.loads(string)
    except json.JSONDecodeError as exc:
        return None, exc
    else:
        return decoded, None


# Read and process the JSON file
def extract_runner_records(message):
    """Flatten one decoded message into a list of per-runner record dicts.

    Parameters
    ----------
    message : object
        A value decoded from the input file. Only dicts whose ``"op"`` entry
        equals ``"mcm"`` produce records; anything else yields ``[]``.

    Returns
    -------
    list of dict
        One dict per entry of every ``"rc"`` list found under ``"mc"``, with
        the keys ``timestamp_unix``, ``market_id``, ``horse_id``,
        ``last_traded_price``, ``traded_volume``, ``in_play`` and ``trades``.

    Raises
    ------
    KeyError
        If an ``"mcm"`` message lacks ``"pt"``, or an ``"mc"`` / ``"rc"``
        entry lacks ``"id"``.
    """
    if not isinstance(message, dict) or message.get("op") != "mcm":
        return []

    pt = message["pt"]
    rows = []
    for mc_item in message.get("mc", []):
        market_id = mc_item["id"]
        # NOTE(review): an mc item without a "marketDefinition" is recorded as
        # in_play=False; confirm whether the last known state should instead
        # be carried forward between messages.
        in_play = mc_item.get("marketDefinition", {}).get("inPlay", False)
        for rc in mc_item.get("rc", []):
            rows.append(
                {
                    "timestamp_unix": pt,
                    "market_id": market_id,
                    "horse_id": rc["id"],
                    "last_traded_price": rc.get("ltp", 0),
                    "traded_volume": rc.get("tv", 0),
                    "in_play": in_play,
                    # Trade data is kept as the raw list found under "trd".
                    "trades": rc.get("trd", []),
                }
            )
    return rows


def iter_json_messages(lines):
    """Yield every complete JSON document found in an iterable of text lines.

    Lines are stripped and appended to a buffer, so a document may span
    several lines and a single line may hold several documents. After each
    line the buffer is drained of *all* complete documents, not just the
    first one. Text that never becomes a complete document is reported with a
    printed warning once the input is exhausted.
    """
    buffer = ""
    for line in lines:
        buffer += line.strip()
        while buffer:
            parsed, error = try_parse_json(buffer)
            if error is None:
                # The whole buffer is exactly one document.
                yield parsed
                buffer = ""
            elif error.msg.startswith("Extra data"):
                # The decoder reports where the first complete document ends,
                # so split there instead of guessing at a "}{" boundary.
                split_at = error.pos
                head, head_error = try_parse_json(buffer[:split_at])
                if head_error is not None:
                    break  # Defensive: should not happen for a reported split point.
                yield head
                buffer = buffer[split_at:]
            else:
                # Incomplete document so far; wait for the next line.
                break
    if buffer:
        print(
            f"Warning: {len(buffer)} trailing characters could not be parsed as JSON"
        )


with open(filename, "r", encoding="utf-8") as file:
    for message in iter_json_messages(file):
        records.extend(extract_runner_records(message))

In [3]:
# Build the DataFrame in a single call from the accumulated list of record
# dicts; each dict becomes one row and its keys become the columns.
df = pd.DataFrame(records)

In [4]:
# Print the frame's (rows, columns) shape, then render its last 50 rows as
# the cell output.
print(df.shape)
df.tail(50)

(33123, 7)


Unnamed: 0,timestamp_unix,market_id,horse_id,last_traded_price,traded_volume,in_play,trades
33073,1431699553100,1.118771361,5105924,0.0,0.0,False,[]
33074,1431699553294,1.118771361,5105924,0.0,0.0,False,[]
33075,1431699553408,1.118771361,5105924,0.0,0.0,False,[]
33076,1431699553408,1.118771361,5105924,1.01,390.25,False,"[[1.01, 390.25]]"
33077,1431699553408,1.118771361,7560122,0.0,0.0,False,[]
33078,1431699553408,1.118771361,7560122,1.01,877.99,False,"[[1.01, 877.99]]"
33079,1431699553408,1.118771361,8421889,0.0,0.0,False,[]
33080,1431699553408,1.118771361,8421889,27.0,15.99,False,"[[27, 3.98], [50, 12.01]]"
33081,1431699553408,1.118771361,5465145,0.0,0.0,False,[]
33082,1431699553408,1.118771361,5465145,990.0,0.1,False,"[[990, 0.1]]"


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the DataFrame.

df.describe()

Unnamed: 0,timestamp_unix,horse_id,last_traded_price,traded_volume
count,33123.0,33123.0,33123.0,33123.0
mean,1431690000000.0,6031468.0,0.735624,93.545505
std,24066520.0,1719332.0,19.034862,546.611356
min,1431517000000.0,3415981.0,0.0,0.0
25%,1431690000000.0,5105924.0,0.0,0.0
50%,1431699000000.0,5465145.0,0.0,0.0
75%,1431699000000.0,7560122.0,0.0,0.0
max,1431700000000.0,8421889.0,990.0,31150.43


In [6]:
# Column dtypes, non-null counts and memory usage of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33123 entries, 0 to 33122
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   timestamp_unix     33123 non-null  int64  
 1   market_id          33123 non-null  object 
 2   horse_id           33123 non-null  int64  
 3   last_traded_price  33123 non-null  float64
 4   traded_volume      33123 non-null  float64
 5   in_play            33123 non-null  bool   
 6   trades             33123 non-null  object 
dtypes: bool(1), float64(2), int64(2), object(2)
memory usage: 1.5+ MB


In [7]:
# Save the DataFrame to a CSV file (without the index column).
# The target directory is created first so the cell also works on a fresh
# checkout where "csv_files/" does not exist yet.
# NOTE(review): the list-valued "trades" column is written as its string
# representation; reading it back as lists needs an explicit parse step.
output_path = Path("csv_files/place_data_pro.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)