In [1]:
import json
from pathlib import Path

import pandas as pd


# Path of the raw JSON input file that the parsing cell opens.
filename = "place_data.json"

In [2]:
# Accumulates one dict per runner change ("rc" entry) found while parsing the file.
records = []


def try_parse_json(string):
    """Attempt to decode ``string`` as a single JSON document.

    Returns a ``(value, error)`` pair. On success ``value`` is the decoded
    object and ``error`` is ``None``; when decoding fails with
    ``json.JSONDecodeError`` the pair is ``(None, <the exception>)``.
    """
    try:
        decoded = json.loads(string)
    except json.JSONDecodeError as exc:
        return None, exc
    else:
        return decoded, None


# Read and process the JSON file
def _records_from_message(message):
    """Flatten one decoded message into a list of per-runner record dicts.

    Only dict messages whose ``"op"`` equals ``"mcm"`` produce records; any
    other document yields an empty list. One record is built for every
    ``"rc"`` entry of every ``"mc"`` item.

    Raises:
        KeyError: if an ``mcm`` message lacks ``"pt"``, or an ``mc``/``rc``
            entry lacks ``"id"``.
    """
    if not isinstance(message, dict) or message.get("op") != "mcm":
        return []
    pt = message["pt"]
    rows = []
    for mc_item in message.get("mc", []):
        market_id = mc_item["id"]
        # NOTE(review): when an mc item carries no "marketDefinition" this
        # falls back to False rather than the last value seen for the market
        # -- confirm that is the intended meaning of "in_play".
        in_play = mc_item.get("marketDefinition", {}).get("inPlay", False)
        for rc in mc_item.get("rc", []):
            rows.append(
                {
                    "timestamp_unix": pt,
                    "market_id": market_id,
                    "horse_id": rc["id"],
                    # 0 stands in for a missing "ltp" / "tv" value
                    "last_traded_price": rc.get("ltp", 0),
                    "traded_volume": rc.get("tv", 0),
                    "in_play": in_play,
                    # Trade data is kept as a list
                    "trades": rc.get("trd", []),
                }
            )
    return rows


decoder = json.JSONDecoder()

# JSON text is UTF-8; be explicit instead of relying on the platform default.
with open(filename, "r", encoding="utf-8") as file:
    buffer = ""
    for line in file:
        buffer += line.strip()
        # Drain every complete JSON document currently in the buffer.
        # raw_decode reports where each document ends, so several objects on
        # one line (with or without whitespace between them) are all
        # consumed, and an object spanning lines simply waits for more input.
        while buffer:
            try:
                message, end = decoder.raw_decode(buffer)
            except json.JSONDecodeError:
                break  # incomplete document: keep buffering
            records.extend(_records_from_message(message))
            # raw_decode does not skip leading whitespace, so trim it here
            buffer = buffer[end:].lstrip()

# Surface unparsed trailing input instead of dropping it silently.
if buffer:
    print(
        f"Warning: {len(buffer)} trailing characters in {filename} "
        "could not be parsed as JSON and were skipped"
    )

In [3]:
# Create a DataFrame from the records
# Naming the columns explicitly keeps the schema (and column order) stable
# even when no records were parsed, so an empty result still has all fields.
df = pd.DataFrame(
    records,
    columns=[
        "timestamp_unix",
        "market_id",
        "horse_id",
        "last_traded_price",
        "traded_volume",
        "in_play",
        "trades",
    ],
)

In [4]:
# Report the frame's (rows, columns) shape, then preview its last 50 rows
print(df.shape)
df.iloc[-50:]


(23151, 7)


Unnamed: 0,timestamp_unix,market_id,horse_id,last_traded_price,traded_volume,in_play,trades
23101,1580573964477,1.168096597,21848159,1.11,10379.79,False,"[[1.11, 19.28], [1.21, 4.72], [1.27, 8]]"
23102,1580573964477,1.168096597,17596196,1.04,10326.46,False,"[[1.04, 54.84], [1.06, 34], [1.07, 4], [1.08, ..."
23103,1580573965516,1.168096597,14765349,5.8,6824.5,False,[]
23104,1580573965516,1.168096597,22101589,1.1,11669.83,False,[]
23105,1580573965516,1.168096597,17596196,1.04,10330.46,False,"[[1.04, 58.84]]"
23106,1580573966512,1.168096597,22101589,1.02,11709.83,False,"[[1.02, 38.08], [1.1, 50]]"
23107,1580573967494,1.168096597,22046306,2.16,3300.87,False,[]
23108,1580573967494,1.168096597,16927442,2.0,2541.85,False,[]
23109,1580573967494,1.168096597,22101589,1.02,11709.83,False,[]
23110,1580573967494,1.168096597,21848159,1.11,10379.79,False,[]


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

df.describe()

Unnamed: 0,timestamp_unix,horse_id,last_traded_price,traded_volume
count,23151.0,23151.0,23151.0,23151.0
mean,1580558000000.0,20187520.0,3.394366,1236.235645
std,18093090.0,3020924.0,2.005571,2129.182236
min,1580495000000.0,14765350.0,0.0,0.0
25%,1580550000000.0,17596200.0,2.12,102.65
50%,1580565000000.0,22023490.0,2.84,420.18
75%,1580573000000.0,22101590.0,4.1,1186.69
max,1580574000000.0,23405260.0,16.0,11795.21


In [7]:
# Column dtypes, non-null counts and memory usage of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23151 entries, 0 to 23150
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   timestamp_unix     23151 non-null  int64  
 1   market_id          23151 non-null  object 
 2   horse_id           23151 non-null  int64  
 3   last_traded_price  23151 non-null  float64
 4   traded_volume      23151 non-null  float64
 5   in_play            23151 non-null  bool   
 6   trades             23151 non-null  object 
dtypes: bool(1), float64(2), int64(2), object(2)
memory usage: 1.1+ MB


In [6]:
# save the dataframe to a csv file
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing.
output_path = Path("csv_files/place_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)