# Loading LOB data into a dataframe from individual files and storing the output as a CSV

The aim of this is to iterate through all of the LOB data provided, processing it into a format that can be used for feature extraction.

The mid price at each timestamp has been added as this is likely to be used as a label for our early models.

In [1]:
import os
import pandas as pd
import re

# Patterns capturing the inner text of a non-empty bid / ask level list,
# e.g. "323, 2], [104, 3" out of "['bid', [[323, 2], [104, 3]]]".
# Compiled once here because parse_line is called for every line of every file.
_BID_PATTERN = re.compile(r'\[\'bid\', \[\[(.*?)\]\]\]')
_ASK_PATTERN = re.compile(r'\[\'ask\', \[\[(.*?)\]\]\]')


def _parse_levels(pattern, book_text):
    """Return the integer pairs captured by ``pattern`` in ``book_text``, or [] if there is no match."""
    match = pattern.search(book_text)
    if match is None:
        # An empty side (e.g. "['bid', []]") never matches the "[[...]]" shape
        return []
    # The captured text looks like "323, 2], [104, 3": split into pairs, then into ints
    return [[int(value) for value in pair.split(', ')] for pair in match.group(1).split('], [')]


# Define function to process each line in the file
def parse_line(line):
    """Parse one raw LOB line into (timestamp, exchange, bid, ask).

    The line is expected to look like
    ``[1.333, 'Exch0', [['bid', [[1, 6]]], ['ask', [[800, 1]]]]]``.

    Args:
        line: One line of text from a LOB file; surrounding whitespace is ignored.

    Returns:
        A tuple ``(timestamp, exchange, bid, ask)`` where ``timestamp`` is a float,
        ``exchange`` is the exchange name without its single quotes, and ``bid`` /
        ``ask`` are lists of integer pairs (empty when that side has no entries).

    Raises:
        IndexError: If the line has fewer than three comma-separated fields.
        ValueError: If the timestamp is not a valid float or a level value is
            not a valid integer.
    """
    # Drop the outer brackets, then split off only the first two fields so the
    # nested bid/ask structure stays intact in parts[2]
    parts = line.strip()[1:-1].split(', ', 2)
    timestamp = float(parts[0])
    exchange = parts[1].strip('\'')
    # Remove the brackets wrapping the [bid, ask] pair
    bid_ask_data = parts[2][1:-1]
    bid = _parse_levels(_BID_PATTERN, bid_ask_data)
    ask = _parse_levels(_ASK_PATTERN, bid_ask_data)
    return timestamp, exchange, bid, ask

# Define a function to extract first value from an array
# To be used for extracting the level 1 bid and ask prices
def extract_first_value(arr):
    """Return the first element of the first entry in ``arr``, or None if ``arr`` is empty.

    Used to pull the level 1 price out of a bid or ask list.
    """
    return arr[0][0] if len(arr) > 0 else None

directory = 'data/lob/'
output_dir = 'data/output'
output_filename = 'lob_output_data.csv'  # Output CSV file
output_csv = os.path.join(output_dir, output_filename)

# Create the output directory if it doesn't exist yet
os.makedirs(output_dir, exist_ok=True)

# Truncate (or create) the output file so a re-run never appends to old results
with open(output_csv, 'w'):
    pass

# The header row is written only with the first non-empty DataFrame
write_header = True

# Loop through files in folder; os.listdir gives no ordering guarantee, so sort
# the names to make the row order of the CSV reproducible between runs
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".txt"):
        continue

    # Extract date from filename (once per file, it is the same for every line)
    date_match = re.search(r'(\d{4}-\d{2}-\d{2})', filename)
    date = date_match.group(1) if date_match else None

    data = []  # Initialize data list for each file
    with open(os.path.join(directory, filename), 'r') as file:
        # Loop through lines in text file
        for line in file:
            # Skip blank lines (e.g. a trailing newline) that parse_line cannot handle
            if not line.strip():
                continue
            # Extract data from line
            timestamp, exchange, bid, ask = parse_line(line)
            # Append extracted values to the data list
            data.append({'Timestamp': timestamp, 'Exchange': exchange, 'Bid': bid, 'Ask': ask, 'Date': date})

    # An empty file would give a DataFrame without 'Bid'/'Ask' columns, so skip it
    if not data:
        continue

    # Create DataFrame from list
    df = pd.DataFrame(data)

    # Calculate the mid price from the level 1 bid and ask prices and add it to the df as a new column
    # (rows where either side is empty get a missing Mid_Price)
    best_bid = df['Bid'].apply(extract_first_value)
    best_ask = df['Ask'].apply(extract_first_value)
    df['Mid_Price'] = (best_bid + best_ask) / 2

    # Append DataFrame to CSV file
    df.to_csv(output_csv, mode='a', header=write_header, index=False)
    write_header = False

In [2]:
# Reload the combined CSV from disk into a single DataFrame
# NOTE(review): Bid and Ask were written out as list text, so they presumably
# come back as strings rather than lists - confirm before using them numerically
df = pd.read_csv(output_csv)

In [3]:
# Display the reloaded DataFrame to inspect the result
df

Unnamed: 0,Timestamp,Exchange,Bid,Ask,Date,Mid_Price
0,0.000,Exch0,[],[],2025-01-02,
1,0.279,Exch0,"[[1, 6]]",[],2025-01-02,
2,1.333,Exch0,"[[1, 6]]","[[800, 1]]",2025-01-02,400.5
3,1.581,Exch0,"[[1, 6]]","[[799, 1]]",2025-01-02,400.0
4,1.643,Exch0,"[[1, 6]]","[[798, 1]]",2025-01-02,399.5
...,...,...,...,...,...,...
1037929,30599.418,Exch0,"[[323, 2], [104, 3], [63, 1], [44, 6]]","[[338, 1], [343, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5
1037930,30599.449,Exch0,"[[323, 2], [99, 3], [63, 1], [44, 6]]","[[338, 1], [343, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5
1037931,30599.635,Exch0,"[[323, 2], [99, 3], [63, 1], [44, 6]]","[[338, 1], [341, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5
1037932,30599.697,Exch0,"[[323, 2], [249, 1], [99, 3], [44, 6]]","[[338, 1], [341, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5
