# Loading LOB Data into Dataframe

In [1]:
import os
import pandas as pd
import re

## Iterate over files in data folder to form a DataFrame

In [4]:
data = []

# Define function to process each line the file
def parse_line(line):
    # trim and split
    parts = line.strip()[1:-1].split(', ', 2)
    # extracting values
    timestamp = float(parts[0])
    exchange = parts[1].strip('\'')
    # extracting bid and ask values
    bid_ask_data = parts[2][1:-1]
    bid_data = re.findall(r'\[\'bid\', \[\[(.*?)\]\]\]', bid_ask_data)
    bid = [list(map(int, pair.split(', '))) for pair in bid_data[0].split('], [')] if bid_data else []
    ask_data = re.findall(r'\[\'ask\', \[\[(.*?)\]\]\]', bid_ask_data)
    ask = [list(map(int, pair.split(', '))) for pair in ask_data[0].split('], [')] if ask_data else []
    return timestamp, exchange, bid, ask

directory = 'data/lob/'

# Loop through files in folder
for filename in os.listdir(directory):
    # Extract data from .txt files
    if filename.endswith(".txt"):
        with open(os.path.join(directory, filename), 'r') as file:
            for line in file:
                timestamp, exchange, bid, ask = parse_line(line)
                # add  data to list
                data.append({'Timestamp': timestamp, 'Exchange': exchange, 'Bid': bid, 'Ask': ask, 'File': filename})

# Create dataframe from list
df = pd.DataFrame(data)

## Seperate Bid and Ask Columns into Rows

In [None]:
# Columns to melt
columns_to_melt = ['Bid', 'Ask']

# Melt the specified columns
melted_df = df.melt(id_vars=['Timestamp', 'Exchange', 'File'], value_vars=columns_to_melt,
                    var_name='Order Type', value_name='Value')

Unnamed: 0,Timestamp,Exchange,File,Order Type,Value
0,0.000,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,[]
1,1.612,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,"[[1, 1]]"
2,2.170,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,"[[1, 1]]"
3,2.449,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,"[[2, 1]]"
4,2.945,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,"[[3, 1]]"
...,...,...,...,...,...
82477381,30599.697,Exch0,UoB_Set01_2025-05-06LOBs.txt,Ask,"[[211, 3], [412, 3], [580, 5], [589, 5], [642,..."
82477382,30599.728,Exch0,UoB_Set01_2025-05-06LOBs.txt,Ask,"[[211, 3], [412, 3], [580, 5], [636, 5], [642,..."
82477383,30599.790,Exch0,UoB_Set01_2025-05-06LOBs.txt,Ask,"[[211, 3], [333, 3], [412, 3], [580, 5], [636,..."
82477384,30599.821,Exch0,UoB_Set01_2025-05-06LOBs.txt,Ask,"[[211, 3], [333, 3], [412, 3], [580, 5], [636,..."


## Reshaping the dataframe to show one bid and ask per row

In [None]:
df = melted_df.explode('Value')

Unnamed: 0,Timestamp,Exchange,File,Order Type,Value
0,0.000,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,
1,1.612,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,"[1, 1]"
2,2.170,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,"[1, 1]"
3,2.449,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,"[2, 1]"
4,2.945,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,"[3, 1]"
...,...,...,...,...,...
82477385,30599.945,Exch0,UoB_Set01_2025-05-06LOBs.txt,Ask,"[211, 3]"
82477385,30599.945,Exch0,UoB_Set01_2025-05-06LOBs.txt,Ask,"[333, 3]"
82477385,30599.945,Exch0,UoB_Set01_2025-05-06LOBs.txt,Ask,"[580, 5]"
82477385,30599.945,Exch0,UoB_Set01_2025-05-06LOBs.txt,Ask,"[605, 3]"


## Separate Price and Quantity from Values column

In [None]:
df[['Price', 'Quantity']] = pd.DataFrame(df['Value'].apply(lambda x: x if isinstance(x, list) else []).tolist(), index=df.index)
df.drop(columns=['Value'], inplace=True)

Unnamed: 0,Timestamp,Exchange,File,Order Type,Price,Quantity
0,0.000,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,,
1,1.612,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,1.0,1.0
2,2.170,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,1.0,1.0
3,2.449,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,2.0,1.0
4,2.945,Exch0,UoB_Set01_2025-05-13LOBs.txt,Bid,3.0,1.0
...,...,...,...,...,...,...
82477385,30599.945,Exch0,UoB_Set01_2025-05-06LOBs.txt,Ask,211.0,3.0
82477385,30599.945,Exch0,UoB_Set01_2025-05-06LOBs.txt,Ask,333.0,3.0
82477385,30599.945,Exch0,UoB_Set01_2025-05-06LOBs.txt,Ask,580.0,5.0
82477385,30599.945,Exch0,UoB_Set01_2025-05-06LOBs.txt,Ask,605.0,3.0


## Extract Date from file column and store it as a Datetime in a Date coloumn

In [None]:
# Extract date from 'File' column and add it as a new column
df['Date'] = df['File'].str.extract(r'(\d{4}-\d{2}-\d{2})')
# Convert the 'Date' column to datetime type
df['Date'] = pd.to_datetime(df['Date'])
df.drop(columns=['File'], inplace=True)
df

Unnamed: 0,Timestamp,Exchange,Order Type,Price,Quantity,Date
0,0.000,Exch0,Bid,,,2025-05-13
1,1.612,Exch0,Bid,1.0,1.0,2025-05-13
2,2.170,Exch0,Bid,1.0,1.0,2025-05-13
3,2.449,Exch0,Bid,2.0,1.0,2025-05-13
4,2.945,Exch0,Bid,3.0,1.0,2025-05-13
...,...,...,...,...,...,...
82477385,30599.945,Exch0,Ask,211.0,3.0,2025-05-06
82477385,30599.945,Exch0,Ask,333.0,3.0,2025-05-06
82477385,30599.945,Exch0,Ask,580.0,5.0,2025-05-06
82477385,30599.945,Exch0,Ask,605.0,3.0,2025-05-06


In [None]:
df.to_csv('data/lob/full_lob.csv')