In [1]:
import pandas as pd
import numpy as np
import ast

In [2]:
# Input text file; each of its lines is parsed into one timestamped bids/asks record
file_path = 'UoB_Set01_2025-05-16LOBs.txt'

In [3]:
# Parsing single-line text
def parse_line_corrected(line):
    """Parse one text line of the input file into a flat record.

    The line must be a Python-style list literal whose element 0 is the
    timestamp, element 1 is the exchange name and element 2 holds two pairs;
    the second item of the first pair is taken as the bids and the second
    item of the second pair as the asks.

    The exchange name is written bare in the file (e.g. ``Exch0``), which is
    not a valid literal, so any unquoted ``Exch<digits>`` token is wrapped in
    quotes before evaluation. Tokens that are already quoted are left alone.

    Args:
        line: a single line of text, with or without a trailing newline.

    Returns:
        dict with the keys ``'timestamp'``, ``'exchange'``, ``'bids'`` and
        ``'asks'``.

    Raises:
        SyntaxError / ValueError: if the line is not a valid literal.
        IndexError / TypeError: if the literal does not have the expected shape.
    """
    import re

    # Quote bare exchange identifiers only: the lookarounds skip names that
    # already sit inside quotes or that are part of a longer word.
    quoted = re.sub(r"(?<![\w'\"])(Exch\d+)(?![\w'\"])", r"'\1'", line)
    data = ast.literal_eval(quoted)
    return {
        'timestamp': data[0],
        'exchange': data[1],
        'bids': data[2][0][1],
        'asks': data[2][1][1]
    }

In [4]:
# Parse the input file line by line. Blank or whitespace-only lines are
# skipped because they are not valid literals and would make the parser raise.
with open(file_path, 'r') as file:
    parsed_data_corrected = [
        parse_line_corrected(line) for line in file if line.strip()
    ]

In [5]:
# One row per parsed line; the columns come from the keys of the parsed dicts
df_corrected = pd.DataFrame(parsed_data_corrected)

In [12]:
# Reorder the columns so the exchange label comes first, then preview the first rows
df_corrected = df_corrected[['exchange', 'timestamp', 'bids', 'asks']]
df_corrected.head()

Unnamed: 0,exchange,timestamp,bids,asks
0,Exch0,0.0,[],[]
1,Exch0,2.294,[],"[[158, 5]]"
2,Exch0,2.604,"[[1, 9]]","[[158, 5]]"
3,Exch0,3.317,"[[2, 9]]","[[158, 5]]"
4,Exch0,3.968,"[[3, 4], [2, 9]]","[[158, 5]]"


In [9]:
# Sort bids and asks and take the top two
def process_row(row, depth=2):
    """Return the best ``depth`` bid and ask levels of one row.

    Each level is indexed as ``level[0]`` for its price. Bids are ordered by
    price from highest to lowest, asks from lowest to highest; levels with
    equal prices keep their original relative order.

    Args:
        row: mapping (e.g. a DataFrame row) with ``'bids'`` and ``'asks'``
            entries, each a list of levels. An empty or falsy entry yields
            an empty list for that side.
        depth: number of levels to keep per side (default 2).

    Returns:
        Tuple ``(bids_sorted, asks_sorted)`` of new lists, each holding at
        most ``depth`` levels.
    """
    bids = row['bids'] or []
    asks = row['asks'] or []
    bids_sorted = sorted(bids, key=lambda level: level[0], reverse=True)[:depth]
    asks_sorted = sorted(asks, key=lambda level: level[0])[:depth]
    return bids_sorted, asks_sorted

In [14]:
# Replace each side of the book with its sorted top levels, then keep only
# the rows that still have at least two levels on both sides.
top_levels = df_corrected.apply(process_row, axis=1, result_type='expand')
df_corrected[['bids', 'asks']] = top_levels

has_two_bids = df_corrected['bids'].apply(len) >= 2
has_two_asks = df_corrected['asks'].apply(len) >= 2
df_filtered = df_corrected[has_two_bids & has_two_asks].reset_index(drop=True)
df_filtered.head()

Unnamed: 0,exchange,timestamp,bids,asks
0,Exch0,4.309,"[[4, 4], [2, 9]]","[[158, 5], [532, 1]]"
1,Exch0,4.433,"[[5, 9], [4, 4]]","[[158, 5], [532, 1]]"
2,Exch0,4.557,"[[5, 9], [4, 4]]","[[157, 3], [158, 5]]"
3,Exch0,4.743,"[[142, 1], [5, 9]]","[[157, 3], [158, 5]]"
4,Exch0,4.805,"[[142, 1], [5, 9]]","[[157, 3], [158, 5]]"


In [36]:
# Pool the price of every bid level in the filtered frame and take the
# mean and standard deviation over that pool.
all_bid_prices = [
    level[0]
    for levels in df_filtered['bids']
    for level in levels
]
mean_price = np.mean(all_bid_prices)
std_price = np.std(all_bid_prices)

def remove_bids_outliers(bids, threshold=2, mean=None, std=None):
    """Return True when every bid price is within ``threshold`` standard deviations.

    Despite its name, this function does not remove anything: it is a
    predicate meant to be used as a row filter.

    Args:
        bids: list of levels; ``level[0]`` is the price.
        threshold: largest absolute z-score still accepted (inclusive).
        mean: reference mean; defaults to the module-level ``mean_price``.
        std: reference standard deviation; defaults to the module-level
            ``std_price``.

    Returns:
        False when ``bids`` is empty, otherwise whether the z-score of every
        price is at most ``threshold``.
    """
    if not bids:
        return False

    # The globals are only looked up when no explicit statistics are given.
    mean = mean_price if mean is None else mean
    std = std_price if std is None else std

    z_scores = []
    for bid in bids:
        deviation = abs(bid[0] - mean)
        if std == 0:
            # Zero spread: dividing would give nan/inf (or raise), so a price
            # equal to the mean scores 0 and any other price scores infinity.
            z_scores.append(0.0 if deviation == 0 else float('inf'))
        else:
            z_scores.append(deviation / std)

    return all(z <= threshold for z in z_scores)

# Evaluate the bid check once per row (threshold of 2) to get a boolean mask
bids_in_range = df_filtered['bids'].apply(remove_bids_outliers, args=(2,))

# Keep only the rows that passed and renumber them from zero
df_bids_outliers = df_filtered[bids_in_range].reset_index(drop=True)

# Preview the first rows of the filtered frame
df_bids_outliers.head()


Unnamed: 0,exchange,timestamp,bids,asks
0,Exch0,5.611,"[[128, 4], [127, 1]]","[[134, 3], [135, 1]]"
1,Exch0,6.851,"[[129, 4], [128, 1]]","[[130, 3], [132, 1]]"
2,Exch0,6.882,"[[129, 4], [128, 1]]","[[130, 3], [132, 1]]"
3,Exch0,6.944,"[[129, 1], [128, 1]]","[[132, 1], [134, 3]]"
4,Exch0,7.161,"[[129, 1], [128, 1]]","[[132, 5], [134, 3]]"


In [48]:
# Pool the price of every ask level left after the bid filter and take the
# mean and standard deviation over that pool.
all_ask_prices = [
    level[0]
    for levels in df_bids_outliers['asks']
    for level in levels
]
mean_price2 = np.mean(all_ask_prices)
std_price2 = np.std(all_ask_prices)

def remove_all_outliers(asks, threshold=2, price_limit=200, mean=None, std=None):
    """Return True when every ask price passes both the z-score and the price cap.

    Despite its name, this function does not remove anything: it is a
    predicate meant to be used as a row filter.

    Args:
        asks: list of levels; ``level[0]`` is the price.
        threshold: largest absolute z-score still accepted (inclusive).
        price_limit: largest price still accepted (inclusive).
        mean: reference mean; defaults to the module-level ``mean_price2``.
        std: reference standard deviation; defaults to the module-level
            ``std_price2``.

    Returns:
        False when ``asks`` is empty, otherwise whether every price has a
        z-score of at most ``threshold`` and is at most ``price_limit``.
    """
    if not asks:
        return False

    # The globals are only looked up when no explicit statistics are given.
    mean = mean_price2 if mean is None else mean
    std = std_price2 if std is None else std

    z_scores = []
    for ask in asks:
        deviation = abs(ask[0] - mean)
        if std == 0:
            # Zero spread: dividing would give nan/inf (or raise), so a price
            # equal to the mean scores 0 and any other price scores infinity.
            z_scores.append(0.0 if deviation == 0 else float('inf'))
        else:
            z_scores.append(deviation / std)

    return all(z <= threshold for z in z_scores) and all(ask[0] <= price_limit for ask in asks)

# Evaluate the ask check once per row (threshold of 2, price cap of 200)
asks_in_range = df_bids_outliers['asks'].apply(remove_all_outliers, args=(2, 200))

# Keep only the rows that passed and renumber them from zero
df_no_outliers = df_bids_outliers[asks_in_range].reset_index(drop=True)

# Preview the first rows of the filtered frame
df_no_outliers.head()

Unnamed: 0,exchange,timestamp,bids,asks
0,Exch0,5.611,"[[128, 4], [127, 1]]","[[134, 3], [135, 1]]"
1,Exch0,6.851,"[[129, 4], [128, 1]]","[[130, 3], [132, 1]]"
2,Exch0,6.882,"[[129, 4], [128, 1]]","[[130, 3], [132, 1]]"
3,Exch0,6.944,"[[129, 1], [128, 1]]","[[132, 1], [134, 3]]"
4,Exch0,7.161,"[[129, 1], [128, 1]]","[[132, 5], [134, 3]]"


In [51]:
def split_bids_asks(row):
    """Flatten the first two bid and ask levels of a row into eight values.

    Each level is unpacked as a ``(price, size)`` pair. The returned Series
    is ordered as: bid price 1, bid price 2, bid size 1, bid size 2,
    ask price 1, ask price 2, ask size 1, ask size 2.
    """
    bid_levels = row['bids']
    top_bid_price, top_bid_size = bid_levels[0]
    next_bid_price, next_bid_size = bid_levels[1]

    ask_levels = row['asks']
    top_ask_price, top_ask_size = ask_levels[0]
    next_ask_price, next_ask_size = ask_levels[1]

    flattened = [
        top_bid_price, next_bid_price, top_bid_size, next_bid_size,
        top_ask_price, next_ask_price, top_ask_size, next_ask_size,
    ]
    return pd.Series(flattened)

# Work on a copy: a plain assignment would only alias df_no_outliers, so the
# column additions and the drop below would also change that frame and the
# cell could not be re-run (its 'bids'/'asks' columns would already be gone).
df_final_filtered = df_no_outliers.copy()

# Apply the function to each row of the DataFrame and create new columns
level_columns = ['bid_price1', 'bid_price2', 'bid_size1', 'bid_size2', 'ask_price1', 'ask_price2', 'ask_size1', 'ask_size2']
df_final_filtered[level_columns] = df_final_filtered.apply(split_bids_asks, axis=1)
df_final_filtered = df_final_filtered.drop(columns=['bids', 'asks'])

# Display the first few rows of the DataFrame to verify the changes
df_final_filtered.head()

Unnamed: 0,exchange,timestamp,bid_price1,bid_price2,bid_size1,bid_size2,ask_price1,ask_price2,ask_size1,ask_size2
0,Exch0,5.611,128,127,4,1,134,135,3,1
1,Exch0,6.851,129,128,4,1,130,132,3,1
2,Exch0,6.882,129,128,4,1,130,132,3,1
3,Exch0,6.944,129,128,1,1,132,134,1,3
4,Exch0,7.161,129,128,1,1,132,134,5,3
