In [14]:
import pandas as pd
import numpy as np
import re
import os
import ast

In [15]:
def parse_line_corrected(line):
    """Parse one raw snapshot line into a flat record.

    The line is expected to be a Python-style list literal whose second
    element is an exchange identifier written without quotes (``Exch0``).
    The identifier is quoted so the line becomes a valid literal, then the
    line is evaluated with ``ast.literal_eval``.

    Parameters
    ----------
    line : str
        One line of the raw file; a trailing newline is fine.

    Returns
    -------
    dict
        ``'timestamp'`` (element 0), ``'exchange'`` (element 1), ``'bids'``
        (second item of the first entry of element 2) and ``'asks'``
        (second item of the second entry of element 2).

    Raises
    ------
    ValueError, SyntaxError
        If the text is still not a valid literal after quoting.
    IndexError, TypeError
        If the parsed value does not have the nesting described above.
    """
    # Quote only *bare* exchange tokens. The look-arounds skip tokens that
    # are already quoted (a blanket str.replace would turn 'Exch0' into
    # ''Exch0'' and break parsing) and tokens embedded in a longer word.
    # Matching Exch<digits> also covers exchange ids other than Exch0.
    quoted = re.sub(r"(?<![\w'\"])(Exch\d+)(?![\w'\"])", r"'\1'", line)
    data = ast.literal_eval(quoted)
    return {
        'timestamp': data[0],
        'exchange': data[1],
        'bids': data[2][0][1],
        'asks': data[2][1][1]
    }

In [16]:
def process_row(row):
    """Pick the top-of-book levels for one snapshot.

    Bids are ranked from highest to lowest price and asks from lowest to
    highest price (the price being element 0 of each level); at most the
    first two levels of each ranking are kept.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'bids'`` and ``'asks'`` (a dict or a
        DataFrame row), each holding a sequence of levels.

    Returns
    -------
    tuple
        ``(best_bids, best_asks)``; a side that is empty (or otherwise
        falsy) yields an empty list.
    """
    def price_of(level):
        return level[0]

    best_bids = []
    if row['bids']:
        ranked_bids = list(row['bids'])
        ranked_bids.sort(key=price_of, reverse=True)
        best_bids = ranked_bids[:2]

    best_asks = []
    if row['asks']:
        ranked_asks = list(row['asks'])
        ranked_asks.sort(key=price_of)
        best_asks = ranked_asks[:2]

    return best_bids, best_asks

In [17]:
def remove_max_bid_outliers(bids, mean_price, std_price, threshold=3):
    """Tell whether a row's highest bid price lies within the z-score threshold.

    Despite the name, nothing is removed here: this is a keep/drop predicate
    meant to be used as a row filter.

    Parameters
    ----------
    bids : sequence
        Levels whose element 0 is the price.
    mean_price, std_price : float
        Mean and standard deviation the price is compared against.
    threshold : float, default 3
        Largest absolute z-score still considered normal (inclusive).

    Returns
    -------
    bool
        True if the row should be kept, False if it is an outlier. Rows
        without bids are always kept.
    """
    if not bids:
        return True
    max_bid_price = max(bid[0] for bid in bids)
    deviation = abs(max_bid_price - mean_price)
    if std_price == 0:
        # Zero spread: dividing would raise ZeroDivisionError for Python
        # floats, and for NumPy floats 0/0 is nan, which compares False and
        # would reject every row. With no spread, only a price exactly at
        # the mean is "not an outlier".
        return deviation == 0
    return deviation / std_price <= threshold

In [18]:
def remove_min_bid_outliers(bids, mean_min_price, std_min_price, threshold=3):
    """Tell whether a row's lowest bid price lies within the z-score threshold.

    Despite the name, nothing is removed here: this is a keep/drop predicate
    meant to be used as a row filter.

    Parameters
    ----------
    bids : sequence
        Levels whose element 0 is the price.
    mean_min_price, std_min_price : float
        Mean and standard deviation the price is compared against.
    threshold : float, default 3
        Largest absolute z-score still considered normal (inclusive).

    Returns
    -------
    bool
        True if the row should be kept, False if it is an outlier. Rows
        without bids are always kept.
    """
    if not bids:
        return True
    min_bid_price = min(bid[0] for bid in bids)
    deviation = min_bid_price - mean_min_price
    if std_min_price == 0:
        # Zero spread: dividing would raise ZeroDivisionError for Python
        # floats, and for NumPy floats 0/0 is nan, which compares False and
        # would reject every row. With no spread, only a price exactly at
        # the mean is "not an outlier".
        return deviation == 0
    return abs(deviation / std_min_price) <= threshold

In [19]:
def remove_min_ask_outliers(asks, mean_price, std_price, threshold=3):
    """Tell whether a row's lowest ask price lies within the z-score threshold.

    Despite the name, nothing is removed here: this is a keep/drop predicate
    meant to be used as a row filter.

    Parameters
    ----------
    asks : sequence
        Levels whose element 0 is the price.
    mean_price, std_price : float
        Mean and standard deviation the price is compared against.
    threshold : float, default 3
        Largest absolute z-score still considered normal (inclusive).

    Returns
    -------
    bool
        True if the row should be kept, False if it is an outlier. Rows
        without asks are always kept.
    """
    if not asks:
        return True  # Nothing to judge, so the row cannot be an outlier
    min_ask_price = min(ask[0] for ask in asks)
    deviation = min_ask_price - mean_price
    if std_price == 0:
        # Zero spread: dividing would raise ZeroDivisionError for Python
        # floats, and for NumPy floats 0/0 is nan, which compares False and
        # would reject every row. With no spread, only a price exactly at
        # the mean is "not an outlier".
        return deviation == 0
    return abs(deviation / std_price) <= threshold

In [20]:
def remove_max_ask_outlier(asks, mean_price, std_price, threshold=3):
    """Tell whether a row's highest ask price lies within the z-score threshold.

    Despite the name, nothing is removed here: this is a keep/drop predicate
    meant to be used as a row filter.

    Parameters
    ----------
    asks : sequence
        Levels whose element 0 is the price.
    mean_price, std_price : float
        Mean and standard deviation the price is compared against.
    threshold : float, default 3
        Largest absolute z-score still considered normal (inclusive).

    Returns
    -------
    bool
        True if the row should be kept, False if it is an outlier. Rows
        without asks are always kept.
    """
    if not asks:
        return True
    max_ask_price = max(ask[0] for ask in asks)
    deviation = max_ask_price - mean_price
    if std_price == 0:
        # Zero spread: dividing would raise ZeroDivisionError for Python
        # floats, and for NumPy floats 0/0 is nan, which compares False and
        # would reject every row. With no spread, only a price exactly at
        # the mean is "not an outlier".
        return deviation == 0
    return abs(deviation / std_price) <= threshold

In [21]:
def split_bids_asks(row):
    """Flatten the first two bid and ask levels of a row into eight scalars.

    Each of ``row['bids'][0]``, ``row['bids'][1]``, ``row['asks'][0]`` and
    ``row['asks'][1]`` must unpack into exactly ``(price, size)``.

    Returns
    -------
    pandas.Series
        Positional values in the order: bid price 1, bid price 2,
        bid size 1, bid size 2, ask price 1, ask price 2, ask size 1,
        ask size 2.
    """
    flattened = []
    for side in ('bids', 'asks'):
        prices, sizes = [], []
        for depth in (0, 1):
            price, size = row[side][depth]
            prices.append(price)
            sizes.append(size)
        # Per side, both prices come first, then both sizes.
        flattened.extend(prices + sizes)
    return pd.Series(flattened)

In [24]:
def _drop_price_outliers(frame, column, pick_extreme, keep_row, threshold=3):
    """Drop rows whose extreme price in ``column`` fails ``keep_row``'s z-score test.

    The mean and standard deviation are computed over the per-row extreme
    price (``pick_extreme`` is ``min`` or ``max``) of the frame passed in,
    i.e. of the rows that survived every earlier stage.
    """
    if frame.empty:
        return frame
    extremes = [pick_extreme(level[0] for level in levels)
                for levels in frame[column] if levels]
    if not extremes:
        return frame
    mean_price = np.mean(extremes)
    std_price = np.std(extremes)
    if std_price == 0:
        # Every row has the same extreme price, so none of them can be an
        # outlier; also avoids a 0/0 -> nan comparison rejecting all rows.
        return frame
    keep = frame[column].apply(
        lambda levels: keep_row(levels, mean_price, std_price, threshold)
    )
    return frame[keep].reset_index(drop=True)


def lob_clean(file_path):
    """Clean one raw order-book text file and write the result as CSV.

    Steps:

    1. Parse every non-blank line with ``parse_line_corrected``.
    2. Reduce each row to its two best bids and two best asks
       (``process_row``) and drop rows with fewer than two levels on
       either side.
    3. Drop price outliers (|z-score| > 3) in four successive stages:
       max bid, min bid, min ask, max ask. Each stage derives its mean and
       standard deviation from the rows that survived the previous stages.
    4. Expand the remaining levels into the scalar columns
       ``bid_price1 .. ask_size2`` (``split_bids_asks``) and drop the list
       columns.
    5. Write the frame (index included) with ``DataFrame.to_csv`` to
       ``"Clean_" + <name>`` relative to the current working directory,
       where ``<name>`` is the ``UoB...txt`` part of the input file name,
       or the whole file name when it does not match that pattern.

    Parameters
    ----------
    file_path : str
        Path of the raw text file.

    Returns
    -------
    None
        The result is only written to disk. If no row survives, a
        header-only CSV is written.
    """
    records = []
    with open(file_path, 'r') as file:
        for line in file:
            # Blank lines (e.g. a trailing one) are not parseable literals.
            if line.strip():
                records.append(parse_line_corrected(line))

    # Naming the columns keeps the frame well-formed even without records.
    book = pd.DataFrame(records, columns=['exchange', 'timestamp', 'bids', 'asks'])

    if not book.empty:
        book[['bids', 'asks']] = book.apply(process_row, axis=1, result_type='expand')
        deep_enough = (book['bids'].apply(len) >= 2) & (book['asks'].apply(len) >= 2)
        book = book[deep_enough].reset_index(drop=True)

    # One stage per (side, extreme). Previously the min-ask stage took its
    # statistics from the frame *before* any outlier removal, unlike the
    # other three stages; all stages now use the current frame.
    stages = (
        ('bids', max, remove_max_bid_outliers),
        ('bids', min, remove_min_bid_outliers),
        ('asks', min, remove_min_ask_outliers),
        ('asks', max, remove_max_ask_outlier),
    )
    for column, pick_extreme, keep_row in stages:
        book = _drop_price_outliers(book, column, pick_extreme, keep_row)

    level_columns = ['bid_price1', 'bid_price2', 'bid_size1', 'bid_size2',
                     'ask_price1', 'ask_price2', 'ask_size1', 'ask_size2']
    if book.empty:
        # apply(axis=1) on an empty frame cannot be expanded into the eight
        # columns, so just add them empty.
        cleaned = book.reindex(columns=[*book.columns, *level_columns])
    else:
        cleaned = book.copy()
        cleaned[level_columns] = cleaned.apply(split_bids_asks, axis=1)
    cleaned = cleaned.drop(columns=['bids', 'asks'])

    # Match against the file name only, so a directory component can never
    # leak into the output name, and fall back to the plain file name
    # instead of raising IndexError when the pattern does not match.
    base_name = os.path.basename(file_path)
    match = re.search(r'UoB.*?\.txt', base_name)
    file_name = "Clean_" + (match.group(0) if match else base_name)

    cleaned.to_csv(file_name)

In [27]:
def lob_clean_all(directory):
    """Run ``lob_clean`` on every raw ``UoB...txt`` file directly inside ``directory``.

    Entries are visited in sorted name order so repeated runs process the
    files in the same sequence. The following entries are skipped:

    * anything that is not a regular file (e.g. ``.ipynb_checkpoints``);
    * names that do not contain the ``UoB...txt`` pattern ``lob_clean``
      relies on to name its output;
    * names starting with ``Clean_``, i.e. outputs of an earlier run when
      ``directory`` is also the working directory ``lob_clean`` writes to.

    Parameters
    ----------
    directory : str
        Directory holding the raw files; sub-directories are not descended
        into.

    Returns
    -------
    None
    """
    raw_name = re.compile(r'UoB.*?\.txt')

    for name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, name)
        if not os.path.isfile(file_path):
            continue
        if name.startswith("Clean_") or not raw_name.search(name):
            continue
        lob_clean(file_path)