In [99]:
import pandas as pd
import numpy as np
import ast

In [100]:
file_path = 'UoB_Set01_2025-05-16LOBs.txt'

In [101]:
# Parsing single-line text
def parse_line_corrected(line):
    line = line.replace('Exch0', "'Exch0'")
    data = ast.literal_eval(line)
    return {
        'timestamp': data[0],
        'exchange': data[1],
        'bids': data[2][0][1],
        'asks': data[2][1][1]
    }

In [102]:
# Parses incoming text line by line
parsed_data_corrected = []
with open(file_path, 'r') as file:
    for line in file:
        parsed_data_corrected.append(parse_line_corrected(line))

In [103]:
df_corrected = pd.DataFrame(parsed_data_corrected)

In [104]:
df_corrected = df_corrected[['exchange', 'timestamp', 'bids', 'asks']]
df_corrected.head()

Unnamed: 0,exchange,timestamp,bids,asks
0,Exch0,0.0,[],[]
1,Exch0,2.294,[],"[[158, 5]]"
2,Exch0,2.604,"[[1, 9]]","[[158, 5]]"
3,Exch0,3.317,"[[2, 9]]","[[158, 5]]"
4,Exch0,3.968,"[[3, 4], [2, 9]]","[[158, 5]]"


In [105]:
# Sort bids and asks and take the top two
def process_row(row):
    bids_sorted = sorted(row['bids'], key=lambda x: x[0], reverse=True)[:2] if row['bids'] else []
    asks_sorted = sorted(row['asks'], key=lambda x: x[0])[:2] if row['asks'] else []
    return bids_sorted, asks_sorted

In [106]:
df_corrected[['bids', 'asks']] = df_corrected.apply(lambda row: process_row(row), axis=1, result_type='expand')
df_filtered = (df_corrected[(df_corrected['bids'].apply(len) >= 2) & (df_corrected['asks'].apply(len) >= 2)]).reset_index(drop=True)
df_filtered.head()

Unnamed: 0,exchange,timestamp,bids,asks
0,Exch0,4.309,"[[4, 4], [2, 9]]","[[158, 5], [532, 1]]"
1,Exch0,4.433,"[[5, 9], [4, 4]]","[[158, 5], [532, 1]]"
2,Exch0,4.557,"[[5, 9], [4, 4]]","[[157, 3], [158, 5]]"
3,Exch0,4.743,"[[142, 1], [5, 9]]","[[157, 3], [158, 5]]"
4,Exch0,4.805,"[[142, 1], [5, 9]]","[[157, 3], [158, 5]]"


In [107]:
def remove_max_bid_outliers(bids, mean_price, std_price, threshold=3):
    if not bids:
        return True  
    # Calculate z-score for the maximum bid price
    max_bid_price = max(bid[0] for bid in bids)
    z_score = abs(max_bid_price - mean_price) / std_price
    
    # Return True if the z-score is within the threshold, indicating it's not an outlier
    return z_score <= threshold

# Calculate the mean and standard deviation for the maximum bid prices
max_bid_prices = [max(bid[0] for bid in bids_list) for bids_list in df_filtered['bids'] if bids_list]
mean_max_price = np.mean(max_bid_prices)
std_max_price = np.std(max_bid_prices)

# Filter the DataFrame rows
df_filtered_no_max_outliers = df_filtered[df_filtered['bids'].apply(
    lambda bids: remove_max_bid_outliers(bids, mean_max_price, std_max_price, 3)
)]

# Reset the index after the filter operation
df_filtered_no_max_outliers = df_filtered_no_max_outliers.reset_index(drop=True)

df_filtered_no_max_outliers

Unnamed: 0,exchange,timestamp,bids,asks
0,Exch0,4.743,"[[142, 1], [5, 9]]","[[157, 3], [158, 5]]"
1,Exch0,4.805,"[[142, 1], [5, 9]]","[[157, 3], [158, 5]]"
2,Exch0,4.836,"[[142, 1], [5, 9]]","[[156, 3], [158, 5]]"
3,Exch0,5.456,"[[127, 1], [81, 1]]","[[135, 1], [155, 3]]"
4,Exch0,5.487,"[[127, 1], [81, 1]]","[[134, 3], [135, 1]]"
...,...,...,...,...
317338,Exch0,30599.542,"[[132, 5], [131, 2]]","[[141, 1], [142, 1]]"
317339,Exch0,30599.790,"[[132, 5], [131, 2]]","[[141, 1], [142, 1]]"
317340,Exch0,30599.821,"[[132, 5], [131, 2]]","[[141, 1], [142, 1]]"
317341,Exch0,30599.945,"[[132, 5], [131, 2]]","[[141, 1], [444, 5]]"


In [108]:
def remove_min_bid_outliers(bids, mean_min_price, std_min_price, threshold=3):
    if not bids:
        return True
    min_bid_price = min(bid[0] for bid in bids)
    z_score = (min_bid_price - mean_min_price) / std_min_price
    return abs(z_score) <= threshold

# Calculate mean and standard deviation for the minimum bid prices across the DataFrame
min_bid_prices = [min(bid[0] for bid in bids_list) for bids_list in df_filtered_no_max_outliers['bids'] if bids_list]
mean_min_price = np.mean(min_bid_prices)
std_min_price = np.std(min_bid_prices)

# Apply the outlier removal function to each row
df_filtered_no_min_outliers = df_filtered_no_max_outliers[df_filtered_no_max_outliers['bids'].apply(
    lambda bids: remove_min_bid_outliers(bids, mean_min_price, std_min_price, 3)
)]

# Reset index
df_filtered_no_min_outliers.reset_index(drop=True, inplace=True)

# Display the first few rows
df_filtered_no_min_outliers

Unnamed: 0,exchange,timestamp,bids,asks
0,Exch0,5.611,"[[128, 4], [127, 1]]","[[134, 3], [135, 1]]"
1,Exch0,6.851,"[[129, 4], [128, 1]]","[[130, 3], [132, 1]]"
2,Exch0,6.882,"[[129, 4], [128, 1]]","[[130, 3], [132, 1]]"
3,Exch0,6.944,"[[129, 1], [128, 1]]","[[132, 1], [134, 3]]"
4,Exch0,7.161,"[[129, 1], [128, 1]]","[[132, 5], [134, 3]]"
...,...,...,...,...
308497,Exch0,30599.542,"[[132, 5], [131, 2]]","[[141, 1], [142, 1]]"
308498,Exch0,30599.790,"[[132, 5], [131, 2]]","[[141, 1], [142, 1]]"
308499,Exch0,30599.821,"[[132, 5], [131, 2]]","[[141, 1], [142, 1]]"
308500,Exch0,30599.945,"[[132, 5], [131, 2]]","[[141, 1], [444, 5]]"


In [109]:
def remove_min_ask_outliers(asks, mean_price, std_price, threshold=3):
    if not asks:
        return True  # Keep rows with no asks as they're not outliers
    min_ask_price = min(ask[0] for ask in asks)
    z_score = (min_ask_price - mean_price) / std_price
    return abs(z_score) <= threshold

# Calculate mean and standard deviation for minimum ask prices
min_ask_prices = [min(ask[0] for ask in asks_list) for asks_list in df_filtered['asks'] if asks_list]
mean_min_ask_price = np.mean(min_ask_prices)
std_min_ask_price = np.std(min_ask_prices)

# Apply the function to filter rows based on minimum ask price outliers
df_filtered_no_min_ask_outliers = df_filtered_no_min_outliers[df_filtered_no_min_outliers['asks'].apply(
    lambda asks: remove_min_ask_outliers(asks, mean_min_ask_price, std_min_ask_price, 3)
)]

# Reset the index after filtering
df_filtered_no_min_ask_outliers.reset_index(drop=True, inplace=True)

df_filtered_no_min_ask_outliers

Unnamed: 0,exchange,timestamp,bids,asks
0,Exch0,5.611,"[[128, 4], [127, 1]]","[[134, 3], [135, 1]]"
1,Exch0,6.851,"[[129, 4], [128, 1]]","[[130, 3], [132, 1]]"
2,Exch0,6.882,"[[129, 4], [128, 1]]","[[130, 3], [132, 1]]"
3,Exch0,6.944,"[[129, 1], [128, 1]]","[[132, 1], [134, 3]]"
4,Exch0,7.161,"[[129, 1], [128, 1]]","[[132, 5], [134, 3]]"
...,...,...,...,...
300088,Exch0,30599.542,"[[132, 5], [131, 2]]","[[141, 1], [142, 1]]"
300089,Exch0,30599.790,"[[132, 5], [131, 2]]","[[141, 1], [142, 1]]"
300090,Exch0,30599.821,"[[132, 5], [131, 2]]","[[141, 1], [142, 1]]"
300091,Exch0,30599.945,"[[132, 5], [131, 2]]","[[141, 1], [444, 5]]"


In [110]:
def remove_max_ask_outlier(asks, mean_price, std_price, threshold=3):
    if not asks:
        return True
    max_ask_price = max(ask[0] for ask in asks)
    z_score = (max_ask_price - mean_price) / std_price
    return abs(z_score) <= threshold

# Calculate mean and standard deviation for maximum ask prices
max_ask_prices = [max(ask[0] for ask in asks_list) for asks_list in df_filtered_no_min_ask_outliers['asks'] if asks_list]
mean_max_ask_price = np.mean(max_ask_prices)
std_max_ask_price = np.std(max_ask_prices)

# Apply the function to filter rows based on maximum ask price outliers
df_filtered_no_max_ask_outliers = df_filtered_no_min_ask_outliers[df_filtered_no_min_ask_outliers['asks'].apply(
    lambda asks: remove_max_ask_outlier(asks, mean_max_ask_price, std_max_ask_price, 3)
)]

# Reset the index after filtering
df_filtered_no_max_ask_outliers.reset_index(drop=True, inplace=True)

df_filtered_no_max_ask_outliers

Unnamed: 0,exchange,timestamp,bids,asks
0,Exch0,5.611,"[[128, 4], [127, 1]]","[[134, 3], [135, 1]]"
1,Exch0,6.851,"[[129, 4], [128, 1]]","[[130, 3], [132, 1]]"
2,Exch0,6.882,"[[129, 4], [128, 1]]","[[130, 3], [132, 1]]"
3,Exch0,6.944,"[[129, 1], [128, 1]]","[[132, 1], [134, 3]]"
4,Exch0,7.161,"[[129, 1], [128, 1]]","[[132, 5], [134, 3]]"
...,...,...,...,...
294447,Exch0,30599.542,"[[132, 5], [131, 2]]","[[141, 1], [142, 1]]"
294448,Exch0,30599.790,"[[132, 5], [131, 2]]","[[141, 1], [142, 1]]"
294449,Exch0,30599.821,"[[132, 5], [131, 2]]","[[141, 1], [142, 1]]"
294450,Exch0,30599.945,"[[132, 5], [131, 2]]","[[141, 1], [444, 5]]"


In [111]:
def split_bids_asks(row):
    # Splitting bids into prices and sizes
    bid_price1, bid_size1 = row['bids'][0]
    bid_price2, bid_size2 = row['bids'][1]
    ask_price1, ask_size1 = row['asks'][0]
    ask_price2, ask_size2 = row['asks'][1]  
    return pd.Series([bid_price1, bid_price2, bid_size1, bid_size2, ask_price1, ask_price2, ask_size1, ask_size2])

# Apply the function to each row of the DataFrame and create new columns
df_final_filtered = df_filtered_no_max_ask_outliers.copy()
df_final_filtered[['bid_price1', 'bid_price2', 'bid_size1', 'bid_size2', 'ask_price1', 'ask_price2', 'ask_size1', 'ask_size2']] = df_final_filtered.apply(split_bids_asks, axis=1)
df_final_filtered.drop(['bids', 'asks'], axis=1, inplace=True)

# Display the first few rows of the DataFrame to verify the changes
df_final_filtered

Unnamed: 0,exchange,timestamp,bid_price1,bid_price2,bid_size1,bid_size2,ask_price1,ask_price2,ask_size1,ask_size2
0,Exch0,5.611,128,127,4,1,134,135,3,1
1,Exch0,6.851,129,128,4,1,130,132,3,1
2,Exch0,6.882,129,128,4,1,130,132,3,1
3,Exch0,6.944,129,128,1,1,132,134,1,3
4,Exch0,7.161,129,128,1,1,132,134,5,3
...,...,...,...,...,...,...,...,...,...,...
294447,Exch0,30599.542,132,131,5,2,141,142,1,1
294448,Exch0,30599.790,132,131,5,2,141,142,1,1
294449,Exch0,30599.821,132,131,5,2,141,142,1,1
294450,Exch0,30599.945,132,131,5,2,141,444,1,5


In [112]:
# Feature extraction
# Function to calculate first WAP
def calc_wap1(df):
    wap = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
    return wap

# Function to calculate second WAP
def calc_wap2(df):
    wap = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])
    return wap

In [113]:
#Function used to calculate log return
def log_return(series):
    # 确保Series中没有空值，并且所有输入都转换为浮点数
    series = series.dropna().astype(float)
    return np.log(series).diff()


# Calculate the realized volatility
def realized_volatility(series):
    return np.sqrt(np.sum(series**2))

In [124]:
def Feature_extraction(df):
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    df['log_return1'] = np.log(df['wap1']) - np.log(df['wap1'].shift(1))
    df['log_return2'] = np.log(df['wap2']) - np.log(df['wap2'].shift(1))
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    #Calculate the spread relative to the average price
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    #To quantify the current pressure imbalance between buyers and sellers
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))
    
     # Dict for aggregations
    create_feature_dict = {
        'wap1': [np.sum, np.mean, np.std],
        'wap2': [np.sum, np.mean, np.std],
        'log_return1': [np.sum, realized_volatility, np.mean, np.std],
        'log_return2': [np.sum, realized_volatility, np.mean, np.std],
        'wap_balance': [np.sum, np.mean, np.std],
        'price_spread':[np.sum, np.mean, np.std],
        'price_spread2':[np.sum, np.mean, np.std],
        'bid_spread':[np.sum, np.mean, np.std],
        'ask_spread':[np.sum, np.mean, np.std],
        'total_volume':[np.sum, np.mean, np.std],
        'volume_imbalance':[np.sum, np.mean, np.std],
        "bid_ask_spread":[np.sum, np.mean, np.std],
    }
    


#     # Define window size and step size
#     window_size = 100  # Window size in seconds
#     step_size = 10     # Window step size in seconds

#     # Create a new column 'window_id' to represent the window to which each timestamp belongs
#     df['window_id'] = df['timestamp'].apply(lambda x: x // step_size)

#     # Grouping and aggregation calculations.
#     aggregated_df = df.groupby('window_id').agg(create_feature_dict)

#     # The window ID can be used to calculate the start and end timestamps of each window
#     aggregated_df['window_start'] = aggregated_df.index * step_size
#     aggregated_df['window_end'] = aggregated_df['window_start'] + window_size
    
#     # Reset index so window ID is a column
#     aggregated_df.reset_index(drop=True, inplace=True)

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    
    def rolling_aggregates(df, offsets, aggregation_dict):
        results = {}
        # For each offset, calculate the rolling aggregate
        for offset in offsets:
            window_size = offset  # If the data is one point per second, window_size is equal to offset seconds
            rolled = df.rolling(window=window_size, on='timestamp').agg(aggregation_dict)
            # Rename columns to reflect offset
            rolled.columns = [f'{col}_{offset}s' for col in rolled.columns]
            results[offset] = rolled
        return results


    step_size = 10  
    offsets = [100, 200, 300, 400, 500, 600] 

    # Group by start of time window（bug）
    df['window_id'] = df['timestamp'].floordiv(step_size) * step_size


    # Compute aggregates for each window
    window_aggregates = df.groupby('window_id').agg(create_feature_dict)

    # Calculate rolling aggregation
    rolling_results = rolling_aggregates(df, offsets,create_feature_dict)

    # Merge the results of rolling aggregation into window aggregation
    for offset, result in rolling_results.items():
        window_aggregates = window_aggregates.join(result, on='window_id', rsuffix=f'_{offset}s')


    window_aggregates = window_aggregates.reset_index()



    



    
    return window_aggregates
#     return aggregated_df

In [125]:
Feature_extraction(df_final_filtered).head()

TypeError: cannot perform __floordiv__ with this index type: DatetimeArray