In [5]:
import pandas as pd
import numpy as np
import ast
import math
import re
import os

In [1]:
# Feature extraction
# Function to calculate first WAP
def calc_wap1(df):
    """Return the level-1 weighted average price (WAP).

    Each price is weighted by the size on the opposite side of the book:
    ``(bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)``.

    Parameters
    ----------
    df : mapping-like (typically a pandas DataFrame)
        Must expose the keys ``bid_price1``, ``ask_price1``, ``bid_size1``
        and ``ask_size1``.

    Returns
    -------
    The element-wise WAP, of whatever type the arithmetic on ``df``'s
    columns produces (a Series when ``df`` is a DataFrame).
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_qty, ask_qty = df['bid_size1'], df['ask_size1']

    cross_weighted = bid_px * ask_qty + ask_px * bid_qty
    top_depth = bid_qty + ask_qty
    return cross_weighted / top_depth

# Function to calculate second WAP
def calc_wap2(df):
    """Return the level-2 weighted average price (WAP).

    Same formula as the level-1 WAP but on the second book level:
    ``(bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2)``.

    Parameters
    ----------
    df : mapping-like (typically a pandas DataFrame)
        Must expose the keys ``bid_price2``, ``ask_price2``, ``bid_size2``
        and ``ask_size2``.

    Returns
    -------
    The element-wise WAP, of whatever type the arithmetic on ``df``'s
    columns produces (a Series when ``df`` is a DataFrame).
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_qty, ask_qty = df['bid_size2'], df['ask_size2']

    cross_weighted = bid_px * ask_qty + ask_px * bid_qty
    level_depth = bid_qty + ask_qty
    return cross_weighted / level_depth

In [2]:
#Function used to calculate log return
def log_return(series):
    """Return the first difference of the natural log of ``series``.

    Missing entries are dropped and the remainder is cast to float before
    the logarithm is taken, so the result keeps only the index labels of
    the non-missing inputs. The first element of the result is NaN because
    it has no predecessor to difference against.
    """
    # Drop missing entries first, then force a float dtype so np.log
    # receives numeric input.
    cleaned = series.dropna()
    cleaned = cleaned.astype(float)

    log_values = np.log(cleaned)
    return log_values.diff()


# Calculate the realized volatility
def realized_volatility(series):
    """Return the square root of the sum of squared values in ``series``.

    ``series`` is expected to hold per-step returns (e.g. log returns);
    the reduction is delegated to ``np.sum``.
    """
    squared = series ** 2
    total = np.sum(squared)
    return np.sqrt(total)

In [3]:
def Feature_extraction(df):
    """Summarise a slice of order-book rows as a one-row feature DataFrame.

    Per-row quantities (two WAPs, their log returns, several spreads and
    volume measures) are derived from the level-1 and level-2 price/size
    columns of ``df`` and then reduced to scalars. Every quantity gets
    ``sum``, ``mean`` and ``std``; the two log-return series additionally
    get ``realized_volatility``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``bid_price{1,2}``, ``ask_price{1,2}``,
        ``bid_size{1,2}`` and ``ask_size{1,2}``. ``df`` is not modified.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are named ``<quantity>_<statistic>``,
        in the order laid out by the aggregation plan below.
    """

    def _step_log_change(prices):
        # log(p_t) - log(p_{t-1}); the first row has no predecessor -> NaN.
        return np.log(prices) - np.log(prices.shift(1))

    wap1 = calc_wap1(df)
    wap2 = calc_wap2(df)

    # Distance between the best and second-best quote on each side.
    bid_gap = df['bid_price1'] - df['bid_price2']
    ask_gap = df['ask_price1'] - df['ask_price2']

    # Resting size over the first two levels of each side.
    ask_depth = df['ask_size1'] + df['ask_size2']
    bid_depth = df['bid_size1'] + df['bid_size2']

    per_row = {
        'wap1': wap1,
        'wap2': wap2,
        'log_return1': _step_log_change(wap1),
        'log_return2': _step_log_change(wap2),
        # Absolute disagreement between the two WAPs.
        'wap_balance': (wap1 - wap2).abs(),
        # Quoted spread relative to the mid price, per level.
        'price_spread': (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2),
        'price_spread2': (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2),
        'bid_spread': bid_gap,
        'ask_spread': ask_gap,
        'bid_ask_spread': (bid_gap - ask_gap).abs(),
        'total_volume': ask_depth + bid_depth,
        # Absolute size imbalance between sellers and buyers.
        'volume_imbalance': (ask_depth - bid_depth).abs(),
    }

    basic_stats = (
        ("sum", np.sum),
        ("mean", np.mean),
        ("std", np.std),
    )
    return_stats = (
        ("sum", np.sum),
        ("realized_volatility", realized_volatility),
        ("mean", np.mean),
        ("std", np.std),
    )

    # The order here fixes the column order of the returned frame.
    aggregation_plan = (
        ('wap1', basic_stats),
        ('wap2', basic_stats),
        ('log_return1', return_stats),
        ('log_return2', return_stats),
        ('wap_balance', basic_stats),
        ('price_spread', basic_stats),
        ('price_spread2', basic_stats),
        ('bid_spread', basic_stats),
        ('ask_spread', basic_stats),
        ('total_volume', basic_stats),
        ('volume_imbalance', basic_stats),
        ('bid_ask_spread', basic_stats),
    )

    summary = {}
    for quantity, stats in aggregation_plan:
        values = per_row[quantity]
        for label, reducer in stats:
            summary[quantity + "_" + label] = reducer(values)

    return pd.DataFrame([summary])





In [4]:
def sequentially(df_base, interval=600, stride=10, interval_step=(100, 200, 300, 400, 500)):
    """Extract features over sliding time windows of ``df_base``.

    Windows start at ``0, stride, 2*stride, ...`` and stop before
    ``floor(max timestamp)``. For each start ``j`` the rows with
    ``j <= timestamp < j + interval`` form the full window; each value in
    ``interval_step`` additionally defines a sub-window
    ``j <= timestamp < j + step`` taken from inside that full window.
    ``Feature_extraction`` is run on every (sub-)window.

    Parameters
    ----------
    df_base : pandas.DataFrame
        Order-book rows with a numeric ``timestamp`` column plus the
        columns ``Feature_extraction`` needs.
    interval : int, default 600
        Length of the full window, in timestamp units.
    stride : int, default 10
        Distance between consecutive window starts.
    interval_step : iterable of int, default (100, 200, 300, 400, 500)
        Sub-window lengths. Sub-windows are cut from the full window, so a
        value larger than ``interval`` cannot select more rows than the
        full window holds.

    Returns
    -------
    pandas.DataFrame
        One row per (sub-)window, full window first and then the
        sub-windows in ``interval_step`` order, with a ``range`` column
        holding the label ``"<start>-<end>"``. The one-row frames are
        concatenated as-is, so the index is not renumbered.

    Raises
    ------
    ValueError
        If ``df_base`` has no usable timestamps, or if no window start
        lies below ``floor(max timestamp)``.
    """
    timestamps = df_base['timestamp']

    last_ts = timestamps.max()
    if pd.isna(last_ts):
        # Previously surfaced as an opaque ValueError from math.floor(nan).
        raise ValueError("sequentially: 'timestamp' column has no usable values")
    max_ts = math.floor(last_ts)

    lines = []
    for start in range(0, max_ts, stride):
        window = df_base[(timestamps >= start) & (timestamps < start + interval)]

        full_features = Feature_extraction(window.copy())
        full_features['range'] = "{}-{}".format(start, start + interval)
        lines.append(full_features)

        for step in interval_step:
            # `window` is already bounded below by `start`, so only the
            # upper bound needs to be applied.
            sub_window = window[window['timestamp'] < start + step]
            sub_features = Feature_extraction(sub_window.copy())
            sub_features['range'] = "{}-{}".format(start, start + step)
            lines.append(sub_features)

    if not lines:
        # Previously surfaced as pandas' "No objects to concatenate".
        raise ValueError(
            "sequentially: no window could be generated "
            "(floor(max timestamp)={}, stride={})".format(max_ts, stride)
        )

    return pd.concat(lines, axis=0)


In [25]:
def Feature_extraction_one(file_path):
    """Build the windowed feature table for one LOB file and write it as CSV.

    The file at ``file_path`` is read with ``pd.read_csv`` and passed to
    ``sequentially``. The resulting rows are assumed to come in groups of
    six per window start, ordered 600, 100, 200, 300, 400, 500 (the order
    ``sequentially`` emits with its default settings). Each group is
    pivoted into a single wide row, ``start_time`` / ``end_time`` columns
    are appended, and the table is written to the current working
    directory as ``Featured_<matched file name>``.

    Parameters
    ----------
    file_path : str
        Path to the input file. It must contain a substring matching
        ``Clean_UoB_Set.*?\\.txt``, which becomes the output file name.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If ``file_path`` contains no ``Clean_UoB_Set...txt`` name.
    """
    # Must mirror the per-window row order produced by `sequentially`.
    window_sizes = [600, 100, 200, 300, 400, 500]
    rows_per_window = len(window_sizes)
    full_window = window_sizes[0]
    stride = 10  # distance between consecutive window starts

    lob = pd.read_csv(file_path)

    # `sequentially` concatenates one-row frames, so every row carries the
    # same index label. Renumber in memory instead of round-tripping through
    # a scratch "temp.csv" (which left a stray file in the working directory
    # and re-parsed every float).
    features = sequentially(lob).reset_index(drop=True)

    # Replace the textual "<start>-<end>" label with the window length,
    # cycling through the per-window order.
    features['range'] = [window_sizes[pos % rows_per_window] for pos in range(len(features))]

    # 1-based id shared by the rows that belong to the same window start.
    features['index'] = (features.index // rows_per_window) + 1

    # One row per window start; columns become (feature, window length).
    pivot_df = features.pivot(index='index', columns='range')

    # Window i covers [i * stride, i * stride + full_window). The start must
    # not wrap modulo the window length, otherwise rows from window 61 on
    # would claim start_time 0, 10, ... while end_time keeps increasing.
    start_time = [i * stride for i in range(len(pivot_df))]
    end_time = [begin + full_window for begin in start_time]
    pivot_df['start_time'] = start_time
    pivot_df['end_time'] = end_time

    pattern = r'Clean_UoB_Set.*?\.txt'
    file_name = "Featured_" + re.findall(pattern, file_path)[0]

    pivot_df.to_csv(file_name, index=False)

In [31]:
def lob_extract_features_all(directory):
    """Run ``Feature_extraction_one`` on every entry of ``directory``.

    Entries are taken from ``os.listdir(directory)`` in the order it
    returns them, with no filtering: each name is joined onto
    ``directory`` and handed to ``Feature_extraction_one``.
    """
    for entry in os.listdir(directory):
        Feature_extraction_one(os.path.join(directory, entry))

In [32]:
# Entry point: run the per-file feature pipeline on every entry of the
# LOBs_test directory.
# NOTE(review): the path is relative, so it resolves against the kernel's
# current working directory — confirm it before running this cell.
lob_extract_features_all('./JPMorgan_Set01/LOBs_test/')