# Mid-Price-Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam
from keras.layers import Input, Dense, LSTM, TimeDistributed, GRU, SimpleRNN
from keras.models import Model
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc

## Data Pre-Processing

read the data

In [None]:
def read_data(file):
    data_original=pd.read_csv(file, header=[0,1,2])
    data_original=data_original[1:]
    data_original.rename(columns={'Unnamed: 0_level_0':'Time','Unnamed: 0_level_1': '','Unnamed: 0_level_2': '','Unnamed: 3_level_1':'','Unnamed: 3_level_2':''},inplace=True),
    data_original.rename(columns={'Unnamed: 0_level_0':'Time','Unnamed: 0_level_1': '','Unnamed: 0_level_2': '','Unnamed: 3_level_1':'','Unnamed: 3_level_2':'','Unnamed: 2_level_2':'','Unnamed: 2_level_1':'','Unnamed: 4_level_2':'','Unnamed: 4_level_1':''},inplace=True),
    return data_original

Only keep the columns

In [None]:
def keep_columns(data_original):
    data=data_original.loc[:,['Time','level1','level2','level3','level4','level5','Quoted_spread','Weighted_Mid_Price','Mid_Price']]
    data.columns = data.columns.map('_'.join)
    data.rename(columns={'Time__':'Time','Mid_Price__':'Mid_Price','Weighted_Mid_Price__':'Weighted_Mid_Price','Quoted_spread__':'Quoted_spread'},inplace=True)
    return data

Only keep the rows

In [None]:
def keep_rows(data):
    min_index=0
    max_index=0
    last_value = data['Time'].iloc[-1]
    for index, row in data.iterrows():
        if pd.to_numeric(row['Time'])>3600:
            min_index=index
            break
    for index, row in data.iterrows():
        if pd.to_numeric(row['Time'])>last_value-3600:
            max_index=index
            break

    data=data[min_index:max_index]
    return data

Remove the level1 null rows

In [None]:
def romove_rows(data):
    level1_df=data.iloc[:,1:5]
    has_na = level1_df.isna().any(axis=1)
    # find the row index for the null values
    row_numbers = has_na.where(has_na).dropna().index
    data=data.drop(row_numbers)
    df_sub=data.drop(columns=['Time'])
    sub_columns=df_sub.columns
    data.drop_duplicates(subset=sub_columns, keep='first', inplace=True)
    return data

generate the fake prices

In [None]:
import re
def fake_price(data):
    null_in = data.isna().any(axis=1)
    null_rows = null_in.where(null_in).dropna().index
    for i in null_rows:
        row=data.loc[i,:]
        Bid_null=[]
        Ask_null=[]
        for j in range(5):
            v_col_Bid='level'+str(j+1)+'_Bid_Volume'
            v_col_Ask='level'+str(j+1)+'_Ask_Volume'
            p_col_Bid='level'+str(j+1)+'_Bid_Price'
            p_col_Ask='level'+str(j+1)+'_Ask_Price'
            if pd.isnull(row[v_col_Bid]):
                row[v_col_Bid]=0
                data.at[i,v_col_Bid]=0
            if pd.isnull(row[v_col_Ask]):
                row[v_col_Ask]=0
                data.at[i,v_col_Ask]=0
            if pd.isnull(row[p_col_Bid]):
                Bid_null.append(p_col_Bid)
            if pd.isnull(row[p_col_Ask]):
                Ask_null.append(p_col_Ask)
        # For Bid
        if len(Bid_null) >0:
            if Bid_null[0]=='level2_Bid_Price':
                diff=row['level1_Bid_Price']/5
                for k in Bid_null:
                    digit = re.findall(r'\d+', k)
                    n=int(digit[0])
                    bidff= row['level1_Bid_Price'] - diff* (n-1)
                    data.at[i,k]=bidff
            else:
                digit = re.findall(r'\d+', Bid_null[0])
                null_first=int(digit[0])
                p_col_Bid1='level'+str(null_first-1)+'_Bid_Price'
                p_col_Bid2='level'+str(null_first-2)+'_Bid_Price'
                diff=abs(row[p_col_Bid1]-row[p_col_Bid2])
                for k in Bid_null:
                    d=re.findall(r'\d+', k)
                    n=int(d[0])
                    ti=2
                    bidff=row[p_col_Bid1]-(diff/ti)*(n-1)
                    while bidff<=0:
                        ti=ti+1
                        bidff=row[p_col_Bid1]-(diff/ti)*(n-1)
                    data.at[i,k]=bidff
        if len(Ask_null) >0:
            # For Ask
            if Ask_null[0]=='level2_Ask_Price':
                diff=row['level1_Ask_Price']/5
                for k in Ask_null:
                    digit = re.findall(r'\d+', k)
                    n=int(digit[0])
                    askff= row['level1_Ask_Price'] + diff* (n-1)
                    data.at[i,k]=askff
            else:
                digit = re.findall(r'\d+', Ask_null[0])
                null_first=int(digit[0])
                p_col_Ask1='level'+str(null_first-1)+'_Ask_Price'
                p_col_Ask2='level'+str(null_first-2)+'_Ask_Price'
                diff=abs(row[p_col_Ask2]-row[p_col_Ask1])
                for k in Ask_null:
                    d=re.findall(r'\d+', k)
                    n=int(d[0])
                    askff=row[p_col_Ask1] + (diff/2)*(n-1)
                    data.at[i,k]=askff
    return data

Time intervals

In [None]:
def time_intervals(data):
    data_Time=data.loc[:,'Time']
    data.loc[:,'Time']=data_Time.diff().dropna()
    data.dropna(inplace=True)
    # Reset the index
    data=data.reset_index(drop=True)
    return data

Run all of the files

In [1]:
folder_path="./Dataset/HSBC_Set01/LOBs_Benchmark_new/"
new_folder_path="./LOBs_Benchmark/"
LOBdata = pd.DataFrame()
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        name = os.path.splitext(filename)[0]
        file_path=os.path.join(folder_path, filename)
        data= read_data(file)
        data=keep_columns(data)
        data=keep_rows(data)
        data=romove_rows(data)
        data=fake_price(data)
        data=time_intervals(data)
        # Use scaled data
        data['Next_mid']=data.Mid_Price.shift(-1)
        data.dropna(axis=0, inplace=True) #drop the last row which is now Nan
        data['Price_move'] = data.Next_mid - data.Mid_Price
        data['up_or_down'] = np.where(data['Price_move']<0, 0, 1) #for now defining Price_move=0 as 1
        LOBdata=pd.concat([LOBdata, data], axis=0)

LOBdata.to_csv(os.path.join(new_folder_path+'processed_data.csv'),index=False)

NameError: name 'pd' is not defined