# Load tapes and lob data

Load the data one day at a time; otherwise the memory (RAM) required exceeds what most computers have.

In [1]:
# code
from fast_tools import get_data, get_data_gen

#data = get_data()

# Clean Data
Remove outliers from the LOB and create additional columns noting this.

Forward-fill the Tapes data to get the most up-to-date Tapes price.

In [2]:
# code
import numpy as np
from numba import njit, prange

def _volume_weighted_prices(trades):
    """Repeat each trade price (column 1) once per whole unit of volume (column 2)."""
    # Truncate like int() and treat negative volumes as "no copies", matching
    # the behaviour of `[price] * int(volume)`.
    counts = np.maximum(trades[:, 2].astype(np.int64), 0)
    return np.repeat(trades[:, 1], counts)


def get_tapes_window(tapes):
    """
    Fit a rolling price window to the Tapes data and count look-ahead breaches.

    Every 60 seconds a window is fitted to the trades of the preceding hour:
    each trade price is repeated once per unit of volume, and the window is the
    mean -/+ 3.29 standard deviations of that sample. The trades of the minute
    that follows are then counted above and below the window.

    Parameters:
    -----------
    tapes : np.array
        2-D array read as column 0 = time in seconds, column 1 = price,
        column 2 = volume. Rows are scanned forwards only, so they are assumed
        to be sorted by time -- TODO confirm against the data loader.

    Returns:
    --------
    tuple
        A tuple containing:
        - outside: list of (end_time, n_above, n_below, n_future) tuples, where
          n_future is the volume-weighted number of trades in the look-ahead
          minute and n_above / n_below count those outside the window.
        - w_bids: list of lower window bounds (mean - z * std).
        - w_asks: list of upper window bounds (mean + z * std).
        Windows without any trades get NaN bounds and zero counts.
    """
    dt = 60 * 60                # length of the fitting window, in seconds
    step = 60                   # window stride and look-ahead length, in seconds
    z = 3.29                    # 99.9%
    day_end = 8.5 * 60 * 60     # length of the trading day, in seconds

    n_tapes = tapes.shape[0]
    w_bids = []
    w_asks = []
    outside = []

    t_start = 0
    start_time = 0
    while True:
        end_time = start_time + dt

        # Advance to the first trade at or after end_time; the bounds check
        # keeps a tape that ends early from raising IndexError.
        t_end = t_start
        while t_end < n_tapes and tapes[t_end, 0] < end_time:
            t_end += 1

        rolling_tapes = _volume_weighted_prices(tapes[t_start:t_end])
        if rolling_tapes.size == 0:
            # No trades in the window: the bounds are undefined.
            mean = np.nan
            std = np.nan
        else:
            mean = np.mean(rolling_tapes)
            std = np.std(rolling_tapes)

        w_bid = mean - std * z
        w_ask = mean + std * z
        w_bids.append(w_bid)
        w_asks.append(w_ask)

        # look one minute ahead
        local_end = t_end
        while local_end < n_tapes and tapes[local_end, 0] < end_time + step:
            local_end += 1
        future_tapes = _volume_weighted_prices(tapes[t_end:local_end])

        # Comparisons against NaN bounds are False, so empty windows count 0.
        n_above = int(np.count_nonzero(future_tapes > w_ask))
        n_below = int(np.count_nonzero(future_tapes < w_bid))
        outside.append((end_time, n_above, n_below, int(future_tapes.size)))

        start_time += step
        while t_start < n_tapes and tapes[t_start, 0] < start_time:
            t_start += 1

        # NOTE(review): this stops once end_time + dt reaches the end of the
        # day, so the last window ends one hour before day_end. Kept as in the
        # original; confirm whether windows up to day_end were intended.
        if end_time + dt >= day_end:
            break

    return outside, w_bids, w_asks

@njit(parallel=True)
def get_features(lob_data: np.array, 
                 lob_times: np.array, 
                 tapes: np.array, 
                 time_step_s: int, 
                 window_data: np.array,
                 ab_weight = 1, 
                 median = True, 
                 ):
    """
    Calculate per-time-step features from LOB and Tapes data.

    The 8.5 hour day is cut into consecutive steps of `time_step_s` seconds and
    one row of features is produced per step.

    Parameters:
    -----------
    lob_data : np.array
        2-D array of limit order book (LOB) snapshots, one row per snapshot.
        Column `p - 1` is read as the quantity at price `p`: negative entries
        are treated as bids, positive entries as asks.
    lob_times : np.array
        Timestamps (seconds) of the LOB snapshots, one per row of `lob_data`.
    tapes : np.array
        2-D array of Tapes data read as column 0 = time in seconds,
        column 1 = price, column 2 = volume.
    time_step_s : int
        Time step in seconds for calculating features.
    window_data : np.array
        2-D array with one row per price window: column 0 is matched against
        the step end time, columns 1 and 2 are copied to LOWIN and HIWIN, and
        columns 4 and 5 are the lower / upper price bounds used for the
        consolidated bid / ask activity (CBS / CAS).
        NOTE(review): the notebook's driver fills column 1 with the count above
        the window and column 2 with the count below it -- confirm that the
        LOWIN / HIWIN mapping is the intended way round.
    ab_weight : float, optional
        Weight parameter for alpha and beta calculations, by default 1.
    median : bool, optional
        Whether to aggregate HIBID, LOASK, ALPHA and BETA over a step with the
        median instead of the mean, by default True. The median propagates
        NaN, whereas the mean branch ignores it (np.nanmean).

    Returns:
    --------
    tuple
        A tuple containing:
        - feat_arr: np.array
            Array of shape (n_steps, n_features) containing feature values.
            Features that cannot be computed for a step are np.nan.
        - features: list
            List of feature names, in column order.
    """
    n_levels = 800                                                      # number of price levels tracked by cas / cbs

    n_rows = int((8.5 * 60 * 60) / time_step_s)                         # define number of rows of output array
    features = ["MP","HIBID","LOASK","AP","WBP","WAP",                  # define features
                "TCBS","TCAS","AWS","VOL","GAP","SPREAD",
                "ALPHA", "BETA", "ZETA", "ENDT", 
                "PSTD", "LOWIN", "HIWIN", "nUoD",
                "close_hibid", "close_loask"]
    n_features = len(features)                                          # define number of features

    feat_arr = np.zeros((n_rows, n_features), dtype=np.float64)         # array to hold feature values

    n_lob = lob_data.shape[0]                                           # number of LOB snapshots
    n_tapes = tapes.shape[0]                                            # number of Tapes rows

    LA_HB_a_b = np.zeros((n_lob + 1, 4), dtype = np.float64)            # per-snapshot LOASK, HIBID,
                                                                        # alpha, beta values

    for i in prange(n_lob):                                             # iterates over the LOB to fill
        row = lob_data[i]                                               # LA_HB_a_b values

        neg_ind = np.where(row < 0)[0]                                  # locate bid and ask prices (indices)
        pos_ind = np.where(row > 0)[0]

        if len(neg_ind) == 0:                                           # assign HIBID, np.nan if no bids
            LA_HB_a_b[i][1] = np.nan
        else:
            LA_HB_a_b[i][1] = max(neg_ind) + 1

        if len(pos_ind) == 0:                                           # assign LOASK, np.nan if no asks
            LA_HB_a_b[i][0] = np.nan
        else:
            LA_HB_a_b[i][0] = min(pos_ind) + 1

        mid_price = (LA_HB_a_b[i][0] + LA_HB_a_b[i][1]) / 2             # calculate mid_price for alpha/beta calculations

        if np.isnan(mid_price):
            alpha = np.nan
            beta = np.nan
        else:                                                           # calculate alpha/beta using ab_weight var:
            beta = 0.0                                                  # quantity weighted by inverse distance to mid
            for ind in neg_ind:
                beta += (-1 * row[ind]) / ((mid_price - (ind + 1)) + ab_weight)

            alpha = 0.0
            for ind in pos_ind:
                alpha += row[ind] / (((ind + 1) - mid_price) + ab_weight)

        LA_HB_a_b[i][2] = alpha
        LA_HB_a_b[i][3] = beta

    start_time = 0                                                       # define start time
    lob_start = 0                                                        # define start index for lob
    tapes_start = 0                                                      # define start index for tapes

    # int64 rather than int16 so that large activity counts cannot wrap around
    cas = np.zeros(n_levels, dtype = np.int64)                           # define an array to hold CAS values
    cbs = np.zeros(n_levels, dtype = np.int64)                           # define an array to hold CBS values
    for row_i in range(n_rows):
        end_time = start_time + time_step_s                              # move to next time step
        lob_end = lob_start
        tapes_end = tapes_start

        # get lob end index; the bound is checked first so that empty input is
        # never indexed and the final snapshot can be part of the last step
        while lob_end < n_lob and lob_times[lob_end] < end_time:
            lob_end += 1

        # get tapes end index (same bound handling as for the LOB)
        while tapes_end < n_tapes and tapes[tapes_end][0] < end_time:
            tapes_end += 1

        # price window for this step; looked up before the LOB branch so that
        # LOWIN / HIWIN are defined even when the step has no LOB data
        window_index = np.where(window_data[:,0] == end_time)[0]
        has_window = len(window_index) == 1
        if has_window:
            LOWIN = window_data[window_index[0], 1]
            HIWIN = window_data[window_index[0], 2]
        else:
            LOWIN = 0.0
            HIWIN = 0.0

        # feature calculations
        if tapes_start == tapes_end:                                     # if there is no tapes data
            AP = np.nan                                                  # set tapes features to np.nan
            VOL = np.nan
            PSTD = np.nan
            nUoD = np.nan
        else:
            tapes_slice = tapes[tapes_start:tapes_end]                   # extract tapes slice, calculate AP, VOL
            expanded = []

            for trade in tapes_slice:                                    # repeat each price once per unit of volume
                for _ in range(int(trade[2])):
                    expanded.append(trade[1])

            trade_prices = np.array(expanded, dtype=np.int32)
            AP = np.mean(trade_prices)                                   # volume-weighted average price
            VOL = np.sum(tapes_slice[:,2])
            PSTD = np.std(trade_prices)

            tapes_price_diff = tapes_slice[:,1][1:] - tapes_slice[:,1][:-1]
            n_ups = np.sum(tapes_price_diff > 0)
            n_downs = np.sum(tapes_price_diff < 0)
            nUoD = (n_ups + 1) / (n_downs + 1) - 1                       # smoothed up-tick / down-tick ratio, 0 if balanced

        if lob_start == lob_end:                                         # if there is no LOB data
            MP = np.nan                                                  # set lob features to np.nan
            HIBID = np.nan
            LOASK = np.nan
            SPREAD = np.nan
            TCBS = np.nan
            TCAS = np.nan
            WBP = np.nan
            WAP = np.nan
            AWS = np.nan
            ALPHA = np.nan
            BETA = np.nan
            ZETA = np.nan
            close_hibid = np.nan
            close_loask = np.nan

        else:
            lob_slice = lob_data[lob_start:lob_end]                       # extract slices of data
            LA_HB_a_b_slice = LA_HB_a_b[lob_start:lob_end]

            # midprice_calcs, alpha, beta
            if median:                                                    # calculate price features
                HIBID = np.median(LA_HB_a_b_slice[:,1])                   # using median if set to true
                LOASK = np.median(LA_HB_a_b_slice[:,0])
                ALPHA = np.median(LA_HB_a_b_slice[:,2])
                BETA = np.median(LA_HB_a_b_slice[:,3])
            else:
                HIBID = np.nanmean(LA_HB_a_b_slice[:,1])
                LOASK = np.nanmean(LA_HB_a_b_slice[:,0])
                ALPHA = np.nanmean(LA_HB_a_b_slice[:,2])
                BETA = np.nanmean(LA_HB_a_b_slice[:,3])

            close_hibid = LA_HB_a_b_slice[:,1][-1]                        # values of the last snapshot in the step
            close_loask = LA_HB_a_b_slice[:,0][-1]

            MP = (HIBID + LOASK) / 2
            SPREAD = LOASK - HIBID
            ZETA = BETA - ALPHA

            if HIBID >= LOASK:
                print("WARNING: HIBID >= LOASK")

            # consolidated calcs
            cas[:] = 0                                                      # reset cas, cbs arrays for new data
            cbs[:] = 0

            if has_window:
                w_bid = window_data[window_index[0], 4]
                w_ask = window_data[window_index[0], 5]
            else:                                                           # no window for this step: fall back to
                w_bid = MP - 100                                            # a fixed band around the mid price
                w_ask = MP + 100

            # Clamp the scanned columns to valid indices: an unclamped range
            # can go negative or past the arrays, and numba does not bounds
            # check. If the bounds are NaN no window is available, so every
            # price level is scanned.
            n_price = min(n_levels, lob_slice.shape[1])
            if np.isnan(w_bid) or np.isnan(w_ask):
                ci_lo = 0
                ci_hi = n_price
            else:
                ci_lo = max(int(np.floor(w_bid) - 1), 0)
                ci_hi = min(int(np.ceil(w_ask) + 2), n_price)

            for ci in prange(ci_lo, ci_hi):
                # activity at one price level = opening quantity plus the sum
                # of absolute quantity changes between consecutive snapshots
                cbs_vec = lob_slice[:,ci].copy() * -1                   # bid side: flip sign, drop ask entries
                cbs_vec[cbs_vec <= 0] = 0
                cbs[ci] = np.sum(np.abs(np.diff(cbs_vec))) + cbs_vec[0]

                cas_vec = lob_slice[:,ci].copy()                        # ask side: drop bid entries
                cas_vec[cas_vec <= 0] = 0
                cas[ci] = np.sum(np.abs(np.diff(cas_vec))) + cas_vec[0]

            TCBS = np.sum(cbs)                                              # Total CBS
            TCAS = np.sum(cas)                                              # Total CAS

            if TCBS == 0:                                                   # Calculate WBP, np.nan if no activity
                WBP = np.nan
            else:
                WBP = 0.0                                                   # activity-weighted bid price
                for ci in prange(n_levels):
                    WBP += (ci + 1) * (cbs[ci] / TCBS)

            if TCAS == 0:                                                   # Calculate WAP, np.nan if no activity
                WAP = np.nan
            else:
                WAP = 0.0                                                   # activity-weighted ask price
                for ci in prange(n_levels):
                    WAP += (ci + 1) * (cas[ci] / TCAS)

            AWS = WAP - WBP                                                 # Activity weighted spread calc

        # feature setting
        feat_arr[row_i][features.index("AP")] = AP                          # set the values to the feat_arr
        feat_arr[row_i][features.index("VOL")] = VOL
        feat_arr[row_i][features.index("MP")] = MP
        feat_arr[row_i][features.index("HIBID")] = HIBID
        feat_arr[row_i][features.index("LOASK")] = LOASK
        feat_arr[row_i][features.index("SPREAD")] = SPREAD
        feat_arr[row_i][features.index("TCAS")] = TCAS
        feat_arr[row_i][features.index("TCBS")] = TCBS
        feat_arr[row_i][features.index("WAP")] = WAP
        feat_arr[row_i][features.index("WBP")] = WBP
        feat_arr[row_i][features.index("AWS")] = AWS
        feat_arr[row_i][features.index("ALPHA")] = ALPHA
        feat_arr[row_i][features.index("BETA")] = BETA
        feat_arr[row_i][features.index("ZETA")] = ZETA
        feat_arr[row_i][features.index("GAP")] = MP - AP
        feat_arr[row_i][features.index("ENDT")] = end_time
        feat_arr[row_i][features.index("PSTD")] = PSTD
        feat_arr[row_i][features.index("LOWIN")] = LOWIN
        feat_arr[row_i][features.index("HIWIN")] = HIWIN
        feat_arr[row_i][features.index("nUoD")] = nUoD
        feat_arr[row_i][features.index("close_hibid")] = close_hibid
        feat_arr[row_i][features.index("close_loask")] = close_loask


        # adjust start times
        start_time = end_time                                                # Set the next start times and
        lob_start = lob_end                                                  # indices to the last end times / indices
        tapes_start = tapes_end

    return feat_arr, features

window_length_s = 60        # feature time step, in seconds
c = 0                       # number of days processed so far
all_features = []           # one (feat_arr, feature_names) tuple per day
for lob, lob_times, tapes in get_data_gen():
    print(c, end = "\r")

    # Pack the rolling-window results into a single float array:
    # columns 0-3 hold the `outside` tuples, column 4 the lower bound and
    # column 5 the upper bound of each window.
    outside, w_bids, w_asks = get_tapes_window(tapes)
    window_data = np.column_stack((
        np.array(outside, dtype=float).reshape(-1, 4),
        np.asarray(w_bids, dtype=float),
        np.asarray(w_asks, dtype=float),
    ))

    features = get_features(lob, lob_times, tapes, window_length_s, window_data)
    all_features.append(features)
    c += 1

124

In [3]:
import pandas as pd

# Build one DataFrame per day and add the derived columns.
dfs = []
for i, (day_values, day_columns) in enumerate(all_features):
    df = pd.DataFrame(day_values, columns=day_columns)

    mid_price = df["MP"]
    df["WMP"] = (df["WBP"] + df["WAP"]) / 2             # mid point of the weighted bid / ask prices
    df["MP_diff(y)"] = mid_price.diff(1)                # change in mid price since the previous step
    df["MP_perc(y)"] = df["MP_diff(y)"] / mid_price     # that change relative to the current mid price
    df["DAY"] = i                                       # position of the day in all_features

    dfs.append(df)

# Summary statistics of the last day only (`df` is the final loop value).
df.describe()

Unnamed: 0,MP,HIBID,LOASK,AP,WBP,WAP,TCBS,TCAS,AWS,VOL,...,PSTD,LOWIN,HIWIN,nUoD,close_hibid,close_loask,WMP,MP_diff(y),MP_perc(y),DAY
count,509.0,509.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,...,510.0,510.0,510.0,510.0,510.0,510.0,510.0,507.0,507.0,510.0
mean,120.80943,108.336935,133.283333,107.773434,97.821416,119.812024,799.272549,389.211765,21.990607,108.692157,...,2.805573,0.0,1.576471,-0.30061,107.109804,176.068627,108.81672,0.00789,-0.003664,124.0
std,7.717613,1.516618,14.955586,1.438187,14.256804,13.093612,556.606094,238.180699,26.602279,12.428077,...,1.104335,0.0,4.374537,0.156839,7.837226,105.686126,3.229437,10.870564,0.087142,0.0
min,101.5,94.0,107.0,101.575472,59.121086,105.265306,145.0,98.0,-0.04287,69.0,...,1.157225,0.0,0.0,-0.6875,29.0,93.0,93.861267,-52.5,-0.454545,124.0
25%,115.75,108.0,123.0,107.07717,103.54146,112.702025,472.0,233.25,7.081053,101.0,...,1.992277,0.0,0.0,-0.40867,107.0,114.0,108.102343,-6.0,-0.05,124.0
50%,119.0,108.0,130.0,108.102273,105.201785,113.984241,556.0,298.0,8.257971,109.0,...,2.354911,0.0,0.0,-0.316986,108.0,132.0,109.413179,0.0,0.0,124.0
75%,124.0,109.0,140.0,108.754597,106.018,115.434346,729.5,443.0,10.143054,117.0,...,3.507367,0.0,0.0,-0.2,110.0,189.75,110.180411,5.125,0.042598,124.0
max,168.0,112.0,227.0,110.213483,107.823881,177.776699,2743.0,1185.0,89.878338,140.0,...,6.667187,0.0,38.0,0.166667,113.0,776.0,136.64835,47.0,0.291022,124.0


# Extract features from LOB and Tapes

Get the mean, standard deviation, trend and delta of each feature over 60-minute segments.

In [4]:
# code

# Features summarised over each trailing segment (std / mean / close / delta / trend).
features = ["TCAS", "TCBS", "ALPHA", "BETA", "ZETA",
            "WMP", "AWS", "VOL", "GAP", "nUoD", "PSTD"]

# Features that are summed over the segment instead.
sum_features = ["LOWIN", "HIWIN"]

# Every other column is copied unchanged from the step that follows the
# segment. `df` is the last frame built by the previous cell.
y_feats = list(df)

for f in features:
    try:
        y_feats.remove(f)
    except ValueError:      # list.remove raises ValueError when the column is missing
        print("couldnt remove", f)

for f in sum_features:
    try:
        y_feats.remove(f)
    except ValueError:
        print("couldnt remove", f)

print(y_feats)

segment_len = 60            # number of consecutive steps summarised per sample

X_dfs = []

for c, df in enumerate(dfs):
    print(c, end = "\r")

    # Collect plain dict rows and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all earlier rows each time.
    rows = []
    row_labels = []

    for i in range(len(df) - segment_len):
        train_segment = df.iloc[i:i + segment_len]
        test_segment = df.iloc[i + segment_len:i + segment_len + 1]

        row = {}
        for f in features:
            # std
            row[f+"_std"] = np.std(train_segment[f])
            # mean
            if f != "WMP": # exclude mean midprice
                row[f+"_mean"] = np.mean(train_segment[f])
                row[f+"_close"] = train_segment[f].iloc[-1]
            # delta
            row[f+"_delta"] = train_segment[f].iloc[-1] - train_segment[f].iloc[0]
            # trend: correlation of the feature with its position in the segment
            row[f+"_corr"] = np.corrcoef(np.arange(len(train_segment[f])), train_segment[f].to_numpy())[0, 1]

        for f in sum_features:
            row[f+"_sum"] = np.sum(train_segment[f])

        for yf in y_feats:
            row[yf] = test_segment[yf].iloc[0]
        row["ENDT"] = test_segment["ENDT"].iloc[0]

        rows.append(row)
        row_labels.append(test_segment.index[0])    # keep the label of the step being predicted

    X_df = pd.DataFrame(rows, index=row_labels)

    X_dfs.append(X_df)

X_dfs[0]

['MP', 'HIBID', 'LOASK', 'AP', 'WBP', 'WAP', 'SPREAD', 'ENDT', 'close_hibid', 'close_loask', 'MP_diff(y)', 'MP_perc(y)', 'DAY']
124

Unnamed: 0,TCAS_std,TCAS_mean,TCAS_close,TCAS_delta,TCAS_corr,TCBS_std,TCBS_mean,TCBS_close,TCBS_delta,TCBS_corr,...,AP,WBP,WAP,SPREAD,ENDT,close_hibid,close_loask,MP_diff(y),MP_perc(y),DAY
60,173.435675,764.000000,404.0,-792.0,-0.275200,428.585872,1370.983333,471.0,-820.0,-0.281637,...,257.488000,254.389776,263.023622,42.0,3660.0,261.0,300.0,16.25,0.058244,0
61,176.232924,748.300000,254.0,-590.0,-0.281038,438.983360,1359.900000,626.0,-1271.0,-0.328495,...,254.805310,250.456057,259.973251,3.0,3720.0,236.0,254.0,-22.50,-0.087719,0
62,178.931520,742.333333,486.0,-223.0,-0.302718,438.246068,1342.316667,842.0,-926.0,-0.326624,...,257.831683,250.848848,262.843537,5.0,3780.0,261.0,262.0,0.00,0.000000,0
63,179.974673,740.316667,588.0,-398.0,-0.330746,435.905336,1330.933333,1085.0,-258.0,-0.316472,...,262.461538,256.406330,269.501695,5.0,3840.0,264.0,269.0,9.00,0.033898,0
64,178.095208,733.716667,590.0,-466.0,-0.317706,436.393934,1328.033333,1169.0,-273.0,-0.325841,...,262.118280,255.204082,268.119675,6.0,3900.0,246.0,255.0,-2.50,-0.009506,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,179.894290,836.666667,842.0,172.0,-0.069611,330.086495,1404.850000,1950.0,909.0,0.284667,...,289.822917,262.212719,310.930890,6.0,30360.0,288.0,290.0,-4.50,-0.015464,0
506,179.192103,841.416667,955.0,282.0,-0.078436,326.716355,1410.300000,1368.0,474.0,0.251713,...,293.675000,273.735082,312.932917,8.0,30420.0,298.0,306.0,6.00,0.020202,0
507,179.739357,840.883333,641.0,-432.0,-0.137357,320.014817,1420.816667,1525.0,931.0,0.219803,...,289.133333,259.579163,309.648770,4.0,30480.0,288.0,293.0,-7.00,-0.024138,0
508,177.331300,837.900000,894.0,493.0,-0.092298,302.631865,1431.233333,1219.0,473.0,0.133301,...,290.021277,263.860664,312.310627,7.0,30540.0,289.0,296.0,2.50,0.008547,0


In [5]:
# Stack the per-day sample frames row-wise and export them to a CSV named
# after the feature time step.
merged_dfs = pd.concat(X_dfs)
merged_dfs.to_csv(f"final_eval_{window_length_s}.csv")