# Load tapes and lob data

Load the data one day at a time; loading every day at once needs more memory (RAM) than most computers have.

In [1]:
# code
from fast_tools import get_data, get_data_gen

#data = get_data()

# Clean Data
Remove outliers from the LOB and create an additional column noting this.

Forward-fill (ffill) the tapes data to get the most up-to-date tapes price.

In [2]:
# code
import numpy as np
from numba import njit, prange

def get_tapes_window(tapes):
    """
    Build a rolling one-hour price band from the tapes and count how many of
    the trades in the following minute fall outside it.

    The band is recomputed every 60 seconds from the trades whose timestamp
    lies in ``[start_time, start_time + 3600)``. Each trade is weighted by its
    volume, i.e. a row contributes its price ``int(volume)`` times.

    Parameters
    ----------
    tapes : np.ndarray
        2-D array read as: column 0 = timestamp in seconds, column 1 = price,
        column 2 = volume. The scans below walk forward and stop at the first
        row at or past a boundary, so rows are presumably time-ordered
        (TODO confirm against the loader).

    Returns
    -------
    tuple
        ``(outside, w_bids, w_asks)`` where

        - ``outside`` is a list of ``(end_time, n_above, n_below, n_future)``
          tuples: the window end time, the number of volume-weighted prices in
          the next 60 seconds above the upper band / below the lower band, and
          the number of volume-weighted prices in that look-ahead minute.
        - ``w_bids`` / ``w_asks`` are the lower / upper band
          (``mean -/+ 3.29 * std``) per window. Both are NaN for a window that
          contains no trades.

    Raises
    ------
    ValueError
        If a window end time is not a whole multiple of 60 seconds.
    """
    dt = 60 * 60                      # rolling window length in seconds
    step = 60                         # window stride / look-ahead length in seconds
    z = 3.29                          # 99.9%
    session_end = 8.5 * 60 * 60       # seconds
    n_tapes = tapes.shape[0]

    def expand(lo, hi):
        # Volume-weighted prices for rows lo..hi-1: each price repeated
        # int(volume) times. Non-positive volumes contribute nothing, matching
        # ``[price] * int(volume)``.
        block = tapes[lo:hi]
        counts = np.maximum(block[:, 2].astype(np.int64), 0)
        return np.repeat(block[:, 1], counts)

    w_bids = []
    w_asks = []
    outside = []

    t_start = 0
    start_time = 0
    while True:
        end_time = start_time + dt
        if end_time % 60 != 0:
            raise ValueError(f"window end time {end_time} is not a multiple of 60 s")

        # Rows inside the rolling window. The bounds check keeps the scan from
        # running off the end of the array when the tape stops early.
        t_end = t_start
        while t_end < n_tapes and tapes[t_end, 0] < end_time:
            t_end += 1
        rolling_tapes = expand(t_start, t_end)

        if rolling_tapes.size:
            mean = np.mean(rolling_tapes)
            std = np.std(rolling_tapes)
        else:
            # No trades in this window: the band is undefined.
            mean = np.nan
            std = np.nan

        w_bid = mean - std * z
        w_ask = mean + std * z
        w_bids.append(w_bid)
        w_asks.append(w_ask)

        # Look one minute ahead of the window end.
        local_end = t_end
        while local_end < n_tapes and tapes[local_end, 0] < end_time + step:
            local_end += 1
        future_tapes = expand(t_end, local_end)

        # Comparisons against a NaN band are False, so an undefined band
        # reports zero breaches.
        n_above = int(np.count_nonzero(future_tapes > w_ask))
        n_below = int(np.count_nonzero(future_tapes < w_bid))
        outside.append((end_time, n_above, n_below, len(future_tapes)))

        # Slide the window start forward by one step.
        start_time += step
        while t_start < n_tapes and tapes[t_start, 0] < start_time:
            t_start += 1

        # NOTE(review): this stops once end_time + dt reaches the session end,
        # so the last band emitted ends at 27000 s, one hour before the 8.5 h
        # mark. Kept as in the original; confirm this is intended.
        if end_time + dt >= session_end:
            break

    return outside, w_bids, w_asks

@njit(parallel=True)
def get_features(lob_data: np.array, 
                 lob_times: np.array, 
                 tapes: np.array, 
                 time_step_s: int, 
                 window_data: np.array,
                 ab_weight = 1, 
                 median = True, 
                 ):
    """
    Calculate per-time-bucket features from LOB and Tapes data.

    The 8.5 hour session is cut into consecutive buckets of ``time_step_s``
    seconds. For every bucket one row of features is produced from the LOB
    rows and tape rows whose timestamps fall before the bucket end.

    Parameters:
    -----------
    lob_data : np.array
        2-D array containing the limit order book (LOB) data, one row per
        snapshot. Negative entries are read as bid quantities and positive
        entries as ask quantities; the price is taken as column index + 1.
    lob_times : np.array
        Array containing timestamps (seconds) for the rows of ``lob_data``.
    tapes : np.array
        2-D array containing Tapes data, read as column 0 = timestamp,
        column 1 = price, column 2 = volume.
    time_step_s : int
        Time step in seconds for calculating features.
    window_data : np.array
        2-D array with one row per tape window. Column 0 is matched against
        the bucket end time, columns 1 and 2 are copied to LOWIN and HIWIN,
        and columns 4 and 5 give the lower / upper price bound used for the
        consolidated bid / ask activity (CBS / CAS).
    ab_weight : float, optional
        Weight parameter for alpha and beta calculations, by default 1.
    median : bool, optional
        Whether to aggregate HIBID, LOASK, ALPHA and BETA over a bucket with
        the median instead of the NaN-ignoring mean, by default True.

    Returns:
    --------
    tuple
        A tuple containing:
        - feat_arr: np.array
            Array of shape (number of buckets, number of features) containing
            feature values.
        - features: list
            List of feature names, in column order.
    """
    
    n_rows = int((8.5 * 60 * 60) / time_step_s)                         # define number of rows of output array
    features = ["MP","HIBID","LOASK","AP","WBP","WAP",                  # define features
                "TCBS","TCAS","AWS","VOL","GAP","SPREAD",
                "ALPHA", "BETA", "ZETA", "ENDT", 
                "PSTD", "LOWIN", "HIWIN", "nUoD",
                "close_hibid", "close_loask"]
    n_features = len(features)                                          # define number of features

    feat_arr = np.zeros((n_rows, n_features), dtype=np.float64)         # array to hold feature values
    
    LA_HB_a_b = np.zeros((lob_data.shape[0]+1, 4), dtype = np.float64)  # array holding the LOASK, HIBID,
                                                                        # alpha, beta, values 

    for i in prange(lob_data.shape[0]):                                 # iterates over the LOB to fill
        row = lob_data[i]                                               # LA_HB_a_b values
        
        neg_ind = np.where(row < 0)[0]                                  # locate bid and ask prices (indicies)
        pos_ind = np.where(row > 0)[0]
        
        if len(neg_ind) == 0:                                           # assign HIBID, np.nan if no values
            LA_HB_a_b[i][1] = np.nan
        else:
            LA_HB_a_b[i][1] = max(neg_ind) + 1 

        if len(pos_ind) == 0:                                           # assign LOASK, np.nan if no values
            LA_HB_a_b[i][0] = np.nan
        else:
            LA_HB_a_b[i][0] = min(pos_ind) + 1

        mid_price = (LA_HB_a_b[i][0] + LA_HB_a_b[i][1]) / 2             # calculate mid_price for alpha/beta calculations

        if np.isnan(mid_price):
            alpha = np.nan
            beta = np.nan
        else:                                                           # calculate alpha/beta using ab_weight var
            beta = 0
            for ind in neg_ind:                                         # bid quantity weighted by 1 / (distance below mid + ab_weight)
                beta += (-1 * row[ind]) / ((mid_price - (ind + 1)) + ab_weight)
    
            alpha = 0
            for ind in pos_ind:                                         # ask quantity weighted by 1 / (distance above mid + ab_weight)
                alpha += row[ind] / (((ind + 1) - mid_price) + ab_weight)
                

        LA_HB_a_b[i][2] = alpha
        LA_HB_a_b[i][3] = beta
        
    max_lob = lob_data.shape[0] - 1                                      # define max indicies for lob
    max_tapes = tapes.shape[0] - 1                                       # define max indicies for tapes
    
    start_time = 0                                                       # define start time
    lob_start = 0                                                        # define start index for lob
    tapes_start = 0                                                      # define start index for tapes
    
    cas = np.zeros(800, dtype = np.int16)                                # define an array to hold CAS values
    cbs = np.zeros(800, dtype = np.int16)                                # define an array to hold CBS values
    n_cols = min(lob_data.shape[1], cbs.shape[0])                        # columns that are valid for both the LOB and cas/cbs
    for row_i in range(n_rows):
        end_time = start_time + time_step_s                              # move to next time step
        lob_end = lob_start
        tapes_end = tapes_start

        # get lob end index
        # NOTE(review): the scan stops at max_lob and the slices below exclude
        # their end index, so the final LOB / tape row is never part of a bucket.
        while lob_times[lob_end] < end_time and lob_end < max_lob:       # move lob indicies to end time
            lob_end += 1
        
        # get tapes end index
        while tapes[tapes_end][0] < end_time and tapes_end < max_tapes:  # move tapes indicies to end time
            tapes_end += 1

        # tape window lookup: done for every bucket so LOWIN / HIWIN are
        # always defined, including buckets that contain no LOB rows
        # NOTE(review): the driver cell in this notebook fills window_data
        # column 1 with n_above and column 2 with n_below, so LOWIN / HIWIN
        # may be swapped relative to their names — confirm.
        window_index = np.where(window_data[:,0] == end_time)[0]
        has_window = len(window_index) == 1
        if has_window:
            LOWIN = window_data[window_index[0], 1]
            HIWIN = window_data[window_index[0], 2]
        else:
            LOWIN = 0.0
            HIWIN = 0.0

        # feature calculations
        if tapes_start == tapes_end:                                     # if there is no tapes data
            AP = np.nan                                                  # set tapes features to np.nan
            VOL = np.nan
            PSTD = np.nan
            nUoD = np.nan
        else:
            tapes_slice = tapes[tapes_start:tapes_end]                   # extract tapes slice, calculate AP, VOL
            tapes_list = []
            
            for row in tapes_slice:                                      # repeat each price once per unit of volume
                for _ in range(int(row[2])):
                    tapes_list.append(row[1])

            tapes_list = np.array(tapes_list, dtype=np.int32)
            AP = np.mean(tapes_list)                                     # volume-weighted average trade price
            VOL = np.sum(tapes_slice[:,2])
            PSTD = np.std(tapes_list)

            tapes_price_diff = tapes_slice[:,1][1:] - tapes_slice[:,1][:-1]
            n_ups = np.sum(tapes_price_diff > 0)
            n_downs = np.sum(tapes_price_diff < 0)
            nUoD = (n_ups + 1) / (n_downs + 1) - 1                       # smoothed up/down tick ratio, 0 when balanced

        if lob_start == lob_end:                                         # if there is no LOB data
            MP = np.nan                                                  # set lob features to np.nan
            HIBID = np.nan
            LOASK = np.nan
            SPREAD = np.nan
            TCBS = np.nan
            TCAS = np.nan
            WBP = np.nan
            WAP = np.nan
            AWS = np.nan
            ALPHA = np.nan
            BETA = np.nan
            ZETA = np.nan  
            close_hibid = np.nan
            close_loask = np.nan

        else:
            lob_slice = lob_data[lob_start:lob_end]                       # extract slices of data 
            LA_HB_a_b_slice = LA_HB_a_b[lob_start:lob_end]                

            # midprice_calcs, alpha, beta
            if median:                                                    # calculate price features
                HIBID = np.median(LA_HB_a_b_slice[:,1])                   # using median if set to true
                LOASK = np.median(LA_HB_a_b_slice[:,0])
                ALPHA = np.median(LA_HB_a_b_slice[:,2])
                BETA = np.median(LA_HB_a_b_slice[:,3])
            else:
                HIBID = np.nanmean(LA_HB_a_b_slice[:,1])
                LOASK = np.nanmean(LA_HB_a_b_slice[:,0])
                ALPHA = np.nanmean(LA_HB_a_b_slice[:,2])
                BETA = np.nanmean(LA_HB_a_b_slice[:,3])

            close_hibid = LA_HB_a_b_slice[:,1][-1]                        # values of the last LOB row in the bucket
            close_loask = LA_HB_a_b_slice[:,0][-1]

            MP = (HIBID + LOASK) / 2
            SPREAD = LOASK - HIBID
            ZETA = BETA - ALPHA

            if HIBID >= LOASK:
                print("WARNING: HIBID >= LOASK")

            # consolidated calcs
            cas[:] = 0                                                      # reset cas, cbs arrays for new data
            cbs[:] = 0 

            if has_window:
                w_bid = window_data[window_index[0], 4]
                w_ask = window_data[window_index[0], 5]
            else:                                                           # no tape window for this bucket:
                w_bid = MP - 100                                            # fall back to +/- 100 around the mid price
                w_ask = MP + 100

            # Clamp the price window to columns that exist. Without this a
            # lower bound below zero addresses columns from the far end of the
            # book via negative indexing, and an upper bound past the last
            # column runs off the cas / cbs arrays.
            if np.isnan(w_bid) or np.isnan(w_ask):                          # undefined window -> no activity counted
                ci_lo = 0
                ci_hi = 0
            else:
                ci_lo = max(int(np.floor(w_bid) - 1), 0)
                ci_hi = min(int(np.ceil(w_ask) + 2), n_cols)

            for ci in prange(ci_lo, ci_hi):
                # can optimise with LOASK AND HIBID here
                                                                        # bid side: sum of absolute quantity changes
                cbs_vec = lob_slice[:,ci].copy() * -1                   # at this price plus the opening quantity
                cbs_vec[cbs_vec <= 0] = 0
                cbs[ci] = np.sum(np.abs(np.diff(cbs_vec))) + cbs_vec[0]

                                                                        # ask side: same calculation on positive quantities
                cas_vec = lob_slice[:,ci].copy()
                cas_vec[cas_vec <= 0] = 0
                cas[ci] = np.sum(np.abs(np.diff(cas_vec))) + cas_vec[0]

            TCBS = np.sum(cbs)                                              # Total CBS
            TCAS = np.sum(cas)                                              # Total CAS

            if TCBS == 0:                                                   # Calculate WBP, np.nan if no activity
                WBP = np.nan
            else:
                WBP = 0
                for ci in prange(800):
                    WBP += (ci + 1) * (cbs[ci] / TCBS)

            if TCAS == 0:                                                   # Calculate WAP, np.nan if no activity
                WAP = np.nan
            else:
                WAP = 0
                for ci in prange(800):
                    WAP += (ci + 1) * (cas[ci] / TCAS)

            AWS = WAP - WBP                                                 # Activity weighted spread calc

        # feature setting
        feat_arr[row_i][features.index("AP")] = AP                          # set the values to the feat_arr
        feat_arr[row_i][features.index("VOL")] = VOL
        feat_arr[row_i][features.index("MP")] = MP
        feat_arr[row_i][features.index("HIBID")] = HIBID
        feat_arr[row_i][features.index("LOASK")] = LOASK
        feat_arr[row_i][features.index("SPREAD")] = SPREAD
        feat_arr[row_i][features.index("TCAS")] = TCAS
        feat_arr[row_i][features.index("TCBS")] = TCBS
        feat_arr[row_i][features.index("WAP")] = WAP
        feat_arr[row_i][features.index("WBP")] = WBP
        feat_arr[row_i][features.index("AWS")] = AWS 
        feat_arr[row_i][features.index("ALPHA")] = ALPHA
        feat_arr[row_i][features.index("BETA")] = BETA
        feat_arr[row_i][features.index("ZETA")] = ZETA
        feat_arr[row_i][features.index("GAP")] = MP - AP
        feat_arr[row_i][features.index("ENDT")] = end_time
        feat_arr[row_i][features.index("PSTD")] = PSTD
        feat_arr[row_i][features.index("LOWIN")] = LOWIN
        feat_arr[row_i][features.index("HIWIN")] = HIWIN
        feat_arr[row_i][features.index("nUoD")] = nUoD
        feat_arr[row_i][features.index("close_hibid")] = close_hibid
        feat_arr[row_i][features.index("close_loask")] = close_loask


        # adjust start times
        start_time = end_time                                                # Set the next start times and 
        lob_start = lob_end                                                  # indicies to the last end times / indicies
        tapes_start = tapes_end

    return feat_arr, features

# Bucket length (seconds) used for every per-day feature table.
window_length_s = 15

# One (feature array, feature names) pair per trading day, in generator order.
all_features = []
for day_idx, (lob, lob_times, tapes) in enumerate(get_data_gen()):
    print(day_idx, end = "\r")

    # Rolling tape band and look-ahead breach counts for this day.
    outside, w_bids, w_asks = get_tapes_window(tapes)

    # Columns 0-3: the `outside` tuple; column 4: lower band; column 5: upper band.
    window_data = np.zeros((len(outside), 6), dtype = float)
    for target, stats, lower, upper in zip(window_data, outside, w_bids, w_asks):
        target[:4] = stats
        target[4] = lower
        target[5] = upper

    day_features = get_features(lob, lob_times, tapes, window_length_s, window_data)
    all_features.append(day_features)

124

In [3]:
import pandas as pd

# One DataFrame per day, each extended with the weighted mid price, the
# bucket-to-bucket mid price change (absolute and relative) and the day number.
dfs = []
for day, (values, names) in enumerate(all_features):
    day_df = pd.DataFrame(values, columns=names)

    mid_price = day_df["MP"]
    mid_step = mid_price.diff(1)

    day_df["WMP"] = (day_df["WBP"] + day_df["WAP"]) / 2
    day_df["MP_diff(y)"] = mid_step
    day_df["MP_perc(y)"] = mid_step / mid_price
    day_df["DAY"] = day

    dfs.append(day_df)

# `df` stays bound to the last day's frame; a later cell reads its columns.
df = dfs[-1]
df.describe()

Unnamed: 0,MP,HIBID,LOASK,AP,WBP,WAP,TCBS,TCAS,AWS,VOL,...,PSTD,LOWIN,HIWIN,nUoD,close_hibid,close_loask,WMP,MP_diff(y),MP_perc(y),DAY
count,2037.0,2038.0,2039.0,2040.0,2030.0,2016.0,2040.0,2040.0,2008.0,2040.0,...,2040.0,2040.0,2040.0,2040.0,2040.0,2040.0,2008.0,2034.0,2034.0,2040.0
mean,124.961095,107.649411,142.230505,107.880379,81.693436,139.367125,376.804412,172.172549,57.593848,27.173039,...,2.092764,0.0,0.394118,-0.217295,106.561275,167.516176,110.23125,0.021632,-0.019697,124.0
std,18.930247,4.441086,36.676765,2.364483,14.507753,23.116651,191.425026,90.092616,27.834036,8.047978,...,1.088788,0.0,2.28983,0.511986,10.025625,94.541804,12.474487,27.498415,0.203425,0.0
min,64.75,31.5,94.0,95.571429,41.733333,98.0,0.0,0.0,-9.225146,2.0,...,0.0,0.0,0.0,-0.875,1.0,93.0,81.412433,-116.5,-1.134409,124.0
25%,113.5,107.0,119.0,107.19185,71.259103,123.77256,233.0,110.0,50.718658,22.0,...,1.419884,0.0,0.0,-0.5,107.0,113.0,102.468867,-14.5,-0.126468,124.0
50%,119.0,108.0,129.0,108.315038,77.631482,136.05125,362.5,164.0,62.977284,27.0,...,1.825674,0.0,0.0,-0.333333,108.0,130.0,108.748407,-1.125,-0.009392,124.0
75%,132.0,110.0,155.0,109.310884,90.878739,150.26965,493.25,226.0,73.911857,33.0,...,2.399519,0.0,0.0,0.0,110.0,174.0,113.850625,13.5,0.103368,124.0
max,261.5,113.0,415.0,113.0,109.90604,310.264368,1197.0,536.0,183.778862,55.0,...,7.299808,0.0,38.0,6.0,114.0,776.0,191.1407,138.5,0.550696,124.0


# Extract features from LOB and Tapes

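Get the mean, std, trend and delta from rolling 60-row segments (each row is one `window_length_s`-second bucket, so a segment spans 60 × `window_length_s` seconds — 15 minutes at the 15-second setting used here, not 60 minutes).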

In [9]:
# code

# Number of consecutive rows in each look-back segment; the target row is the
# row immediately after the segment.
SEGMENT_ROWS = 60

# Columns summarised over the look-back segment (std / mean / close / delta / trend).
features = ["TCAS", "TCBS", "ALPHA", "BETA", "ZETA",
            "WMP", "AWS", "VOL", "GAP", "nUoD", "PSTD"]

# Columns that are summed over the look-back segment.
sum_features = ["LOWIN", "HIWIN"]

# Every remaining column is copied unchanged from the target row. The column
# list is read from the last day's frame.
y_feats = list(dfs[-1].columns)

for f in features + sum_features:
    if f in y_feats:
        y_feats.remove(f)
    else:
        print("couldnt remove", f)

print(y_feats)

X_dfs = []

for c, df in enumerate(dfs):
    print(c, end = "\r")

    # Row positions within a segment, used as the x-axis for the trend correlation.
    positions = np.arange(SEGMENT_ROWS)

    # Collect plain dict rows and build the frame once per day; concatenating a
    # one-row frame per iteration copies the whole frame every time.
    records = []
    labels = []

    for i in range(len(df) - SEGMENT_ROWS):
        train_segment = df.iloc[i:i + SEGMENT_ROWS]
        test_segment = df.iloc[i + SEGMENT_ROWS:i + SEGMENT_ROWS + 1]

        row = {}
        for f in features:
            segment_values = train_segment[f]
            # std
            row[f+"_std"] = np.std(segment_values)
            # mean
            if f != "WMP": # exclude mean midprice
                row[f+"_mean"] = np.mean(segment_values)
                row[f+"_close"] = segment_values.iloc[-1]
            # delta
            row[f+"_delta"] = segment_values.iloc[-1] - segment_values.iloc[0]
            # trend: correlation of the values with their position in the segment
            row[f+"_corr"] = np.corrcoef(positions, segment_values.to_numpy())[0, 1]

        for f in sum_features:
            row[f+"_sum"] = np.sum(train_segment[f])

        # Target-row columns (includes ENDT), copied as scalars.
        for yf in y_feats:
            row[yf] = test_segment[yf].iloc[0]

        records.append(row)
        # Keep the target row's label as the index of the output row.
        labels.append(test_segment.index[0])

    X_df = pd.DataFrame(records, index=labels)
    X_dfs.append(X_df)

X_dfs[0]

['MP', 'HIBID', 'LOASK', 'AP', 'WBP', 'WAP', 'SPREAD', 'ENDT', 'close_hibid', 'close_loask', 'MP_diff(y)', 'MP_perc(y)', 'DAY']
0

124

Unnamed: 0,TCAS_std,TCAS_mean,TCAS_close,TCAS_delta,TCAS_corr,TCBS_std,TCBS_mean,TCBS_close,TCBS_delta,TCBS_corr,...,AP,WBP,WAP,SPREAD,ENDT,close_hibid,close_loask,MP_diff(y),MP_perc(y),DAY
60,67.729484,238.316667,123.0,-235.0,-0.044226,141.763238,436.866667,580.0,447.0,0.036801,...,253.416667,226.285714,271.532338,9.0,915.0,254.0,255.0,-5.5,-0.022044,0
61,66.067466,235.700000,201.0,-36.0,-0.008201,138.614464,438.616667,238.0,-438.0,-0.067440,...,258.821429,249.355731,314.946746,15.0,930.0,264.0,317.0,17.0,0.063790,0
62,66.616406,234.566667,169.0,-231.0,-0.035987,137.110584,431.566667,253.0,-39.0,-0.055794,...,263.600000,246.286920,288.360976,11.0,945.0,262.0,318.0,-1.0,-0.003766,0
63,63.131738,231.316667,205.0,-59.0,0.025648,138.219973,430.650000,237.0,-34.0,-0.124949,...,266.307692,248.942761,307.420765,43.0,960.0,263.0,268.0,19.0,0.066784,0
64,63.284270,229.966667,183.0,-150.0,0.019070,137.758762,431.083333,297.0,-68.0,-0.186928,...,262.318182,248.523585,306.835052,38.0,975.0,266.0,299.0,-3.5,-0.012456,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2035,77.234382,211.316667,324.0,179.0,0.130294,125.954220,364.633333,243.0,-141.0,0.034024,...,291.200000,269.971193,309.209964,7.0,30540.0,289.0,296.0,1.5,0.005128,0
2036,77.250521,213.583333,281.0,60.0,0.130678,126.882898,362.283333,243.0,-266.0,0.011040,...,287.272727,272.952273,310.855895,4.0,30555.0,288.0,289.0,-5.5,-0.019164,0
2037,77.270109,213.716667,229.0,47.0,0.139126,125.856461,361.133333,440.0,65.0,0.062879,...,291.869565,249.435398,307.721951,5.0,30570.0,288.0,295.0,5.5,0.018803,0
2038,77.168798,214.100000,205.0,4.0,0.124038,128.527468,364.300000,565.0,188.0,0.109771,...,285.071429,251.435685,303.539683,5.0,30585.0,290.0,293.0,-3.0,-0.010363,0


In [10]:
# Stack every day's segment features row-wise and write them to a CSV whose
# name carries the bucket length.
output_path = f"final_eval_{window_length_s}.csv"
merged_dfs = pd.concat(list(X_dfs), axis = 0)
merged_dfs.to_csv(output_path)