In [1]:
import numpy as np
import pandas as pd
import os
import numba

%load_ext line_profiler

In [2]:
# Locations of the XBTUSD quote and trade CSV dumps, relative to the
# current working directory.
quotes_filename = os.path.join(
    "play_data",
    "XBTUSD_quotes_191214_0434.csv",
)
trades_filename = os.path.join(
    "play_data",
    "XBTUSD_trades_191214_0434.csv",
)

In [3]:
# Load the quotes CSV. Both the "recorded" and "timestamp" columns are parsed
# as datetimes; "recorded" becomes the index and is then renamed "received".
datetime_columns = ['recorded', 'timestamp']
df = pd.read_csv(quotes_filename, index_col='recorded', parse_dates=datetime_columns)
df.index.names = ["received"]
df

Unnamed: 0_level_0,timestamp,bidSize,bidPrice,askPrice,askSize
received,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-12-14 04:34:40.731941+00:00,2019-12-14 04:34:40.024000+00:00,3840427,7251.5,7252.0,701477
2019-12-14 04:34:41.211565+00:00,2019-12-14 04:34:40.410000+00:00,3840391,7251.5,7252.0,701477
2019-12-14 04:34:42.210955+00:00,2019-12-14 04:34:41.165000+00:00,3840391,7251.5,7252.0,731477
2019-12-14 04:34:42.210955+00:00,2019-12-14 04:34:41.183000+00:00,3840187,7251.5,7252.0,731277
2019-12-14 04:34:42.210955+00:00,2019-12-14 04:34:41.431000+00:00,3840187,7251.5,7252.0,715277
...,...,...,...,...,...
2019-12-14 11:21:25.279438+00:00,2019-12-14 11:21:24.810000+00:00,1737201,7171.0,7171.5,2502314
2019-12-14 11:21:25.279438+00:00,2019-12-14 11:21:24.971000+00:00,1737201,7171.0,7171.5,2562314
2019-12-14 11:21:25.279438+00:00,2019-12-14 11:21:24.980000+00:00,1681090,7171.0,7171.5,2562314
2019-12-14 11:21:25.279438+00:00,2019-12-14 11:21:24.992000+00:00,1681090,7171.0,7171.5,2570314


In [4]:
def label_window(bid_array, ask_array):
    """
    Return the label of the first price move inside one window of quotes.

    Element 0 of each input is the reference quote; every element is
    compared against it and the label of the earliest row that differs in
    the tracked direction is returned.

    labels:
    0 -- No bid decrease and no ask increase anywhere in the window
    1 -- Ask price increased
    2 -- Bid price decreased
    3 -- Ask price increased AND Bid price decreased (in the same row)

    Parameters
    ----------
    bid_array, ask_array : array-like
        Bid and ask prices of the window, in row order. Anything accepted
        by ``np.asarray`` works (ndarray, list, Series). Both are expected
        to have the same length.

    Returns
    -------
    int
        One of 0, 1, 2, 3. An empty window yields 0.
    """
    # Coerce first so positional [0] and element-wise comparison behave the
    # same for lists, Series with arbitrary indexes, and ndarrays.
    bids = np.asarray(bid_array)
    asks = np.asarray(ask_array)

    # Nothing to compare against: treat an empty window as "no change"
    # instead of failing on the [0] lookup below.
    if bids.size == 0 or asks.size == 0:
        return 0

    row_labels = (
        2 * (bids < bids[0])   # 0: bid no change, 2: bid dec
        + (asks > asks[0])     # 0: ask no change, 1: ask inc
    )

    moved = np.flatnonzero(row_labels)  # positions of rows with a non-zero label
    if moved.size == 0:
        return 0
    return row_labels[moved[0]]  # first non-zero label
    
def label_df(df, lookahead):
    """
    Label every row by the first price move seen within `lookahead` of it.

    labels:
    0 -- No price change
    1 -- Ask price increased
    2 -- Bid price decreased
    3 -- Ask price increased AND Bid price decreased

    For each row, the window consists of that row plus all following rows
    whose index value is at most ``lookahead`` later (end inclusive). The
    row's own bid/ask is the reference the rest of the window is compared to.

    Windows are built by position, not by index label, so rows that share
    the same index value each get their own window and their own label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "bidPrice" and "askPrice" columns and have an index
        sorted in increasing order to which a ``pd.Timedelta`` can be added.
    lookahead : str or timedelta-like
        Window length; anything accepted by ``pd.Timedelta``.

    Returns
    -------
    pandas.Series
        Float-valued labels (0.0 .. 3.0) aligned with ``df.index``.

    Raises
    ------
    ValueError
        If ``df.index`` is not sorted in increasing order.
    """
    prices = df[["bidPrice", "askPrice"]]
    index = prices.index

    # searchsorted below silently returns garbage on unsorted input.
    if not index.is_monotonic_increasing:
        raise ValueError("label_df requires df.index to be sorted in increasing order")

    bids = prices["bidPrice"].to_numpy()
    asks = prices["askPrice"].to_numpy()

    td = pd.Timedelta(lookahead)
    # window_ends[i] is one past the last row whose index value is
    # <= index[i] + td; side="right" keeps the end of the window inclusive.
    window_ends = index.searchsorted(index + td, side="right")

    labels = np.zeros(len(prices))
    for row, end in enumerate(window_ends):
        # Slice starts at `row` itself, so the reference quote is this row's
        # own prices even when earlier rows carry the same index value.
        labels[row] = label_window(bids[row:end], asks[row:end])

    return pd.Series(labels, index=index)

In [5]:
# Add one label column per lookahead horizon: 1S_label, 3S_label, 5S_label.
for horizon in ("1S", "3S", "5S"):
    # Column names keep the upper-case tag, but the Timedelta string is passed
    # in lower case: the upper-case "S" unit is deprecated in recent pandas,
    # while "1s" is accepted by every version.
    df[horizon + "_label"] = label_df(df, horizon.lower())
df.head()

Unnamed: 0_level_0,timestamp,bidSize,bidPrice,askPrice,askSize,1S_label,3S_label,5S_label
received,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-12-14 04:34:40.731941+00:00,2019-12-14 04:34:40.024000+00:00,3840427,7251.5,7252.0,701477,0.0,0.0,0.0
2019-12-14 04:34:41.211565+00:00,2019-12-14 04:34:40.410000+00:00,3840391,7251.5,7252.0,701477,0.0,0.0,0.0
2019-12-14 04:34:42.210955+00:00,2019-12-14 04:34:41.165000+00:00,3840391,7251.5,7252.0,731477,0.0,0.0,0.0
2019-12-14 04:34:42.210955+00:00,2019-12-14 04:34:41.183000+00:00,3840187,7251.5,7252.0,731277,0.0,0.0,0.0
2019-12-14 04:34:42.210955+00:00,2019-12-14 04:34:41.431000+00:00,3840187,7251.5,7252.0,715277,0.0,0.0,0.0
