# Study notebook

For formatting the cell: use alt+shift+F


[link](https://stackoverflow.com/questions/65747615/how-to-format-jupyter-notebook-in-vscode)

In [57]:
import datetime
import numpy as np
import pandas as pd
import datetime
import os
import torch
from torch.utils.data import Dataset, random_split, DataLoader
from importlib import reload

import sys

import data_provider.data_prep, data_provider.data_loader, utils.timefeatures


pd.set_option("display.max_columns", None)


reload(data_provider.data_prep)
reload(data_provider.data_loader)
reload(utils.timefeatures)

<module 'utils.timefeatures' from 'C:\\Users\\qilin\\My Drive\\code\\ETS\\utils\\timefeatures.py'>

# Run all

In [None]:
pa2 = utils.data_prep.make_parquet(
    2021, 6, 25, 2021, 8, 10,
    folder="./data",
    save_folder="./data_new",
)
pa2.head()

In [None]:
pa3 = utils.data_prep.make_parquet(
    2021, 9, 1, 2022, 3, 25,
    folder="./data",
    save_folder="./data_new",
)
pa3.head()

In [None]:
train_df, valid_df, test_df = utils.data_prep.split_parquet(
    "data_new/SB_20210625_20210810", "df.parquet"
)

In [None]:
train_df, valid_df, test_df = utils.data_prep.split_parquet(
    "data_new/SB_20210901_20220325", "df.parquet"
)

In [4]:
from sklearn.preprocessing import FunctionTransformer

# def sin_transformer(period):
# 	return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))

# def cos_transformer(period):
# 	return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

mid_price = lambda x: np.mean(
    np.repeat(
        pd.concat([x["L1-AskPrice"], x["L1-BidPrice"]]),
        pd.concat([x["L1-AskSize"], x["L1-BidSize"]]),
    )
)


def ask_price(level):
    return lambda x: np.mean(np.repeat(x[f"L{level}-AskPrice"], x[f"L{level}-AskSize"]))


def ask_size(level):
    return lambda x: np.mean(x[f"L{level}-AskSize"])


def bid_price(level):
    return lambda x: np.mean(np.repeat(x[f"L{level}-BidPrice"], x[f"L{level}-BidSize"]))


def bid_size(level):
    return lambda x: np.mean(x[f"L{level}-BidSize"])


def encode(data, col, max_val, time_col="time"):
    data[col + "_sin"] = np.sin(2 * np.pi * data[col] / max_val)
    data[col + "_cos"] = np.cos(2 * np.pi * data[col] / max_val)
    return data


def encode_withSeries(series, max_val, set_name_to):
    a = np.sin(2 * np.pi * series / max_val)
    b = np.cos(2 * np.pi * series / max_val)
    return pd.concat(
        [series, a, b],
        axis=1,
        keys=[set_name_to, set_name_to + "_sin", set_name_to + "_cos"],
    )


def get_time_features(df, time_col="time", coef=10**3):
    # ! these are for grouped subjuects

    ts_micro = (
        df["time"].values.astype(np.int64) // coef
    )  #  nanosec to microsec, divided by 10**3
    ts_micro = pd.Series(ts_micro)

    microseconds_in_day = 24 * 60 * 60 * 1e6

    s1 = encode_withSeries(df["time"].dt.month, 12, "month")
    s2 = encode_withSeries(
        df["time"].dt.day_of_year, df["time"].dt.is_leap_year.astype(int) + 365, "day"
    )
    s3 = encode_withSeries(ts_micro, microseconds_in_day, "microseconds")
    s4 = encode_withSeries(df["time"].dt.day_of_week, 6, "day_of_week")

    return pd.concat([s1, s2, s3, s4], axis=1)

    # can directly do these 2:
    # df['month_sin'] = np.sin(2*np.pi*df.month/12)
    # df['month_cos'] = np.cos(2*np.pi*df.month/12)

    # df.set_index('time', inplace = True) # Without this FunctionTransformer will not work
    # df["month_sin"] = sin_transformer(12).fit_transform(df)["month"]
    # df["month_cos"] = cos_transformer(12).fit_transform(df)["month"]
    # df["day_sin"] = sin_transformer(31).fit_transform(df)["day"]
    # df["day_cos"] = cos_transformer(31).fit_transform(df)["day"]


# ~ On the question of using 31 vs 30 for a month, pick 31, run these two:
# print(np.sin((31*2.*np.pi/30)),np.sin((30*2.*np.pi/30)),np.sin(1*2.*np.pi/30),np.sin(2*2.*np.pi/30))
# print(np.sin((31*2.*np.pi/31)),np.sin((30*2.*np.pi/31)), np.sin(1*2.*np.pi/31), np.sin(2*2.*np.pi/31))
# 1000000


def day_csv_transform(
    df,
    str_col="Date-Time",
    freq="5S",
    price_cols=[
        "L10-AskPrice",
        "L9-AskPrice",
        "L8-AskPrice",
        "L7-AskPrice",
        "L6-AskPrice",
        "L5-AskPrice",
        "L4-AskPrice",
        "L3-AskPrice",
        "L2-AskPrice",
        "L1-AskPrice",
        "L1-BidPrice",
        "L2-BidPrice",
        "L3-BidPrice",
        "L4-BidPrice",
        "L5-BidPrice",
        "L6-BidPrice",
        "L7-BidPrice",
        "L8-BidPrice",
        "L9-BidPrice",
        "L10-BidPrice",
    ],
    size_cols=[
        "L10-AskSize",
        "L9-AskSize",
        "L8-AskSize",
        "L7-AskSize",
        "L6-AskSize",
        "L5-AskSize",
        "L4-AskSize",
        "L3-AskSize",
        "L2-AskSize",
        "L1-AskSize",
        "L1-BidSize",
        "L2-BidSize",
        "L3-BidSize",
        "L4-BidSize",
        "L5-BidSize",
        "L6-BidSize",
        "L7-BidSize",
        "L8-BidSize",
        "L9-BidSize",
        "L10-BidSize",
    ],
):
    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.'time'.html
    df.rename(columns={str_col: "time"}, inplace=True)

    # datetime64 ns utc to datetime
    # https://stackoverflow.com/questions/62917882/convert-datetime64ns-utc-pandas-column-to-datetime
    df["time"] = pd.to_datetime(df["time"])  # .dt.tz_localize(None)

    df_cols = ["time"]

    for k in range(1, 11):
        df_cols += [
            f"L{k}-AskPrice",
            f"L{k}-AskSize",
            f"L{k}-BidPrice",
            f"L{k}-BidSize",
        ]
    df = df[df_cols]
    df = df.dropna()
    df.reset_index(inplace=True)

    group_df = df.groupby(pd.Grouper(key="time", freq=freq))

    lst_price_ask = []
    lst_size_ask = []
    lst_price_buy = []
    lst_size_buy = []

    for i in range(10, 0, -1):

        _price = group_df.apply(ask_price(i)).transform(lambda x: x.fillna(0))
        lst_price_ask.append(_price)
        _size = group_df.apply(ask_size(i)).transform(lambda x: x.fillna(0))
        lst_size_ask.append(_size)

    for i in range(1, 11):

        _price = group_df.apply(bid_price(i)).transform(lambda x: x.fillna(0))
        lst_price_buy.append(_price)
        _size = group_df.apply(bid_size(i)).transform(lambda x: x.fillna(0))
        lst_size_buy.append(_size)

    lst_price = lst_price_ask + lst_price_buy
    lst_size = lst_size_ask + lst_size_buy

    p_df = pd.concat(lst_price, axis=1, keys=price_cols)
    v_df = pd.concat(lst_size, axis=1, keys=size_cols)

    p_df.reset_index(inplace=True)
    v_df.reset_index(inplace=True)

    time_features = get_time_features(p_df)

    macro_midprice = pd.DataFrame( (p_df["L1-AskPrice"] * v_df["L1-AskSize"] + p_df["L1-BidPrice"] * v_df["L1-BidSize"]) / (v_df["L1-AskSize"] + v_df["L1-BidSize"]), columns=['midprice']).fillna(0)


    total = pd.concat(
        [
            p_df.drop(["time"], axis=1),
            v_df.drop(["time"], axis=1),
            time_features,
            macro_midprice,
        ],
        axis=1,
    )

    return total


In [5]:
df_1 = pd.read_csv(f"../data/SB/raw/SB_2022-02-15.csv")
total = day_csv_transform(df_1)
print(total.columns[0:20], total.columns[40:53])


Index(['L10-AskPrice', 'L9-AskPrice', 'L8-AskPrice', 'L7-AskPrice',
       'L6-AskPrice', 'L5-AskPrice', 'L4-AskPrice', 'L3-AskPrice',
       'L2-AskPrice', 'L1-AskPrice', 'L1-BidPrice', 'L2-BidPrice',
       'L3-BidPrice', 'L4-BidPrice', 'L5-BidPrice', 'L6-BidPrice',
       'L7-BidPrice', 'L8-BidPrice', 'L9-BidPrice', 'L10-BidPrice'],
      dtype='object') Index(['month', 'month_sin', 'month_cos', 'day', 'day_sin', 'day_cos',
       'microseconds', 'microseconds_sin', 'microseconds_cos', 'day_of_week',
       'day_of_week_sin', 'day_of_week_cos', 'midprice'],
      dtype='object')


In [76]:
total

Unnamed: 0,L10-AskPrice,L9-AskPrice,L8-AskPrice,L7-AskPrice,L6-AskPrice,L5-AskPrice,L4-AskPrice,L3-AskPrice,L2-AskPrice,L1-AskPrice,L1-BidPrice,L2-BidPrice,L3-BidPrice,L4-BidPrice,L5-BidPrice,L6-BidPrice,L7-BidPrice,L8-BidPrice,L9-BidPrice,L10-BidPrice,L10-AskSize,L9-AskSize,L8-AskSize,L7-AskSize,L6-AskSize,L5-AskSize,L4-AskSize,L3-AskSize,L2-AskSize,L1-AskSize,L1-BidSize,L2-BidSize,L3-BidSize,L4-BidSize,L5-BidSize,L6-BidSize,L7-BidSize,L8-BidSize,L9-BidSize,L10-BidSize,month,month_sin,month_cos,day,day_sin,day_cos,microseconds,microseconds_sin,microseconds_cos,day_of_week,day_of_week_sin,day_of_week_cos,midprice
0,18.351792,18.336760,18.332585,18.321419,18.307796,18.302264,18.298545,18.298221,18.294719,18.271596,18.235972,18.212182,18.205479,18.189478,18.170661,18.169059,18.169002,18.139942,18.125620,18.135428,39.506667,40.986667,36.680000,36.746667,53.786667,62.026667,30.800000,27.280000,19.920000,6.600000,4.800000,2.933333,1.946667,1.786667,3.026667,4.533333,6.280000,4.586667,4.840000,5.453333,2,0.866025,0.5,46,0.711657,0.702527,1644913800000000,0.793353,-6.087614e-01,1,0.866025,0.5,18.256596
1,18.338640,18.328158,18.314154,18.307520,18.299840,18.291470,18.280758,18.269671,18.259620,18.250132,18.232031,18.222500,18.214412,18.204757,18.193158,18.184493,18.173077,18.161875,18.150465,18.146162,34.217391,31.391304,40.086957,37.695652,114.434783,30.173913,11.478261,6.608696,3.434783,3.304348,2.782609,4.173913,5.913043,4.478261,3.304348,3.000000,2.260870,2.782609,3.739130,4.304348,2,0.866025,0.5,46,0.711657,0.702527,1644913805000000,0.793132,-6.090499e-01,1,0.866025,0.5,18.241857
2,18.333197,18.323000,18.312810,18.300981,18.296569,18.284963,18.273774,18.264293,18.254167,18.245914,18.226242,18.218204,18.210641,18.201180,18.192115,18.179619,18.169855,18.157130,18.147926,18.141389,31.900000,31.000000,33.100000,104.000000,61.200000,18.066667,10.333333,6.133333,3.600000,3.100000,5.500000,5.566667,5.200000,5.366667,3.466667,3.500000,2.300000,3.600000,4.500000,4.800000,2,0.866025,0.5,46,0.711657,0.702527,1644913810000000,0.792910,-6.093382e-01,1,0.866025,0.5,18.233333
3,18.338187,18.327898,18.317952,18.305037,18.299470,18.288955,18.278131,18.268696,18.258727,18.249941,18.230000,18.220000,18.210000,18.200000,18.190000,18.180000,18.170000,18.160000,18.150000,18.140000,34.200000,31.400000,33.200000,54.000000,113.200000,26.800000,10.700000,9.200000,5.500000,34.000000,9.500000,6.000000,6.000000,4.000000,6.000000,4.000000,4.000000,1.000000,9.000000,4.000000,2,0.866025,0.5,46,0.711657,0.702527,1644913815000000,0.792689,-6.096265e-01,1,0.866025,0.5,18.245586
4,18.337384,18.327014,18.317082,18.303881,18.299178,18.289375,18.275556,18.268065,18.257500,18.249623,18.234167,18.225909,18.215714,18.206667,18.194706,18.186667,18.175714,18.168421,18.151290,18.147500,33.857143,31.571429,33.285714,62.571429,104.285714,22.857143,6.428571,8.857143,5.714286,7.571429,6.857143,6.285714,6.000000,5.142857,4.857143,5.142857,4.000000,2.714286,4.428571,6.857143,2,0.866025,0.5,46,0.711657,0.702527,1644913820000000,0.792467,-6.099147e-01,1,0.866025,0.5,18.242277
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6836,18.235789,18.222500,18.216598,18.200561,18.198409,18.182059,18.175070,18.163498,18.151851,18.147776,18.130594,18.123316,18.113466,18.105119,18.092070,18.085333,18.077216,18.061364,18.058636,18.043077,22.800000,25.600000,19.400000,78.500000,52.800000,27.200000,28.400000,26.300000,60.500000,53.500000,20.200000,38.600000,50.200000,29.500000,45.400000,45.000000,19.400000,26.400000,17.600000,5.200000,2,0.866025,0.5,46,0.711657,0.702527,1644947980000000,-0.999999,-1.454441e-03,1,0.866025,0.5,18.143066
6837,18.240000,18.230000,18.220000,18.210000,18.200000,18.190000,18.180000,18.170000,18.160000,18.150000,18.140000,18.130000,18.120000,18.110000,18.100000,18.090000,18.080000,18.070000,18.060000,18.050000,33.000000,16.000000,32.000000,11.000000,111.000000,14.000000,36.000000,23.000000,28.000000,103.666667,5.666667,31.000000,39.833333,37.000000,23.000000,60.000000,35.000000,9.000000,38.000000,4.000000,2,0.866025,0.5,46,0.711657,0.702527,1644947985000000,-0.999999,-1.090831e-03,1,0.866025,0.5,18.149482
6838,18.240000,18.230000,18.220000,18.210000,18.200000,18.190000,18.180000,18.170000,18.160000,18.150000,18.140000,18.130000,18.120000,18.110000,18.100000,18.090000,18.080000,18.070000,18.060000,18.048235,33.000000,16.000000,32.000000,11.000000,111.000000,13.800000,35.700000,22.900000,28.000000,95.100000,7.300000,28.100000,36.500000,35.500000,19.900000,58.500000,33.500000,7.800000,36.800000,3.400000,2,0.866025,0.5,46,0.711657,0.702527,1644947990000000,-1.000000,-7.272205e-04,1,0.866025,0.5,18.149287
6839,18.240000,18.230000,18.220000,18.210000,18.200000,18.190000,18.180000,18.170000,18.160000,18.150000,18.131342,18.124299,18.115216,18.109409,18.090351,18.086471,18.078571,18.061282,18.058889,18.030625,33.000000,16.000000,32.000000,11.000000,111.000000,14.818182,35.000000,22.545455,27.090909,60.454545,27.090909,29.818182,32.590909,16.909091,28.500000,42.500000,17.500000,19.500000,18.000000,16.000000,2,0.866025,0.5,46,0.711657,0.702527,1644947995000000,-1.000000,-3.636103e-04,1,0.866025,0.5,18.144226


In [471]:
total.head()


Unnamed: 0,L10-AskPrice,L9-AskPrice,L8-AskPrice,L7-AskPrice,L6-AskPrice,L5-AskPrice,L4-AskPrice,L3-AskPrice,L2-AskPrice,L1-AskPrice,...,month_cos,day,day_sin,day_cos,microseconds,microseconds_sin,microseconds_cos,day_of_week,day_of_week_sin,day_of_week_cos
0,18.844967,18.829321,18.811384,18.798303,18.793638,18.787349,18.779207,18.763088,18.750056,18.742152,...,6.123234000000001e-17,77,0.970064,0.24285,1647592200000000,0.793353,-0.608761,4,-0.866025,-0.5
1,18.81,18.8,18.79,18.78,18.77,18.76,18.75,18.74,18.73,18.72,...,6.123234000000001e-17,77,0.970064,0.24285,1647592205000000,0.793132,-0.60905,4,-0.866025,-0.5
2,18.81699,18.805,18.798254,18.786667,18.776984,18.766786,18.757943,18.747064,18.736158,18.729375,...,6.123234000000001e-17,77,0.970064,0.24285,1647592210000000,0.79291,-0.609338,4,-0.866025,-0.5
3,18.818738,18.807548,18.79933,18.788571,18.778785,18.768996,18.758395,18.74903,18.739183,18.728966,...,6.123234000000001e-17,77,0.970064,0.24285,1647592215000000,0.792689,-0.609626,4,-0.866025,-0.5
4,18.82,18.81,18.8,18.79,18.78,18.77,18.76,18.75,18.74,18.73,...,6.123234000000001e-17,77,0.970064,0.24285,1647592220000000,0.792467,-0.609915,4,-0.866025,-0.5


## Code: make file

In [30]:
import datetime
import numpy as np
import pandas as pd
import datetime
from collections import deque
import os


def make_parquet(  # 2021-2-10 to 2022-03-25
    begin_year,
    begin_month,
    begin_day,
    end_year,
    end_month,
    end_day,
    folder="./data",
    save_folder=".data_new",
    ticker="SB",
):

    os.makedirs(f"{folder}/{ticker}/npy/", exist_ok=True)

    cols = ["Date-Time"]
    for k in range(1, 11):
        cols += [
            "L%s-AskPrice" % k,
            "L%s-AskSize" % k,
            "L%s-BidPrice" % k,
            "L%s-BidSize" % k,
        ]

    file_list = []
    for file in os.listdir(f"{folder}/{ticker}/raw/"):
        year, month, day = map(
            lambda x: int(x),
            file.split(sep="_", maxsplit=-1)[1].split(sep=".")[0].split("-"),
        )
        currentDateTime = datetime.datetime(year, month, day)
        file_list.append(currentDateTime)

    # running_queue = deque([]) # 5 days of running average

    df_list = []

    for i, dt in enumerate(file_list):
        if (datetime.datetime(begin_year, begin_month, begin_day) <= dt) and (
            dt <= datetime.datetime(end_year, end_month, end_day)
        ):

            d = pd.read_csv(
                f"{folder}/{ticker}/raw/{ticker}_{dt.year}-{dt.month:02d}-{dt.day:02d}.csv"
            )

            print(f"The {i}th file for {dt} has length {len(d)}\n")

            if len(d) < 1000:
                print(
                    f"The {i}th document for {dt} has few observation and will be skipped\n"
                )
                continue

            total = day_csv_transform(d)

            df_list.append(total)

    df = pd.concat(df_list, axis=0)
    os.makedirs(
        f"{save_folder}/{ticker}_{begin_year}{begin_month:02d}{begin_day:02d}_{end_year}{end_month:02d}{end_day:02d}",
        exist_ok=True,
    )
    df.to_parquet(
        f"{save_folder}/{ticker}_{begin_year}{begin_month:02d}{begin_day:02d}_{end_year}{end_month:02d}{end_day:02d}/df.parquet"
    )

    return df
    

In [457]:
pa2 = make_parquet(
    2021,
    6,
    1,
    2021,
    6,
    2,
    folder="../data",
    save_folder="../data_new",
)
pa2.head()


The 76th file for 2021-06-01 00:00:00 has length 36503

The 77th file for 2021-06-02 00:00:00 has length 30654



Unnamed: 0,L10-AskPrice,L9-AskPrice,L8-AskPrice,L7-AskPrice,L6-AskPrice,L5-AskPrice,L4-AskPrice,L3-AskPrice,L2-AskPrice,L1-AskPrice,...,month_cos,day,day_sin,day_cos,microseconds,microseconds_sin,microseconds_cos,day_of_week,day_of_week_sin,day_of_week_cos
0,17.829396,17.815975,17.836494,17.828863,17.785787,17.781184,17.78819,17.75305,17.742789,17.73086,...,-1.0,153,0.486273,-0.873807,1622619000000000,0.92388,-0.382683,2,0.866025,-0.5
1,17.815526,17.805864,17.797943,17.786774,17.771573,17.76875,17.755833,17.745238,17.734706,17.727676,...,-1.0,153,0.486273,-0.873807,1622619005000000,0.92374,-0.383019,2,0.866025,-0.5
2,17.818644,17.808667,17.799728,17.788947,17.776667,17.76977,17.759277,17.749155,17.738644,17.729304,...,-1.0,153,0.486273,-0.873807,1622619010000000,0.923601,-0.383355,2,0.866025,-0.5
3,17.822895,17.809697,17.800714,17.7928,17.778533,17.770135,17.762381,17.750714,17.741406,17.732136,...,-1.0,153,0.486273,-0.873807,1622619015000000,0.923462,-0.383691,2,0.866025,-0.5
4,17.815455,17.806154,17.799153,17.785455,17.773902,17.768824,17.757024,17.7475,17.736857,17.728881,...,-1.0,153,0.486273,-0.873807,1622619020000000,0.923322,-0.384027,2,0.866025,-0.5


In [473]:
pa = pd.read_parquet("../data_new/SB_20210601_20210602/df.parquet")


In [131]:
pa.head()


NameError: name 'pa' is not defined

## Code: Split

In [462]:
def split_parquet(folder_path, file_name, train_ratio=0.7, valid_ratio=0.2, save=True):
    df = pd.read_parquet(f"{folder_path}/{file_name}")
    n = len(df)
    train_df = df[0 : int(n * train_ratio)]
    valid_df = df[int(n * train_ratio) : int(n * (train_ratio + valid_ratio))]
    test_df = df[int(n * (train_ratio + valid_ratio)) :]
    if save:
        train_df.to_parquet(f"{folder_path}/train_df.parquet")
        valid_df.to_parquet(f"{folder_path}/valid_df.parquet")
        test_df.to_parquet(f"{folder_path}/test_df.parquet")
    return train_df, valid_df, test_df


In [474]:
train_df, valid_df, test_df = split_parquet(
    "../data_new/SB_20210601_20210602/", "df.parquet"
)


In [475]:
train_df.head()


Unnamed: 0,L10-AskPrice,L9-AskPrice,L8-AskPrice,L7-AskPrice,L6-AskPrice,L5-AskPrice,L4-AskPrice,L3-AskPrice,L2-AskPrice,L1-AskPrice,...,month_cos,day,day_sin,day_cos,microseconds,microseconds_sin,microseconds_cos,day_of_week,day_of_week_sin,day_of_week_cos
0,17.829396,17.815975,17.836494,17.828863,17.785787,17.781184,17.78819,17.75305,17.742789,17.73086,...,-1.0,153,0.486273,-0.873807,1622619000000000,0.92388,-0.382683,2,0.866025,-0.5
1,17.815526,17.805864,17.797943,17.786774,17.771573,17.76875,17.755833,17.745238,17.734706,17.727676,...,-1.0,153,0.486273,-0.873807,1622619005000000,0.92374,-0.383019,2,0.866025,-0.5
2,17.818644,17.808667,17.799728,17.788947,17.776667,17.76977,17.759277,17.749155,17.738644,17.729304,...,-1.0,153,0.486273,-0.873807,1622619010000000,0.923601,-0.383355,2,0.866025,-0.5
3,17.822895,17.809697,17.800714,17.7928,17.778533,17.770135,17.762381,17.750714,17.741406,17.732136,...,-1.0,153,0.486273,-0.873807,1622619015000000,0.923462,-0.383691,2,0.866025,-0.5
4,17.815455,17.806154,17.799153,17.785455,17.773902,17.768824,17.757024,17.7475,17.736857,17.728881,...,-1.0,153,0.486273,-0.873807,1622619020000000,0.923322,-0.384027,2,0.866025,-0.5


In [468]:
len(pa2), len(train_df), len(valid_df), len(test_df)


(6818, 4772, 1364, 682)

In [467]:
 len(train_df) + len(valid_df) +len(test_df)

6818

## Code: dataset

In [105]:
class Dataset(Dataset):
    """Characterizes a dataset for PyTorch"""

    def __init__(self, df, input_width, shift, label_width, stride=1):
        self.input_width = (
            input_width  # input_width: # of time steps that are fed into the model
            # input_width_p1 + input_width_p2
        )
        self.shift = shift  # shift: # of timesteps separating the input and the (final) predictions
        self.label_width = (
            label_width  # label_width: # of time steps in the predictions
        )

        self.window_size = self.input_width + self.shift
        self.label_start = self.window_size - self.label_width

        self.length = df.shape[0]
        self.input_slice = slice(0, self.input_width)
        self.label_slice = slice(self.label_start, None)

        self.mask_slice = None
        if self.shift != self.label_width:
            self.mask_slice = slice(self.input_width, self.label_start)

        self.stride = stride

        # splits = [total[i:i+self.window_size] for i in range(0,self.length - self.window_size + 1,self.stride)]
        inputs = [
            df[i : i + self.input_width]
            for i in range(0, self.length - self.window_size + 1, self.stride)
        ]
        labels = [
            df[i + self.label_start : i + self.window_size]
            for i in range(0, self.length - self.window_size + 1, self.stride)
        ]

        inputs_tensor = torch.from_numpy(
            np.concatenate(np.expand_dims(inputs, axis=0), axis=0)
        ).to(dtype=torch.float)
        labels_tensor = torch.from_numpy(
            np.concatenate(np.expand_dims(labels, axis=0), axis=0)
        ).to(dtype=torch.float)

        self.X = inputs_tensor[:,:,:-1] # mid_price not included
        self.y = labels_tensor[:,:,:-1]
        self.target = labels_tensor[:,:,-1]

    def __len__(self):
        """Denotes the total number of samples"""
        return len(self.X)

    def __getitem__(self, index):
        """Generates samples of data"""
        return self.X[index], self.y[index], self.target[index]


In [151]:
ds = Dataset(total_demo, input_width=120, shift=24, label_width=24)
len(ds)

5978

In [152]:
x,y,target = ds[0]
len(x),len(y),len(target)

(120, 24, 24)

In [153]:
# missing value taken care of
# total_demo.iloc[22,:]
# total_demo.iloc[44,:]

In [159]:
x[44],y[22], target[22]

(tensor([ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          3.0000e+00,  1.0000e+00,  6.1232e-17,  7.7000e+01,  9.7006e-01,
          2.4285e-01,  1.6476e+15,  7.8351e-01, -6.2138e-01,  4.0000e+00,
         -8.6603e-01, -5.0000e-01]),
 tensor([ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
 

In [154]:
x.shape, y.shape, target

(torch.Size([120, 52]),
 torch.Size([24, 52]),
 tensor([18.7030, 18.6924, 18.6859, 18.6823, 18.6960,  0.0000, 18.6906, 18.6977,
          0.0000, 18.6877, 18.6810,  0.0000,  0.0000, 18.6872, 18.6965, 18.6904,
         18.6890, 18.6867,  0.0000, 18.6927, 18.6804, 18.6820,  0.0000,  0.0000]))

In [None]:
from torch.utils.data import DataLoader

dl = DataLoader(ds, batch_size=32)

## Code: Scaler

In [39]:
from sklearn.preprocessing import MinMaxScaler


In [40]:
minmax = MinMaxScaler()
total


Unnamed: 0,L10-AskPrice,L9-AskPrice,L8-AskPrice,L7-AskPrice,L6-AskPrice,L5-AskPrice,L4-AskPrice,L3-AskPrice,L2-AskPrice,L1-AskPrice,L1-BidPrice,L2-BidPrice,L3-BidPrice,L4-BidPrice,L5-BidPrice,L6-BidPrice,L7-BidPrice,L8-BidPrice,L9-BidPrice,L10-BidPrice,L10-AskSize,L9-AskSize,L8-AskSize,L7-AskSize,L6-AskSize,L5-AskSize,L4-AskSize,L3-AskSize,L2-AskSize,L1-AskSize,L1-BidSize,L2-BidSize,L3-BidSize,L4-BidSize,L5-BidSize,L6-BidSize,L7-BidSize,L8-BidSize,L9-BidSize,L10-BidSize,month,month_sin,month_cos,day,day_sin,day_cos,microseconds,microseconds_sin,microseconds_cos,day_of_week,day_of_week_sin,day_of_week_cos
0,18.351792,18.336760,18.332585,18.321419,18.307796,18.302264,18.298545,18.298221,18.294719,18.271596,18.235972,18.212182,18.205479,18.189478,18.170661,18.169059,18.169002,18.139942,18.125620,18.135428,39.506667,40.986667,36.680000,36.746667,53.786667,62.026667,30.800000,27.280000,19.920000,6.600000,4.800000,2.933333,1.946667,1.786667,3.026667,4.533333,6.280000,4.586667,4.840000,5.453333,2,0.866025,0.5,46,0.711657,0.702527,1644913800000000,0.793353,-6.087614e-01,1,0.866025,0.5
1,18.338640,18.328158,18.314154,18.307520,18.299840,18.291470,18.280758,18.269671,18.259620,18.250132,18.232031,18.222500,18.214412,18.204757,18.193158,18.184493,18.173077,18.161875,18.150465,18.146162,34.217391,31.391304,40.086957,37.695652,114.434783,30.173913,11.478261,6.608696,3.434783,3.304348,2.782609,4.173913,5.913043,4.478261,3.304348,3.000000,2.260870,2.782609,3.739130,4.304348,2,0.866025,0.5,46,0.711657,0.702527,1644913805000000,0.793132,-6.090499e-01,1,0.866025,0.5
2,18.333197,18.323000,18.312810,18.300981,18.296569,18.284963,18.273774,18.264293,18.254167,18.245914,18.226242,18.218204,18.210641,18.201180,18.192115,18.179619,18.169855,18.157130,18.147926,18.141389,31.900000,31.000000,33.100000,104.000000,61.200000,18.066667,10.333333,6.133333,3.600000,3.100000,5.500000,5.566667,5.200000,5.366667,3.466667,3.500000,2.300000,3.600000,4.500000,4.800000,2,0.866025,0.5,46,0.711657,0.702527,1644913810000000,0.792910,-6.093382e-01,1,0.866025,0.5
3,18.338187,18.327898,18.317952,18.305037,18.299470,18.288955,18.278131,18.268696,18.258727,18.249941,18.230000,18.220000,18.210000,18.200000,18.190000,18.180000,18.170000,18.160000,18.150000,18.140000,34.200000,31.400000,33.200000,54.000000,113.200000,26.800000,10.700000,9.200000,5.500000,34.000000,9.500000,6.000000,6.000000,4.000000,6.000000,4.000000,4.000000,1.000000,9.000000,4.000000,2,0.866025,0.5,46,0.711657,0.702527,1644913815000000,0.792689,-6.096265e-01,1,0.866025,0.5
4,18.337384,18.327014,18.317082,18.303881,18.299178,18.289375,18.275556,18.268065,18.257500,18.249623,18.234167,18.225909,18.215714,18.206667,18.194706,18.186667,18.175714,18.168421,18.151290,18.147500,33.857143,31.571429,33.285714,62.571429,104.285714,22.857143,6.428571,8.857143,5.714286,7.571429,6.857143,6.285714,6.000000,5.142857,4.857143,5.142857,4.000000,2.714286,4.428571,6.857143,2,0.866025,0.5,46,0.711657,0.702527,1644913820000000,0.792467,-6.099147e-01,1,0.866025,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6836,18.235789,18.222500,18.216598,18.200561,18.198409,18.182059,18.175070,18.163498,18.151851,18.147776,18.130594,18.123316,18.113466,18.105119,18.092070,18.085333,18.077216,18.061364,18.058636,18.043077,22.800000,25.600000,19.400000,78.500000,52.800000,27.200000,28.400000,26.300000,60.500000,53.500000,20.200000,38.600000,50.200000,29.500000,45.400000,45.000000,19.400000,26.400000,17.600000,5.200000,2,0.866025,0.5,46,0.711657,0.702527,1644947980000000,-0.999999,-1.454441e-03,1,0.866025,0.5
6837,18.240000,18.230000,18.220000,18.210000,18.200000,18.190000,18.180000,18.170000,18.160000,18.150000,18.140000,18.130000,18.120000,18.110000,18.100000,18.090000,18.080000,18.070000,18.060000,18.050000,33.000000,16.000000,32.000000,11.000000,111.000000,14.000000,36.000000,23.000000,28.000000,103.666667,5.666667,31.000000,39.833333,37.000000,23.000000,60.000000,35.000000,9.000000,38.000000,4.000000,2,0.866025,0.5,46,0.711657,0.702527,1644947985000000,-0.999999,-1.090831e-03,1,0.866025,0.5
6838,18.240000,18.230000,18.220000,18.210000,18.200000,18.190000,18.180000,18.170000,18.160000,18.150000,18.140000,18.130000,18.120000,18.110000,18.100000,18.090000,18.080000,18.070000,18.060000,18.048235,33.000000,16.000000,32.000000,11.000000,111.000000,13.800000,35.700000,22.900000,28.000000,95.100000,7.300000,28.100000,36.500000,35.500000,19.900000,58.500000,33.500000,7.800000,36.800000,3.400000,2,0.866025,0.5,46,0.711657,0.702527,1644947990000000,-1.000000,-7.272205e-04,1,0.866025,0.5
6839,18.240000,18.230000,18.220000,18.210000,18.200000,18.190000,18.180000,18.170000,18.160000,18.150000,18.131342,18.124299,18.115216,18.109409,18.090351,18.086471,18.078571,18.061282,18.058889,18.030625,33.000000,16.000000,32.000000,11.000000,111.000000,14.818182,35.000000,22.545455,27.090909,60.454545,27.090909,29.818182,32.590909,16.909091,28.500000,42.500000,17.500000,19.500000,18.000000,16.000000,2,0.866025,0.5,46,0.711657,0.702527,1644947995000000,-1.000000,-3.636103e-04,1,0.866025,0.5


## demo: get data

## demo: get grouped data

In [66]:
# return lst_price, lst_size, lst_price_buy, price_cols, size_cols

p_df = pd.concat(lst_price, axis=1, keys=price_cols)
v_df = pd.concat(lst_size, axis=1, keys=size_cols)

p_df.reset_index(inplace=True)
v_df.reset_index(inplace=True)


In [145]:
time_features = get_time_features(p_df)

total_demo = pd.concat(
    [p_df.drop(["time"], axis=1), v_df.drop(["time"], axis=1), time_features, macro_midprice],
    axis=1,
)

In [146]:
total_demo.head()

Unnamed: 0,L10-AskPrice,L9-AskPrice,L8-AskPrice,L7-AskPrice,L6-AskPrice,L5-AskPrice,L4-AskPrice,L3-AskPrice,L2-AskPrice,L1-AskPrice,L1-BidPrice,L2-BidPrice,L3-BidPrice,L4-BidPrice,L5-BidPrice,L6-BidPrice,L7-BidPrice,L8-BidPrice,L9-BidPrice,L10-BidPrice,L10-AskSize,L9-AskSize,L8-AskSize,L7-AskSize,L6-AskSize,L5-AskSize,L4-AskSize,L3-AskSize,L2-AskSize,L1-AskSize,L1-BidSize,L2-BidSize,L3-BidSize,L4-BidSize,L5-BidSize,L6-BidSize,L7-BidSize,L8-BidSize,L9-BidSize,L10-BidSize,month,month_sin,month_cos,day,day_sin,day_cos,microseconds,microseconds_sin,microseconds_cos,day_of_week,day_of_week_sin,day_of_week_cos,midprice
0,18.844967,18.829321,18.811384,18.798303,18.793638,18.787349,18.779207,18.763088,18.750056,18.742152,18.700078,18.699773,18.689778,18.669096,18.668411,18.652533,18.648517,18.64476,18.620852,18.617599,19.169014,40.042254,34.197183,27.887324,19.28169,15.352113,13.84507,8.802817,7.549296,5.366197,5.450704,6.211268,6.338028,12.309859,8.774648,25.408451,45.676056,28.732394,18.507042,22.056338,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647592200000000,0.793353,-0.608761,4,-0.866025,-0.5,18.720951
1,18.81,18.8,18.79,18.78,18.77,18.76,18.75,18.74,18.73,18.72,18.703197,18.695104,18.685216,18.674161,18.66551,18.651223,18.648807,18.636244,18.622686,18.616429,26.0,52.0,21.828571,20.171429,13.057143,18.714286,9.2,10.628571,10.2,6.685714,3.485714,9.571429,8.6,11.742857,12.6,45.314286,53.628571,12.171429,20.0,26.885714,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647592205000000,0.793132,-0.60905,4,-0.866025,-0.5,18.714242
2,18.81699,18.805,18.798254,18.786667,18.776984,18.766786,18.757943,18.747064,18.736158,18.729375,18.712419,18.7019,18.691731,18.683864,18.671662,18.662127,18.650804,18.646744,18.633684,18.620842,28.791667,34.666667,42.0,22.0,21.0,18.666667,14.583333,9.083333,8.458333,5.333333,5.166667,8.333333,13.0,11.0,13.541667,16.458333,70.958333,32.25,9.5,23.75,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647592210000000,0.79291,-0.609338,4,-0.866025,-0.5,18.721032
3,18.818738,18.807548,18.79933,18.788571,18.778785,18.768996,18.758395,18.74903,18.739183,18.728966,18.706784,18.698356,18.688116,18.677707,18.662847,18.656874,18.650468,18.643781,18.627147,18.614584,29.428571,29.714286,46.892857,22.0,21.464286,17.428571,12.464286,11.785714,7.428571,5.178571,7.107143,8.035714,11.75,20.25,24.464286,29.821429,58.0,30.607143,12.642857,27.5,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647592215000000,0.792689,-0.609626,4,-0.866025,-0.5,18.716134
4,18.82,18.81,18.8,18.79,18.78,18.77,18.76,18.75,18.74,18.73,18.72,18.71,18.7,18.69,18.68,18.67,18.66,18.65,18.64,18.63,30.0,28.0,51.0,22.0,22.0,20.0,51.0,10.0,8.0,7.25,3.5,5.25,9.0,11.0,25.0,14.0,28.0,87.0,16.0,8.0,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647592220000000,0.792467,-0.609915,4,-0.866025,-0.5,18.726744


In [None]:
def 

## demo: get timestamp

In [375]:
df.reset_index(inplace=True)


In [381]:
# https://stackoverflow.com/questions/40881876/python-pandas-convert-datetime-to-timestamp-effectively-through-dt-accessor

# ~ Two ways to get the epoch time (POSIX, since 1970)


ts_micro = df["time"].values.astype(np.int64) // 10**3
ts_micro = pd.Series(ts_micro)
ts_micro.head()


0    1647592200207854
1    1647592200208980
2    1647592200216796
3    1647592200216796
4    1647592200216796
dtype: int64

In [319]:
datetime.datetime.timestamp(p_df.iloc[0, 0]) * 1e6  # seconds to microseconds


1647592200000000.0

In [131]:
# ~ this solves (micro)seconds in a day
seconds_in_day = 24 * 60 * 60
microseconds_in_day = 24 * 60 * 60 * 1e6
microseconds_in_day


86400000000.0

In [188]:
# check this against the later computation
np.sin(2 * np.pi * temp_ts / microseconds_in_day)


array([ 0.79335334,  0.79313194,  0.79291043, ..., -0.96573735,
       -0.96583165, -0.96592583])

In [387]:
def encode_withSeries(series, max_val, set_name_to):
    a = np.sin(2 * np.pi * series / max_val)
    b = np.cos(2 * np.pi * series / max_val)
    return pd.concat(
        [series, a, b],
        axis=1,
        keys=[set_name_to, set_name_to + "_sin", set_name_to + "_cos"],
    )


s1 = encode_withSeries(df["time"].dt.month, 12, "month")
s1.head()


Unnamed: 0,month,month_sin,month_cos
0,3,1.0,6.123234000000001e-17
1,3,1.0,6.123234000000001e-17
2,3,1.0,6.123234000000001e-17
3,3,1.0,6.123234000000001e-17
4,3,1.0,6.123234000000001e-17


In [378]:
# ~ great: we can broadcast the max value!!! This solves the problem of day of a year
s2 = encode_withSeries(
    df["time"].dt.day_of_year, df["time"].dt.is_leap_year.astype(int) + 365, "day"
)
s2.head()


Unnamed: 0,day,day_sin,day_cos
0,77,0.970064,0.24285
1,77,0.970064,0.24285
2,77,0.970064,0.24285
3,77,0.970064,0.24285
4,77,0.970064,0.24285


In [386]:
s3 = encode_withSeries(ts_micro, microseconds_in_day, "microseconds")
s3.head()


Unnamed: 0,microseconds,microseconds_sin,microseconds_cos
0,1647592200207854,0.793344,-0.608773
1,1647592200208980,0.793344,-0.608773
2,1647592200216796,0.793344,-0.608774
3,1647592200216796,0.793344,-0.608774
4,1647592200216796,0.793344,-0.608774


In [385]:
s4 = encode_withSeries(df["time"].dt.day_of_week, 6, "day_of_week")
s4.head()


Unnamed: 0,day_of_week,day_of_week_sin,day_of_week_cos
0,4,-0.866025,-0.5
1,4,-0.866025,-0.5
2,4,-0.866025,-0.5
3,4,-0.866025,-0.5
4,4,-0.866025,-0.5


In [432]:
# ~! Important: make sure the indexes are the same,
# or it will create error when you combine a df with a newly created series
# even if the underlying data are the same

# solution: reset_index after dropna()
pd.concat([s1, s2, s3, s4], axis=1).tail()  #


Unnamed: 0,month,month_sin,month_cos,day,day_sin,day_cos,microseconds,microseconds_sin,microseconds_cos,day_of_week,day_of_week_sin,day_of_week_cos
25054,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622800149927,-0.965929,-0.258809,4,-0.866025,-0.5
25055,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622800150224,-0.965929,-0.258808,4,-0.866025,-0.5
25056,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622800150224,-0.965929,-0.258808,4,-0.866025,-0.5
25057,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622800150224,-0.965929,-0.258808,4,-0.866025,-0.5
25058,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622800150224,-0.965929,-0.258808,4,-0.866025,-0.5


In [433]:
pd.concat([s1, s2, s3, s4], axis=1).tail()  #


Unnamed: 0,month,month_sin,month_cos,day,day_sin,day_cos,microseconds,microseconds_sin,microseconds_cos,day_of_week,day_of_week_sin,day_of_week_cos
25054,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622800149927,-0.965929,-0.258809,4,-0.866025,-0.5
25055,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622800150224,-0.965929,-0.258808,4,-0.866025,-0.5
25056,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622800150224,-0.965929,-0.258808,4,-0.866025,-0.5
25057,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622800150224,-0.965929,-0.258808,4,-0.866025,-0.5
25058,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622800150224,-0.965929,-0.258808,4,-0.866025,-0.5


In [412]:
# demo
def encode(data, col, max_val, time_col="time", inplace=True):
    if inplace == False:
        data = data.copy()
    data[col + "_sin"] = np.sin(2 * np.pi * data[col] / max_val)
    data[col + "_cos"] = np.cos(2 * np.pi * data[col] / max_val)
    return data


def encode_withSeries(series, max_val, set_name_to):
    a = np.sin(2 * np.pi * series / max_val)
    b = np.cos(2 * np.pi * series / max_val)
    return pd.concat(
        [series, a, b],
        axis=1,
        keys=[set_name_to, set_name_to + "_sin", set_name_to + "_cos"],
    )


def get_time_features(df, time_col="time", coef=10**3, inplace=True):
    # ! these are for grouped subjuects
    # df['year'] = df[time_col].dt.year
    # df['month'] = df[time_col].dt.month
    # df['day'] = df[time_col].dt.day
    # df['weekday'] = df[time_col].dt.weekday
    # df['hour'] = df['time'].dt.hour
    # df['minute'] = df['time'].dt.minute
    # df['second'] = df['time'].dt.second
    # df['microsecond'] = df['time'].dt.microsecond

    ts_micro = (
        df["time"].values.astype(np.int64) // coef
    )  #  nanosec to microsec, divided by 10**3
    ts_micro = pd.Series(ts_micro)

    microseconds_in_day = 24 * 60 * 60 * 1e6

    s0 = df["time"]
    s1 = encode_withSeries(df["time"].dt.month, 12, "month")
    s2 = encode_withSeries(
        df["time"].dt.day_of_year, df["time"].dt.is_leap_year.astype(int) + 365, "day"
    )
    s3 = encode_withSeries(ts_micro, microseconds_in_day, "microseconds")
    s4 = encode_withSeries(df["time"].dt.day_of_week, 6, "day_of_week")

    # df = encode(df, 'ts_micro', max_val=microseconds_in_day,inplace=inplace)

    #     df = encode(df, 'month', 12)
    #     df = encode(df, 'day', 31)
    #     df = encode(df, 'weekday', 6)
    #     df = encode(df, 'hour', 24)
    #     df = encode(df, 'minute', 60)
    #     df = encode(df, 'second', 60)
    #     df = encode(df, 'microsecond', 1000000)

    # return df.drop(['ts_micro',],axis=1)
    return pd.concat([s0, s1, s2, s3, s4], axis=1)


In [431]:
time_features = get_time_features(p_df)

time_features.tail()


Unnamed: 0,time,month,month_sin,month_cos,day,day_sin,day_cos,microseconds,microseconds_sin,microseconds_cos,day_of_week,day_of_week_sin,day_of_week_cos
6116,2022-03-18 16:59:40+00:00,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622780000000,-0.965548,-0.260224,4,-0.866025,-0.5
6117,2022-03-18 16:59:45+00:00,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622785000000,-0.965643,-0.259873,4,-0.866025,-0.5
6118,2022-03-18 16:59:50+00:00,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622790000000,-0.965737,-0.259521,4,-0.866025,-0.5
6119,2022-03-18 16:59:55+00:00,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622795000000,-0.965832,-0.25917,4,-0.866025,-0.5
6120,2022-03-18 17:00:00+00:00,3,1.0,6.123234000000001e-17,77,0.970064,0.24285,1647622800000000,-0.965926,-0.258819,4,-0.866025,-0.5


In [430]:
pd.concat([p_df, v_df, time_features], axis=1).tail()


Unnamed: 0,time,L10-AskPrice,L9-AskPrice,L8-AskPrice,L7-AskPrice,L6-AskPrice,L5-AskPrice,L4-AskPrice,L3-AskPrice,L2-AskPrice,...,month_cos,day,day_sin,day_cos,microseconds,microseconds_sin,microseconds_cos,day_of_week,day_of_week_sin,day_of_week_cos
6116,2022-03-18 16:59:40+00:00,19.050608,19.041384,19.031209,19.020886,19.011115,19.000285,18.993697,18.982266,18.972895,...,6.123234000000001e-17,77,0.970064,0.24285,1647622780000000,-0.965548,-0.260224,4,-0.866025,-0.5
6117,2022-03-18 16:59:45+00:00,19.049353,19.039232,19.028968,19.019187,19.007253,18.999814,18.989652,18.979565,18.96934,...,6.123234000000001e-17,77,0.970064,0.24285,1647622785000000,-0.965643,-0.259873,4,-0.866025,-0.5
6118,2022-03-18 16:59:50+00:00,19.038618,19.028133,19.018496,19.005689,18.999641,18.989528,18.979106,18.968958,18.9575,...,6.123234000000001e-17,77,0.970064,0.24285,1647622790000000,-0.965737,-0.259521,4,-0.866025,-0.5
6119,2022-03-18 16:59:55+00:00,19.033443,19.022683,19.013197,19.001,18.996932,18.986275,18.975021,18.964345,18.951471,...,6.123234000000001e-17,77,0.970064,0.24285,1647622795000000,-0.965832,-0.25917,4,-0.866025,-0.5
6120,2022-03-18 17:00:00+00:00,19.118914,19.1032,19.070443,19.078934,19.009558,19.051069,19.035979,19.018128,19.015283,...,6.123234000000001e-17,77,0.970064,0.24285,1647622800000000,-0.965926,-0.258819,4,-0.866025,-0.5


## Details: Weighted mean

- Weighted mean over time is enough



## Details: Function factory

## Details: Dataset

In [4]:
import torch


In [134]:
label_columns = total_demo.columns
 
if label_columns is not None:
    label_columns_indices = {name: i for i, name in enumerate(label_columns)}
column_indices = {name: i for i, name in enumerate(total.columns)}

In [503]:
input_width = 1440
label_width = 120
shift = 120

total_window_size = input_width + shift

input_slice = slice(0, input_width)
input_indices = np.arange(total_window_size)[input_slice]

label_start = total_window_size - label_width
labels_slice = slice(label_start, None)
label_indices = np.arange(total_window_size)[labels_slice]


In [511]:
total_window_size, label_start


(1560, 1440)

In [510]:
input_slice, labels_slice


(slice(0, 1440, None), slice(1440, None, None))

In [507]:
input_indices


array([   0,    1,    2, ..., 1437, 1438, 1439])

In [509]:
label_indices


array([1440, 1441, 1442, 1443, 1444, 1445, 1446, 1447, 1448, 1449, 1450,
       1451, 1452, 1453, 1454, 1455, 1456, 1457, 1458, 1459, 1460, 1461,
       1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470, 1471, 1472,
       1473, 1474, 1475, 1476, 1477, 1478, 1479, 1480, 1481, 1482, 1483,
       1484, 1485, 1486, 1487, 1488, 1489, 1490, 1491, 1492, 1493, 1494,
       1495, 1496, 1497, 1498, 1499, 1500, 1501, 1502, 1503, 1504, 1505,
       1506, 1507, 1508, 1509, 1510, 1511, 1512, 1513, 1514, 1515, 1516,
       1517, 1518, 1519, 1520, 1521, 1522, 1523, 1524, 1525, 1526, 1527,
       1528, 1529, 1530, 1531, 1532, 1533, 1534, 1535, 1536, 1537, 1538,
       1539, 1540, 1541, 1542, 1543, 1544, 1545, 1546, 1547, 1548, 1549,
       1550, 1551, 1552, 1553, 1554, 1555, 1556, 1557, 1558, 1559])

In [513]:
inputs = total.iloc[input_indices, :]
inputs


Unnamed: 0,L10-AskPrice,L9-AskPrice,L8-AskPrice,L7-AskPrice,L6-AskPrice,L5-AskPrice,L4-AskPrice,L3-AskPrice,L2-AskPrice,L1-AskPrice,...,month_cos,day,day_sin,day_cos,microseconds,microseconds_sin,microseconds_cos,day_of_week,day_of_week_sin,day_of_week_cos
0,18.844967,18.829321,18.811384,18.798303,18.793638,18.787349,18.779207,18.763088,18.750056,18.742152,...,6.123234e-17,77,0.970064,0.24285,1647592200000000,0.793353,-0.608761,4,-0.866025,-0.5
1,18.810000,18.800000,18.790000,18.780000,18.770000,18.760000,18.750000,18.740000,18.730000,18.720000,...,6.123234e-17,77,0.970064,0.24285,1647592205000000,0.793132,-0.609050,4,-0.866025,-0.5
2,18.816990,18.805000,18.798254,18.786667,18.776984,18.766786,18.757943,18.747064,18.736158,18.729375,...,6.123234e-17,77,0.970064,0.24285,1647592210000000,0.792910,-0.609338,4,-0.866025,-0.5
3,18.818738,18.807548,18.799330,18.788571,18.778785,18.768996,18.758395,18.749030,18.739183,18.728966,...,6.123234e-17,77,0.970064,0.24285,1647592215000000,0.792689,-0.609626,4,-0.866025,-0.5
4,18.820000,18.810000,18.800000,18.790000,18.780000,18.770000,18.760000,18.750000,18.740000,18.730000,...,6.123234e-17,77,0.970064,0.24285,1647592220000000,0.792467,-0.609915,4,-0.866025,-0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,6.123234e-17,77,0.970064,0.24285,1647599375000000,0.384362,-0.923182,4,-0.866025,-0.5
1436,18.820000,18.810000,18.800000,18.790000,18.780000,18.770000,18.760000,18.750000,18.740000,18.730000,...,6.123234e-17,77,0.970064,0.24285,1647599380000000,0.384027,-0.923322,4,-0.866025,-0.5
1437,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,6.123234e-17,77,0.970064,0.24285,1647599385000000,0.383691,-0.923462,4,-0.866025,-0.5
1438,18.820000,18.810000,18.800000,18.790000,18.780000,18.770000,18.760000,18.750000,18.740000,18.730000,...,6.123234e-17,77,0.970064,0.24285,1647599390000000,0.383355,-0.923601,4,-0.866025,-0.5


In [514]:
labels = total.iloc[label_indices, :]
labels


Unnamed: 0,L10-AskPrice,L9-AskPrice,L8-AskPrice,L7-AskPrice,L6-AskPrice,L5-AskPrice,L4-AskPrice,L3-AskPrice,L2-AskPrice,L1-AskPrice,...,month_cos,day,day_sin,day_cos,microseconds,microseconds_sin,microseconds_cos,day_of_week,day_of_week_sin,day_of_week_cos
1440,18.812353,18.800264,18.792424,18.780968,18.770574,18.761484,18.750300,18.741981,18.730258,18.724948,...,6.123234e-17,77,0.970064,0.24285,1647599400000000,0.382683,-0.923880,4,-0.866025,-0.5
1441,18.820000,18.810000,18.800000,18.790000,18.780000,18.770000,18.760000,18.750000,18.740000,18.730000,...,6.123234e-17,77,0.970064,0.24285,1647599405000000,0.382347,-0.924019,4,-0.866025,-0.5
1442,18.817547,18.802131,18.797619,18.785172,18.773784,18.766389,18.752364,18.747119,18.732073,18.729660,...,6.123234e-17,77,0.970064,0.24285,1647599410000000,0.382011,-0.924158,4,-0.866025,-0.5
1443,18.810000,18.800000,18.790000,18.780000,18.770000,18.760000,18.750000,18.740000,18.730000,18.720000,...,6.123234e-17,77,0.970064,0.24285,1647599415000000,0.381675,-0.924296,4,-0.866025,-0.5
1444,18.816723,18.801529,18.796809,18.784167,18.772887,18.765412,18.751711,18.746222,18.731485,18.728963,...,6.123234e-17,77,0.970064,0.24285,1647599420000000,0.381339,-0.924435,4,-0.866025,-0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555,18.780000,18.770000,18.760000,18.750000,18.740000,18.730000,18.720000,18.710000,18.700000,18.690000,...,6.123234e-17,77,0.970064,0.24285,1647599975000000,0.343728,-0.939069,4,-0.866025,-0.5
1556,18.780000,18.770000,18.760000,18.750000,18.740000,18.730000,18.720000,18.710000,18.700000,18.690000,...,6.123234e-17,77,0.970064,0.24285,1647599980000000,0.343387,-0.939194,4,-0.866025,-0.5
1557,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,6.123234e-17,77,0.970064,0.24285,1647599985000000,0.343045,-0.939319,4,-0.866025,-0.5
1558,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,6.123234e-17,77,0.970064,0.24285,1647599990000000,0.342703,-0.939444,4,-0.866025,-0.5


In [17]:
length = total.shape[0]

# input_width: # of time steps that are fed into the model
# label_width: # of time steps in the predictions
# shift: # of timesteps in the separating the input and the predictions
input_width = 120  # 12 * 60
label_width = 24  # 12 * 10
shift = 24

window_size = input_width + shift
label_start = window_size - label_width
labels_slice = slice(label_start, None)


label_start = window_size - label_width

assert input_width + label_width == window_size
input_slice = slice(0, input_width)
label_slice = slice(label_start, None)
stride = 1


In [18]:
# pd.to_datetime(total['microseconds'] * 10**3)


In [None]:
class Dataset(Dataset):
    """Characterizes a dataset for PyTorch"""

    def __init__(self, df, input_width, shift, label_width, stride=1):
        self.input_width = (
            input_width  # input_width: # of time steps that are fed into the model
            # input_width_p1 + input_width_p2
        )
        self.shift = shift  # shift: # of timesteps separating the input and the (final) predictions
        self.label_width = (
            label_width  # label_width: # of time steps in the predictions
        )

        self.window_size = self.input_width + self.shift
        self.label_start = self.window_size - self.label_width

        self.length = df.shape[0]
        self.input_slice = slice(0, self.input_width)
        self.label_slice = slice(self.label_start, None)

        self.mask_slice = None
        if self.shift != self.label_width:
            self.mask_slice = slice(self.input_width, self.label_start)

        self.stride = stride

        # splits = [total[i:i+self.window_size] for i in range(0,self.length - self.window_size + 1,self.stride)]
        inputs = [
            df[i : i + self.input_width]
            for i in range(0, self.length - self.window_size + 1, self.stride)
        ]
        labels = [
            df[i + self.label_start : i + self.window_size]
            for i in range(0, self.length - self.window_size + 1, self.stride)
        ]

        inputs_tensor = torch.from_numpy(
            np.concatenate(np.expand_dims(inputs, axis=0), axis=0)
        ).to(dtype=torch.float)
        labels_tensor = torch.from_numpy(
            np.concatenate(np.expand_dims(labels, axis=0), axis=0)
        ).to(dtype=torch.float)

        self.X = inputs_tensor[:,:,:-1] # mid_price not included
        self.y = labels_tensor[:,:,:-1]
        self.target = labels_tensor[:,:,-1]

    def __len__(self):
        """Denotes the total number of samples"""
        return len(self.X)

    def __getitem__(self, index):
        """Generates samples of data"""
        return self.X[index], self.y[index], self.target[index]

    # splits = [total[i:i+window_size] for i in range(0,length - window_size + 1,stride)]


In [19]:
a = [total[i : i + input_width] for i in range(0, length - window_size + 1, stride)]
b = [
    total[i + label_start : i + window_size]
    for i in range(0, length - window_size + 1, stride)
]


In [20]:
a1 = np.concatenate(np.expand_dims(a, axis=0), axis=0)
a1.shape


(6698, 120, 52)

In [22]:
b1 = np.concatenate(np.expand_dims(b, axis=0), axis=0)
b1.shape


(6698, 24, 52)

In [29]:
torch.from_numpy(a1).to(dtype=torch.float).size()


torch.Size([6698, 120, 52])

In [523]:
def plot(self, model=None, plot_col="traffic_volume", max_subplots=3):
    inputs, labels = self.sample_batch

    plt.figure(figsize=(12, 8))
    plot_col_index = self.column_indices[plot_col]
    max_n = min(max_subplots, len(inputs))

    for n in range(max_n):
        plt.subplot(3, 1, n + 1)
        plt.ylabel(f"{plot_col} [scaled]")
        plt.plot(
            self.input_indices,
            inputs[n, :, plot_col_index],
            label="Inputs",
            marker=".",
            zorder=-10,
        )

        if self.label_columns:
            label_col_index = self.label_columns_indices.get(plot_col, None)
        else:
            label_col_index = plot_col_index

        if label_col_index is None:
            continue

        plt.scatter(
            self.label_indices,
            labels[n, :, label_col_index],
            edgecolors="k",
            marker="s",
            label="Labels",
            c="green",
            s=64,
        )
        if model is not None:
            predictions = model(inputs)
            plt.scatter(
                self.label_indices,
                predictions[n, :, label_col_index],
                marker="X",
                edgecolors="k",
                label="Predictions",
                c="red",
                s=64,
            )

        if n == 0:
            plt.legend()

    plt.xlabel("Time (h)")


## Details: time

In [4]:
# 1000 microseconds is 1 millisecond; 1000 millisecond is 1 second
k = datetime.datetime(2022, 10, 30)

k_s = datetime.datetime.timestamp(k)
k_s_2 = datetime.datetime.fromtimestamp(k_s, datetime.timezone.utc)
k_s_3 = (
    k_s_2 - datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
).total_seconds()
print(k, k_s, k_s_2, k_s_3)


2022-10-30 00:00:00 1667102400.0 2022-10-30 04:00:00+00:00 1667102400.0


In [5]:
print(k.timestamp())
print((k - datetime.datetime(1970, 1, 1)) / datetime.timedelta(seconds=1))
print((k - datetime.datetime(1970, 1, 1)) / datetime.timedelta(microseconds=1))


1667102400.0
1667088000.0
1667088000000000.0


In [6]:
folder = "../data"
ticker = "SB"
# print(os.listdir(f'{folder}/{ticker}/raw/'))
full_list = []
for file in os.listdir(f"{folder}/{ticker}/raw/"):
    year, month, day = map(
        lambda x: int(x),
        file.split(sep="_", maxsplit=-1)[1].split(sep=".")[0].split("-"),
    )
    # print(year, month, day)
    currentDateTime = datetime.datetime(year, month, day)
    full_list.append(currentDateTime)
    # print(currentDateTime)
    # year = lambda x: x.year
    # month = lambda x: x.month
    # day = lambda x: x.day
    # print('Year - ',currentDateTime.year)
    # print('Month - ',month(currentDateTime))
    # print('Day - ',day(currentDateTime))

# for i, dt in enumerate(full_list):
#     print(f"{folder}/{ticker}/raw/{ticker}_{dt.year}-{dt.month}-{dt.day}.csv")

full_list[0:5]


[datetime.datetime(2021, 2, 10, 0, 0),
 datetime.datetime(2021, 2, 11, 0, 0),
 datetime.datetime(2021, 2, 12, 0, 0),
 datetime.datetime(2021, 2, 16, 0, 0),
 datetime.datetime(2021, 2, 17, 0, 0)]

In [7]:
import datetime

folder = "../data"
ticker = "SB"
# print(os.listdir(f'{folder}/{ticker}/raw/'))
full_list = []
for file in os.listdir(f"{folder}/{ticker}/raw/"):
    year, month, day = map(
        lambda x: int(x),
        file.split(sep="_", maxsplit=-1)[1].split(sep=".")[0].split("-"),
    )
    # print(year, month, day)
    currentDateTime = datetime.datetime(year, month, day)
    full_list.append(currentDateTime)
    # print(currentDateTime)
    # year = lambda x: x.year
    # month = lambda x: x.month
    # day = lambda x: x.day
    # print('Year - ',currentDateTime.year)
    # print('Month - ',month(currentDateTime))
    # print('Day - ',day(currentDateTime))

for i, dt in enumerate(full_list):
    if i < 3:
        print(f"{folder}/{ticker}/raw/{ticker}_{dt.year}-{dt.month}-{dt.day}.csv")


../data/SB/raw/SB_2021-2-10.csv
../data/SB/raw/SB_2021-2-11.csv
../data/SB/raw/SB_2021-2-12.csv


In [8]:
# why use datetime?
# we can control the days
for dt in full_list:
    if datetime.datetime(2021, 4, 25) < dt < datetime.datetime(2021, 5, 1):
        print(dt)


2021-04-26 00:00:00
2021-04-27 00:00:00
2021-04-28 00:00:00
2021-04-29 00:00:00
2021-04-30 00:00:00


In [12]:
print(full_list[0])
print(full_list[0] + datetime.timedelta(days=3))


2021-02-10 00:00:00
2021-02-13 00:00:00


## Old: sequence -> time window

In [9]:
import numpy as np

N, T, D = 10, 5, 3

X = np.arange(30).reshape((10, 3))
Y = np.random.random(10)

X


array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14],
       [15, 16, 17],
       [18, 19, 20],
       [21, 22, 23],
       [24, 25, 26],
       [27, 28, 29]])

In [11]:
[N, D] = X.shape
dataX = np.zeros((N - T + 1, T, D))  # of length length N -> N-T+1
for i in range(0, N - T + 1):
    dataX[i] = X[i : i + T, :]
    print(f"{i}, {i+T}")


# same as
# [N, D] = X.shape
# dataX = np.zeros((N - T + 1, T, D))
# for t in range(T, N + 1): #* range([5, 201])
#     dataX[t - T] = X[t - T:t, :]
#     print(f"{t - T}, {t}")


0, 5
1, 6
2, 7
3, 8
4, 9
5, 10


In [None]:
def seq_break_into_intervals(X, T):  # [N, d] -> [N - T + 1, T, D]
    # assert
    [N, D] = X.shape
    dataX = np.zeros((N - T + 1, T, D))
    for i in range(0, N - T + 1):  # 0-5 to 0, 1-6 to 1 ... etc
        dataX[i] = X[i : i + T, :]
    return dataX


## Old: getting file names

In [10]:
# an older way to do this that I have discarded
month = 3
[
    x
    for x in os.listdir(f"{folder}/{ticker}/raw/")
    for j in [month]
    for k in range(1, 5)
    if x in f"{ticker}_{year}-{j:02d}-{k:02d}.csv"
]


['SB_2022-03-01.csv',
 'SB_2022-03-02.csv',
 'SB_2022-03-03.csv',
 'SB_2022-03-04.csv']

# New Ideas

- how do you encode the relative rank position of L1-L10, how do you do the buy and sell side, and how to do it for both P and V?

I want to do a L10-L1(ask)-L1(buy)-L10, so the price order is the ranking order. I am struggling to find way to encode P and V separately though: one idea is the have each volume to have a small network and end up with respective weights, and I let the price * volume and get the weighted means;

I can also put them in different channels and allow for mixing information.