In [110]:
import os
import pathlib

import numpy as np
import pandas as pd

In [111]:
# Path of the input CSV (a BitMEX XBTUSD book snapshot, going by its file name).
# The BITMEX_DATASET_PATH environment variable overrides the location so the
# notebook can run on another machine; when it is unset, the original
# hard-coded path is used unchanged.
DATASET_PATH = pathlib.Path(
    os.environ.get(
        "BITMEX_DATASET_PATH",
        "/Users/vdblin10/.jupyter/bitmex_book_snapshot_25_2020-09-01_XBTUSD.csv",
    )
)

In [112]:
# Load the whole CSV at DATASET_PATH into a DataFrame.
df = pd.read_csv(DATASET_PATH)

In [113]:
# Remove the identifier and timestamp columns; the following cells only select
# the "asks" and "bids" columns. Reassigning instead of using inplace=True
# keeps the step expressed as a plain transformation of `df`.
df = df.drop(columns=["exchange", "symbol", "timestamp", "local_timestamp"])

In [114]:
# Keep only the columns whose label contains "asks".
asks = df.filter(like="asks")

In [115]:
# Split the ask columns into price and amount groups (by substring in the
# column label) and stack each group into a 2-D array with one row per column,
# in the frame's column order.
ask_price_columns = [name for name in asks.columns if "price" in name]
ask_amount_columns = [name for name in asks.columns if "amount" in name]

asks_price = np.array([asks[name].to_numpy() for name in ask_price_columns])
asks_volume = np.array([asks[name].to_numpy() for name in ask_amount_columns])

In [116]:
# Keep only the columns whose label contains "bids".
bids = df.filter(like="bids")

In [117]:
# Split the bid columns into price and amount groups (by substring in the
# column label) and stack each group into a 2-D array with one row per column,
# in the frame's column order.
bid_price_columns = [name for name in bids.columns if "price" in name]
bid_amount_columns = [name for name in bids.columns if "amount" in name]

bids_price = np.array([bids[name].to_numpy() for name in bid_price_columns])
bids_volume = np.array([bids[name].to_numpy() for name in bid_amount_columns])

In [118]:
# Number of book levels used when building the features below.
n = 10
# Number of rows in the loaded frame.
length = df.shape[0]

In [125]:
# Build the feature frame from the per-level arrays. Each of asks_price,
# asks_volume, bids_price and bids_volume is indexed as [level, row], so
# reductions over the first n levels use axis=0 and yield one value per row.
dataset = pd.DataFrame()

# v_1: price and volume on each side for each of the first n levels.
for level in range(n):
    dataset[f"p_ask_{level + 1}"] = asks_price[level]
    dataset[f"v_ask_{level + 1}"] = asks_volume[level]
    dataset[f"p_bid_{level + 1}"] = bids_price[level]
    dataset[f"v_bid_{level + 1}"] = bids_volume[level]

# v_2: ask price minus bid price at each level.
for level in range(n):
    dataset[f"p_ask_{level + 1} - p_bid_{level + 1}"] = (
        asks_price[level] - bids_price[level]
    )

# v_3: price range between level 1 and level n on each side, then the absolute
# price step between consecutive levels.
# The two range columns do not depend on the loop variable, so they are
# assigned once up front; this is the position they received when first
# assigned inside the loop, so the column order is unchanged.
dataset[f"p_ask_{n} - p_ask_{1}"] = asks_price[n - 1] - asks_price[0]
dataset[f"p_bid_{1} - p_bid_{n}"] = bids_price[0] - bids_price[n - 1]
for level in range(n):
    # NOTE(review): the last iteration reads index n (level n + 1), so the
    # source arrays must hold at least n + 1 levels. Kept as-is so the emitted
    # columns stay the same; confirm whether the level n + 1 step is intended.
    dataset[f"|p_ask_{level + 2} - p_ask_{level + 1}|"] = np.absolute(
        asks_price[level + 1] - asks_price[level]
    )
    dataset[f"|p_bid_{level + 2} - p_bid_{level + 1}|"] = np.absolute(
        bids_price[level + 1] - bids_price[level]
    )

# v_4: per-row mean of price and volume over the first n levels.
# A single axis=0 reduction replaces a Python-level np.sum call per row.
# For non-integer inputs the result can differ from the per-row sums at
# floating-point rounding level because the summation order is not the same.
dataset[f"1 / n * Σ^{n}_i={1}(p_ask_{1}...{n})"] = (
    1 / n * np.sum(asks_price[:n], axis=0)
)
dataset[f"1 / n * Σ^{n}_i={1}(p_bid_{1}...{n})"] = (
    1 / n * np.sum(bids_price[:n], axis=0)
)
dataset[f"1 / n * Σ^{n}_i={1}(v_ask_{1}...{n})"] = (
    1 / n * np.sum(asks_volume[:n], axis=0)
)
dataset[f"1 / n * Σ^{n}_i={1}(v_bid_{1}...{n})"] = (
    1 / n * np.sum(bids_volume[:n], axis=0)
)

# v_5: per-row sum over the first n levels of the ask-minus-bid difference,
# for prices and for volumes.
dataset[f"Σ^{n}_i={1}(p_ask_{1}...{n} - p_bid_{1}...{n})"] = np.sum(
    asks_price[:n] - bids_price[:n], axis=0
)
dataset[f"Σ^{n}_i={1}(v_ask_{1}...{n} - v_bid_{1}...{n})"] = np.sum(
    asks_volume[:n] - bids_volume[:n], axis=0
)

# v_6 (np.gradient of each level's price/volume series) is not computed here.

# Mid-price: average of the level-1 ask and bid prices.
dataset[f"(p_ask_{1} + p_bid_{1}) / 2"] = (asks_price[0] + bids_price[0]) / 2

In [126]:
# Show the assembled feature frame as this cell's output.
dataset

Unnamed: 0,p_ask_1,v_ask_1,p_bid_1,v_bid_1,p_ask_2,v_ask_2,p_bid_2,v_bid_2,p_ask_3,v_ask_3,...,|p_bid_10 - p_bid_9|,|p_ask_11 - p_ask_10|,|p_bid_11 - p_bid_10|,1 / n * Σ^10_i=1(p_ask_1...10),1 / n * Σ^10_i=1(p_bid_1...10),1 / n * Σ^10_i=1(v_ask_1...10),1 / n * Σ^10_i=1(v_bid_1...10),Σ^10_i=1(p_ask_1...10 - p_bid_1...10),Σ^10_i=1(v_ask_1...10 - v_bid_1...10),(p_ask_1 + p_bid_1) / 2
0,11658.0,1399982,11657.5,2293327,11658.5,82328,11657.0,37555,11659.0,3001,...,0.5,0.5,0.5,11660.25,11655.25,215458.9,274250.3,50.0,-587914,11657.75
1,11658.0,1399982,11657.5,2293327,11658.5,82328,11657.0,37555,11659.0,3001,...,0.5,0.5,0.5,11660.25,11655.25,215458.9,274250.3,50.0,-587914,11657.75
2,11658.0,1399982,11657.5,2293327,11658.5,82328,11657.0,37555,11659.0,3001,...,0.5,0.5,0.5,11660.25,11655.25,215458.9,309772.8,50.0,-943139,11657.75
3,11658.0,1399982,11657.5,2293327,11658.5,82328,11657.0,37555,11659.0,3001,...,0.5,0.5,0.5,11660.25,11655.25,215458.9,309792.8,50.0,-943339,11657.75
4,11658.0,1399982,11657.5,2293327,11658.5,82328,11657.0,37555,11659.0,3001,...,0.5,0.5,0.5,11660.25,11655.25,215458.1,309792.8,50.0,-943347,11657.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4382988,11937.0,1561539,11936.5,375476,11937.5,22876,11936.0,9237,11938.0,14453,...,0.5,0.5,0.5,11939.25,11934.00,205789.3,51968.6,52.5,1538207,11936.75
4382989,11937.0,1561539,11936.5,379588,11937.5,22876,11936.0,9237,11938.0,14453,...,0.5,0.5,0.5,11939.25,11934.00,205789.3,52379.8,52.5,1534095,11936.75
4382990,11937.0,1561539,11936.5,379588,11937.5,22876,11936.0,9237,11938.0,14453,...,0.5,0.5,0.5,11939.25,11934.00,205789.3,52379.8,52.5,1534095,11936.75
4382991,11937.0,1561539,11936.5,379588,11937.5,22876,11936.0,9237,11938.0,14453,...,0.5,0.5,0.5,11939.25,11934.00,205789.3,52379.8,52.5,1534095,11936.75


In [127]:
# Write the feature frame as "result.csv" inside the zip archive "result.zip"
# (relative path, so it lands in the current working directory). The index is
# written too, since index=False is not passed.
dataset.to_csv(
    "result.zip",
    compression={"method": "zip", "archive_name": "result.csv"},
)