In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt # type: ignore
import numpy as np
import polars as pl

In [5]:
from dspy.hdb import get_dataset

# <font color="grey">Data loading and handling</font>

Data is available in two forms: limit order book (LOB) data and trade data. Timestamps are Unix timestamps expressed in nanoseconds, although the underlying data is not actually that fine-grained. A simple data loader is provided, along with helper functions that convert Python datetime objects or strings of the form '240802.1450' into timestamps. The 'book' data contains the best bid and ask prices with their corresponding volumes. The 'trade' data contains individual trades, where the sign of the quantity indicates the trade direction. Each trade also carries a trade id that identifies it.

In [6]:
# Handle for the "terank" dataset; `dl` is what the cells below call
# `load_book` / `load_trades` on.
dl = get_dataset("terank")

In [8]:
# Product symbols: every base coin quoted against USDT.
RCS = [base + "USDT" for base in ("BTC", "ETH", "SOL", "DOGE", "LINK")]
# The two kinds of data used in this notebook.
TYPE = ["book", "trade"]
# Two-digit month identifiers.
MONTHS = ["06", "07", "08"]
# [start, end] of the window used for the book data below.
TIMES = ["240720.000100", "240720.215000"]

####  <a id='chapter1'> <font color="grey">1. Book</font></a>

In [6]:
# Build the (lazy) book query for the first two products, then attach a
# human-readable timestamp derived from 'ts' and the mid prices.
df = (
    dl.load_book(RCS[:2], TIMES, lazy=True)
    .ds.add_datetime('ts')
    .feature.add_mid(products=RCS[:2])
)

In [7]:
# Materialise the lazy query. This takes some time, not because of the
# loading but because of the aggregation.
# The guard makes the cell safe to re-run: after the first run `df` is an
# eager frame without `.collect()`, so an unguarded second run would raise
# AttributeError.
if hasattr(df, "collect"):
    df = df.collect()
df.head()

In [8]:
# Add a column with random +1 or -1 entries.
# The draw comes from a freshly seeded generator so that the signal (and the
# PnL computed from it) is identical on every run and on every re-execution
# of this cell.
df = df.with_columns(
    pl.lit(
        np.random.default_rng(42).choice([1., -1.], size=len(df))
    ).alias('random_signal')
)

In [9]:
# Signal PnL for the random signal on the ETHUSDT mid price.
# NOTE(review): presumably a 1-minute horizon reported in price units rather
# than basis points (in_bp=False) — confirm against the dspy documentation.
pdf = df.target.add_sig_pnl(
    ts_col="ts",
    col="mid_ETHUSDT",
    signal="random_signal",
    horizon="1m",
    in_bp=False,
)
pdf.head()

In [10]:
# ETHUSDT mid price against the 'dts' column, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(df['dts'], df['mid_ETHUSDT'])
ax.set_xlabel('Time')
ax.set_ylabel('Mid price')
ax.set_title('Mid price over time')
plt.show()

In [12]:
# Add a VWAP feature built from the listed price / volume columns.
# NOTE(review): the `__s0` / `__s1` suffixes presumably select products or
# book sides — confirm against the loader's column naming.
df = df.feature.add_vwap(
    cols=[
        "prc__s0",
        "prc__s1",
        "vol__s0",
        "vol__s1",
    ]
)
df.head()

####  <a id='chapter2'> <font color="grey">2. Trades</font></a>

In [25]:
# [start, end] of the window used for the trade data.
# NOTE: this rebinds TIMES, so re-running the book-loading cell after this
# one would load this window instead of the original one.
TIMES = ["250101.100000", "250101.120000"]
# Products whose trades are loaded.
products = [base + "USDT" for base in ("BTC", "ETH")]

In [26]:
# Build the (lazy) trade query, aggregate the trades, add the trade side and
# a human-readable timestamp.
tdf = (
    dl.load_trades(products, TIMES, lazy=True)
    .trade.agg_trades()
    .trade.add_side()
    # By default, the timestamp column is named 'ts'
    .ds.add_datetime()
)

In [27]:
# Materialise the lazy trade query.
# The guard makes the cell safe to re-run: after the first run `tdf` is an
# eager frame without `.collect()`, so an unguarded second run would raise
# AttributeError.
if hasattr(tdf, "collect"):
    tdf = tdf.collect()
tdf.head()

ts,prc,product,trade_id,qty,side,dts
i64,f64,str,i64,f64,i32,datetime[ns]
1735725600175000000,3333.63,"""ETHUSDT""",4937545734,-0.086,-1,2025-01-01 10:00:00.175
1735725600242000000,3333.64,"""ETHUSDT""",4937545735,0.543,1,2025-01-01 10:00:00.242
1735725600340000000,3333.64,"""ETHUSDT""",4937545738,0.015,1,2025-01-01 10:00:00.340
1735725600353000000,93382.1,"""BTCUSDT""",5793674893,0.036,1,2025-01-01 10:00:00.353
1735725600393000000,3333.63,"""ETHUSDT""",4937545740,-8.023,-1,2025-01-01 10:00:00.393
