In [1]:
%load_ext autoreload
%autoreload 2

In [17]:
import matplotlib.pyplot as plt # type: ignore
import numpy as np
import polars as pl

In [3]:
from dspy.hdb import get_dataset

# <font color="grey">Data loading and handling</font>

Data is available in two forms: limit order book (LOB) data and trade data. Timestamps are Unix timestamps given in nanosecond resolution, although the actual granularity of the data is coarser than that. A simple data loader is provided, together with helper functions that convert Python datetime objects or strings of the form '240802.1450' into timestamps. The 'book' data contains the best bid and ask prices with their corresponding volumes. The 'trade' data contains trades, with a signed quantity whose sign encodes the trade direction. Each trade also has an associated trade id that identifies individual trades.

In [4]:
# Dataset handle for the "tardis" source; the cells below call its
# load_book / load_trades / load methods.
dl = get_dataset("tardis")

In [13]:
# Product symbols: each base coin with the "USDT" suffix appended (e.g. "BTCUSDT").
RCS = [coin + "USDT" for coin in ("BTC", "ETH", "SOL", "DOGE", "LINK")]
# The two data kinds handled in this notebook.
TYPE = ["book", "trade"]
# Two-digit month strings.
MONTHS = ["06", "07", "08"]
# Time window handed to the loader calls, written as 'YYMMDD.HHMMSS'
# (parsed further down with "%y%m%d.%H%M%S").
TIMES = ["240720.000100", "240720.215000"]

####  <a id='chapter1'> <font color="grey">1. Book</font></a>

In [51]:
# Build the book query for the first two products over the TIMES window.
# lazy=True defers the work until .collect() is called in the next cell.
df = dl.load_book(RCS[:2], TIMES, lazy=True)
# Step 1: add a human readable timestamp derived from the 'ts' column.
df = df.ds.add_datetime('ts')
# Step 2: add one mid price column per product.
df = df.feature.add_mid(products=RCS[:2])

In [52]:
# This takes some time, not because of the loading but because of the aggregation.
# `df` is rebound from the lazy query to its materialised result, so the call is
# guarded: when the cell is re-run, `df` is already collected and no longer has a
# `.collect()` method, and the cell simply shows the head instead of raising.
if hasattr(df, "collect"):
    df = df.collect()
df.head()

ts,prc__s0_BTCUSDT,prc__s1_BTCUSDT,vol__s0_BTCUSDT,vol__s1_BTCUSDT,prc__s0_ETHUSDT,prc__s1_ETHUSDT,vol__s0_ETHUSDT,vol__s1_ETHUSDT,dts,mid_BTCUSDT,mid_ETHUSDT
u64,f64,f64,f64,f64,f64,f64,f64,f64,datetime[ns],f64,f64
1721433660056500000,66689.9,66690.0,7.331,10.59,3503.63,3503.64,12.785,82.869,2024-07-20 00:01:00.056500,66689.95,3503.635
1721433660058500000,66689.9,66690.0,7.331,10.531,3503.63,3503.64,12.785,82.869,2024-07-20 00:01:00.058500,66689.95,3503.635
1721433660063500000,66689.9,66690.0,7.329,10.531,3503.63,3503.64,0.332,82.869,2024-07-20 00:01:00.063500,66689.95,3503.635
1721433660066500000,66689.9,66690.0,6.079,10.531,3503.63,3503.64,0.332,82.883,2024-07-20 00:01:00.066500,66689.95,3503.635
1721433660069500000,66689.9,66690.0,6.079,10.513,3503.63,3503.64,0.332,82.883,2024-07-20 00:01:00.069500,66689.95,3503.635


In [53]:
# Add a column with random +1 or -1 entries.
# The generator is seeded so that the signal, and the PnL computed from it in
# the next cell, are reproducible across Restart & Run All.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)
df = df.with_columns(
    pl.lit(rng.choice([1., -1.], size=len(df))).alias('random_signal')
)

In [None]:
# Derive `pdf` from `df` through target.add_sig_pnl, keyed on the 'ts' column and
# using the ETH mid price together with the random signal at horizon "1m".
# The displayed result carries an additional `pnl_sig_1m` column.
pdf = df.target.add_sig_pnl(
    ts_col="ts",
    col="mid_ETHUSDT",
    signal="random_signal",
    horizon="1m",
    in_bp=False,  # presumably "not in basis points" — confirm against add_sig_pnl
)
pdf.head()

ts,prc__s0_BTCUSDT,prc__s1_BTCUSDT,vol__s0_BTCUSDT,vol__s1_BTCUSDT,prc__s0_ETHUSDT,prc__s1_ETHUSDT,vol__s0_ETHUSDT,vol__s1_ETHUSDT,dts,mid_BTCUSDT,mid_ETHUSDT,random_signal,pnl_sig_1m
u64,f64,f64,f64,f64,f64,f64,f64,f64,datetime[ns],f64,f64,f64,f64
1721512139491500000,67068.0,67068.1,1.536,3.698,3515.36,3515.37,31.002,16.106,2024-07-20 21:48:59.491500,67068.05,3515.365,1.0,1.45
1721512139557500000,67068.0,67068.1,1.536,3.698,3515.36,3515.37,31.012,16.096,2024-07-20 21:48:59.557500,67068.05,3515.365,-1.0,-1.45
1721512139705500000,67068.0,67068.1,1.536,3.694,3515.36,3515.37,31.012,16.096,2024-07-20 21:48:59.705500,67068.05,3515.365,-1.0,-1.45
1721512139785500000,67068.0,67068.1,1.536,3.694,3515.36,3515.37,31.012,19.457,2024-07-20 21:48:59.785500,67068.05,3515.365,1.0,1.45
1721512139793500000,67068.0,67068.1,1.536,3.694,3515.36,3515.37,31.002,19.467,2024-07-20 21:48:59.793500,67068.05,3515.365,-1.0,-1.45


In [None]:
# Plot the ETHUSDT mid price against the human readable timestamp column,
# using an explicit figure/axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots()
ax.plot(df['dts'], df['mid_ETHUSDT'])
ax.set_xlabel('Time')
ax.set_ylabel('Mid price')
ax.set_title('Mid price over time')
plt.show()

In [None]:
# Add VWAP column(s) via add_vwap, passing the price/volume column prefixes.
# The frame's columns carry a product suffix (e.g. prc__s0_BTCUSDT), so the helper
# presumably expands these prefixes per product — TODO confirm.
# NOTE(review): this cell uses the `_feat` namespace while the book cell above uses
# `.feature` (df.feature.add_mid); one of the two may be stale — verify against dspy.
df = df._feat.add_vwap(cols=["prc__s0", "prc__s1", "vol__s0", "vol__s1"])
df.head()

In [None]:
from datetime import datetime, timedelta
# Parse the first entry of TIMES ('YYMMDD.HHMMSS') into a naive datetime and show it.
t = datetime.strptime(TIMES[0], "%y%m%d.%H%M%S")
t

In [None]:
# One-minute step, passed to round_up_to_nearest further down; shown here in seconds.
td = timedelta(minutes=1)
td.total_seconds()

In [35]:
from dspy.utils import round_up_to_nearest

In [None]:
# First entry of the human readable timestamp column 'dts'.
df['dts'][0]

In [None]:
# Apply round_up_to_nearest to the first 'dts' value with the one-minute step `td`.
# Presumably this rounds the timestamp up to the next whole minute — confirm
# against dspy.utils.
round_up_to_nearest(df['dts'][0], td)

In [None]:
# Call the loader's generic `load` for the first two products over the TIMES window.
# NOTE(review): how `load` differs from load_book / load_trades is not visible
# here — confirm what kind of data `rdf` holds.
rdf = dl.load(RCS[0:2], TIMES)
rdf.head()

####  <a id='chapter2'> <font color="grey">2. Trades</font></a>

In [9]:
# Build the trades query for the first two products over the TIMES window.
# lazy=True defers the work until .collect() is called in the next cell.
tdf = dl.load_trades(RCS[:2], TIMES, lazy=True)
# Apply the trade helpers one step at a time: aggregate trades, then add the side.
tdf = tdf._trade.agg_trades()
tdf = tdf._trade.add_side()
# By default, the timestamp column is named 'ts', so no argument is passed here.
# NOTE(review): the book cell reaches add_datetime through `.ds`, whereas this cell
# uses `._dt` — check which namespace the current dspy version exposes.
tdf = tdf._dt.add_datetime()

In [None]:
# Just to check that it works.
# `tdf` is rebound from the lazy query to its materialised result, so the call is
# guarded: when the cell is re-run, `tdf` is already collected and no longer has a
# `.collect()` method, and the cell simply shows the head instead of raising.
if hasattr(tdf, "collect"):
    tdf = tdf.collect()
tdf.head()