In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import polars as pl

In [3]:
from cooc.hdb import get_dataset
from cooc.plots import plot_trades
from cooc.features import coi

# <font color="grey">Trade decomposition</font>

### <font color="grey">  Table of Contents</font>

1. #### <a href='#chapter1'>Data</a>
2. #### <a href='#chapter2'>Trade decomposition</a>

###  <a id='chapter1'> <font color="grey">1. Data </font></a>

Data is currently available for 5 products, namely perpetual futures contracts for the following pairs: BTC-USDT, ETH-USDT, SOL-USDT, DOGE-USDT and LINK-USDT. The details of these products are not important for the time being; they just represent time series. Trade and quote (level 1 limit order book) data is available for the period from June to the end of August. For each month, the data begins at 00:01 on the first day and ends at 23:50 on the last day.

The timestamps are Unix timestamps with nanosecond resolution, but the actual granularity of the data is coarser than that. A simple dataloader and some helper functions to convert Python datetime objects or strings of the form '240802.1450' into timestamps are provided. The 'book' data contains the best bid and ask prices with their corresponding volumes. The 'trade' data contains trades, with a signed quantity representing the direction. Each trade also has an associated trade id so that individual trades can be identified.

The book data is mainly used to determine the price movements and for potential backtests. The original dataset contains more timestamps (depending on when the data was sent or received), but I don't think this matters at the moment. One could try to align what happens in the trades with what happens in the book dataset.

We make heavy use of [polars](https://pola.rs/), for the simple reason of performance. If one is not used to it, the learning curve can be steep but it will make things quicker in the long run. We also work with polars extensions to easily add features to the data.

In [4]:
# Data loader handle; the cells below load book and trade data through `dl`.
DATASET_NAME = "tardis"
dl = get_dataset(DATASET_NAME)

In [8]:
# Product symbols: every coin is quoted against USDT.
RCS = [coin + "USDT" for coin in ("BTC", "ETH", "SOL", "DOGE", "LINK")]
# Kinds of data that can be loaded.
TYPE = ["book", "trade"]
# Months covered by the data, as zero-padded two-digit strings (June to August).
MONTHS = ["{:02d}".format(month) for month in range(6, 9)]
# [start, end] of the window loaded below; presumably 'YYMMDD.HHMMSS' — TODO confirm
# against the dataloader's string-to-timestamp helper.
TIMES = ["240720.000100", "240725.235000"]

In [6]:
# Lazily load the order book for the first two products of RCS over the TIMES window.
book_products = RCS[:2]
book_lazy = dl.load_book(book_products, TIMES, lazy=True)
# Add human readable timestamp and mid prices
df = book_lazy._dt.add_datetime('ts')._feat.add_mid(products=book_products)

In [None]:
# This takes some time, not because of the loading but because of the aggregation.
# Going through `.lazy()` keeps the cell re-runnable: it leaves a LazyFrame as it is
# and turns an already collected DataFrame back into a lazy one, so a second execution
# no longer calls `.collect()` on an eager frame (which has no such method).
df = df.lazy().collect()
df.head()

In [None]:
# Plot the ETHUSDT mid price against the 'dts' column of the collected book data.
fig, ax = plt.subplots()
ax.plot(df['dts'], df['mid_ETHUSDT'])
ax.set(xlabel='Time', ylabel='Mid price', title='Mid price over time')
plt.show()

In [9]:
# Lazily load the trades for the first two products of RCS over the TIMES window.
trade_products = RCS[:2]
trades_lazy = dl.load_trades(trade_products, TIMES, lazy=True)
# add_datetime() gets no argument here: by default, the timestamp column is named 'ts'
tdf = (
    trades_lazy
    ._trade.agg_trades()
    ._trade.add_side()
    ._dt.add_datetime()
)

In [None]:
# Just to check that it works.
# `.lazy()` leaves a LazyFrame untouched and re-wraps an already collected DataFrame,
# so this cell can be executed more than once without failing on `.collect()`.
tdf = tdf.lazy().collect()
tdf.head()

###  <a id='chapter2'> <font color="grey">2. Trade decomposition </font></a>

In [None]:
# Create a simple dataframe to test the trade classification
prods = ['BTCUSDT', 'ETHUSDT', 'SOLUSDT']
delta = "30s"

# One tuple per trade, in column order; qty is signed.
column_names = ("ts", "product", "trade_id", "prc", "qty")
trade_rows = [
    ("2025-02-23 09:00:00", "BTCUSDT", 0, 100, 10),
    ("2025-02-23 09:00:00", "SOLUSDT", 1, 102, 15),
    ("2025-02-23 09:00:10", "SOLUSDT", 2, 101, 12),
    ("2025-02-23 09:00:20", "ETHUSDT", 3, 103, 20),
    ("2025-02-23 09:00:40", "BTCUSDT", 4, 104, -18),
    ("2025-02-23 09:01:40", "ETHUSDT", 5, 103, 21),
    ("2025-02-23 09:02:30", "ETHUSDT", 6, 101, -15),
    ("2025-02-23 09:02:40", "BTCUSDT", 7, 100, 12),
    ("2025-02-23 09:03:30", "SOLUSDT", 8, 99, -10),
    ("2025-02-23 09:03:40", "SOLUSDT", 9, 98, 10),
]
# Transpose the rows into the column-oriented dict used to build the frame.
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*trade_rows))
}

# Build the frame and parse the string timestamps into datetimes in one chain.
df = pl.DataFrame(data).with_columns(
    pl.col("ts").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S")
)

print(df)

In [None]:
# Classify the toy trades using the "ts" column. The last argument reuses `delta`
# (set to "30s" next to the toy data) instead of repeating the literal, so changing
# it in one place is enough. It is presumably the classification time window — TODO confirm.
tdf = df._trade.classify_trades(prods, "ts", delta)
tdf

In [None]:
# Plot the classified toy trades. The product list and the window come from `prods`
# and `delta` (defined with the toy data) rather than from repeated literals.
# NOTE(review): '250223.090000' looks like the start time of the toy data and 10 like
# a count or duration — confirm against the `plot_trades` signature.
plot_trades(tdf, prods, 'ts', '250223.090000', 10, delta)

In [None]:
# Feed the classified toy trades to `coi`. "60s" is presumably the aggregation window
# and "nis-c" the variant to compute — TODO confirm against cooc.features.coi.
coi_args = ("ts", "60s", "nis-c")
cdf = coi(tdf, *coi_args)
print(cdf)

In [14]:
# Now for some real data: trades of all products in RCS for the window below.
real_window = ['240707.0800', '240707.2000']
df = dl.load_trades(RCS, real_window)
df = df._dt.add_datetime('ts')

In [None]:
# Aggregate trades by timestamp and price ...
aggregated_trades = df._trade.agg_trades()
# ... and add human readable timestamp
tdf = aggregated_trades._dt.add_datetime('ts')
tdf.head()

In [None]:
# Classify the real trades of all products in RCS using the "dts" column and "5ms".
# NOTE(review): this cell rebinds `tdf` from itself, so executing it a second time
# classifies the already classified frame.
classify_args = (RCS, "dts", "5ms")
tdf = tdf._trade.classify_trades(*classify_args)
tdf.head()