In [1]:
%load_ext autoreload
%autoreload 2

In [17]:
import polars as pl
import pandas as pd
from typing import Literal
import numpy as np
import matplotlib.pyplot as plt
import time

In [3]:
from main import load_messages

# <font color="grey">Data loading and handling</font>

In [4]:
messages = load_messages("AAPL", ["120621.100000", "120621.100500"], source="lobster")

In [5]:
messages.head()

time,type,size,price,direction,order_id
f64,i32,f32,f32,i8,i64
1340300000.0,2,17.0,586.150024,-1,46527525
1340300000.0,1,17.0,585.72998,1,46530538
1340300000.0,2,20.0,586.140015,-1,46494513
1340300000.0,1,20.0,585.809998,1,46531040
1340300000.0,1,8.0,585.880005,1,46533017


In [6]:
messages['time'][-1]-messages['time'][0]

299.91076827049255

In [7]:
mic_messages = load_messages("RTS2009", ["200608.100000", "200609.090500"], source="mic")

In [8]:
mic_messages.head()

time,type,size,price,direction,order_id
f64,i32,f32,f32,i8,i64
1591600000.0,1,2.0,127140.0,-1,26006039007294
1591600000.0,1,2.0,127130.0,-1,26006039007314
1591600000.0,1,2.0,127140.0,-1,26006039007579
1591600000.0,1,2.0,127130.0,-1,26006039007600
1591600000.0,1,4.0,127070.0,1,26006039007654


In [9]:
mic_messages.shape

(702207, 6)

In [47]:
from main import DerivedFeatures, compute_derived_features, compute_derived_features_old

In [31]:
messages_pandas = messages.to_pandas()

In [48]:
start = time.time()
derived_features_pandas = compute_derived_features_old(messages_pandas)
end = time.time()
print(f"Time taken: {end - start}")

Time taken: 0.0008029937744140625


In [57]:
start = time.time()
derived_features = compute_derived_features(messages)
end = time.time()
print(f"Time taken: {end - start}")

Time taken: 0.0028908252716064453


In [58]:
derived_features

DerivedFeatures(dp_ticks=array([  0, -42,  41, ..., -16,  19, -19], shape=(13503,), dtype=int32), log_size=array([2.890372 , 2.890372 , 3.0445228, ..., 4.615121 , 4.615121 ,
       4.615121 ], shape=(13503,), dtype=float32), dt_log=array([0.        , 0.10845059, 1.6524696 , ..., 0.07806202, 0.04748783,
       0.07508361], shape=(13503,), dtype=float32), dt_prev=array([0.0000000e+00, 6.7949295e-05, 2.5031567e-03, ..., 4.8160553e-05,
       2.8848648e-05, 4.6253204e-05], shape=(13503,), dtype=float32), dt_next=array([6.7949295e-05, 2.5031567e-03, 7.2717667e-05, ..., 2.8848648e-05,
       4.6253204e-05, 1.3402732e+09], shape=(13503,), dtype=float32), has_next_event=array([ True,  True,  True, ...,  True,  True, False], shape=(13503,)), type_code=array([2, 1, 2, ..., 1, 2, 1], shape=(13503,), dtype=int32), side_code=array([-1,  1, -1, ...,  1,  1,  1], shape=(13503,), dtype=int8), level_proxy=array([0, 1, 1, ..., 1, 1, 1], shape=(13503,), dtype=int32), tick_size=0.01, time_scale=0.00059318

In [59]:
derived_features_pandas

DerivedFeatures(dp_ticks=array([  0, -42,  41, ..., -16,  19, -19], shape=(13503,), dtype=int32), log_size=array([2.8903718, 2.8903718, 3.0445225, ..., 4.6151204, 4.6151204,
       4.6151204], shape=(13503,), dtype=float32), dt_log=array([0.        , 0.10845059, 1.6524696 , ..., 0.07806202, 0.04748783,
       0.07508361], shape=(13503,), dtype=float32), dt_prev=array([0.0000000e+00, 6.7949295e-05, 2.5031567e-03, ..., 4.8160553e-05,
       2.8848648e-05, 4.6253204e-05], shape=(13503,), dtype=float32), dt_next=array([6.7949295e-05, 2.5031567e-03, 7.2717667e-05, ..., 2.8848648e-05,
       4.6253204e-05, 5.9318542e-04], shape=(13503,), dtype=float32), has_next_event=array([ True,  True,  True, ...,  True,  True, False], shape=(13503,)), type_code=array([2, 1, 2, ..., 1, 2, 1], shape=(13503,), dtype=int32), side_code=array([-1,  1, -1, ...,  1,  1,  1], shape=(13503,), dtype=int8), level_proxy=array([0, 1, 1, ..., 1, 1, 1], shape=(13503,), dtype=int32), tick_size=0.01, time_scale=0.00059318

In [52]:
from compare_features import compare_derived_features

In [53]:
compare_derived_features(derived_features_pandas, derived_features)

✅ tick_size: 0.01 == 0.01
✅ time_scale: 0.0005931854248046875 == 0.0005931854248046875
✅ dp_ticks: all 13503 values match
✅ log_size: all 13503 values match (within tolerance 1e-06)
✅ dt_log: all 13503 values match (within tolerance 1e-06)
✅ dt_prev: all 13503 values match (within tolerance 1e-06)
❌ dt_next: 12657/13503 values differ (max diff: 1340273152.0)
   First differences at indices: [0 1 2 3 4]
   [0]: 6.794929504394531e-05 != -6.794929504394531e-05 (diff: 0.00013589859008789062)
   [1]: 0.0025031566619873047 != -0.0025031566619873047 (diff: 0.005006313323974609)
   [2]: 7.271766662597656e-05 != -7.271766662597656e-05 (diff: 0.00014543533325195312)
   [3]: 0.023744821548461914 != -0.023744821548461914 (diff: 0.04748964309692383)
   [4]: 0.006034135818481445 != -0.006034135818481445 (diff: 0.01206827163696289)
✅ has_next_event: all 13503 values match
✅ type_code: all 13503 values match
✅ side_code: all 13503 values match
✅ level_proxy: all 13503 values match
✅ time_absolute: all

False

In [60]:
derived_features_pandas.dt_next

array([6.7949295e-05, 2.5031567e-03, 7.2717667e-05, ..., 2.8848648e-05,
       4.6253204e-05, 5.9318542e-04], shape=(13503,), dtype=float32)

In [61]:
derived_features.dt_next

array([6.7949295e-05, 2.5031567e-03, 7.2717667e-05, ..., 2.8848648e-05,
       4.6253204e-05, 1.3402732e+09], shape=(13503,), dtype=float32)

In [63]:
derived_features.time_scale

0.0005931854248046875

In [64]:
derived_features_pandas.time_scale

0.0005931854248046875

In [68]:
messages_pandas["time"].to_numpy(dtype=np.float64)[-1]+derived_features_pandas.time_scale

np.float64(1340273099.9487848)

In [69]:
messages["time"][-1]+derived_features.time_scale

1340273099.9487848

In [41]:
derived_features.__annotations__

{'dp_ticks': 'np.ndarray',
 'log_size': 'np.ndarray',
 'dt_log': 'np.ndarray',
 'dt_prev': 'np.ndarray',
 'dt_next': 'np.ndarray',
 'has_next_event': 'np.ndarray',
 'type_code': 'np.ndarray',
 'side_code': 'np.ndarray',
 'level_proxy': 'np.ndarray',
 'tick_size': 'float',
 'time_scale': 'float',
 'time_absolute': 'np.ndarray'}

In [46]:
messages["time"][-1]

1340273099.9481916

In [None]:
messages.select(pl.col("time").diff().shift(-1).fill_null())

time
f64
0.000068
0.002503
0.000073
0.023745
0.006034
…
0.009152
0.000048
0.000029
0.000046
