In [1]:
import numpy as np # linear algebra
import polars as pl # data processing, Parquet/CSV file I/O (e.g. pl.read_parquet)
import pandas as pd
import os
from pathlib import Path
import sys
import matplotlib as plt
from matplotlib import pyplot
import gc

## Adding Lags

In [2]:
# Build per-partition training files augmented with lagged responders.
#
# For every (date_id, symbol_id) pair the last row of the day is taken, its
# date_id is shifted forward by one, and its responder columns are joined back
# onto the rows of that following day as `<responder>_lag` columns.
base_path = Path("/kaggle/input/jane-street-real-time-market-data-forecasting/")
train_path = base_path / Path("train.parquet/")

lag_keys = ['date_id', 'symbol_id']

# Lag rows produced by the final day of the previous partition. Without this
# hand-off the first day of every partition after the first gets null lags,
# because the readings of its "previous day" live in a different file.
# NOTE(review): relies on the partitions being ordered by date_id -- confirm.
carry_over = None

for partition_id in range(10):
    path = train_path / Path(f"partition_id={partition_id}/part-0.parquet")
    df = pl.read_parquet(path)
    responders = [col for col in df.columns if col.startswith('responder')]

    # Last row of each (date_id, symbol_id) group, re-keyed to the next day.
    last_reading = df.group_by(lag_keys, maintain_order=True).last()
    join_to = last_reading.with_columns(last_reading['date_id'] + 1).select(lag_keys + responders)

    # Rows keyed past this partition's last day cannot match anything here;
    # keep them for the next iteration instead of discarding them.
    next_carry = join_to.filter(pl.col('date_id') > df['date_id'].max())

    if carry_over is not None:
        # Carried rows go first so that, should a day ever be split across two
        # files, the later reading from the current file wins. De-duplicating
        # on the join keys guarantees the left join cannot multiply rows.
        join_to = pl.concat([carry_over, join_to]).unique(
            subset=lag_keys, keep='last', maintain_order=True
        )
    carry_over = next_carry

    # Left join keeps every original row; overlapping responder columns from
    # the right-hand side receive the `_lag` suffix.
    df_grouped = df.join(join_to, on=lag_keys, how='left', suffix='_lag')

    print(f"shapes for {partition_id}: ", df_grouped.shape, df.shape)
    # Release the raw partition before writing to keep peak memory down.
    del df
    gc.collect()

    partition_dir = f"train/partition_id={partition_id}"
    os.makedirs(partition_dir, exist_ok=True)
    df_grouped.write_parquet(os.path.join(partition_dir, "part-0.parquet"))

shapes for 0:  (1944210, 101) (1944210, 92)
shapes for 1:  (2804247, 101) (2804247, 92)
shapes for 2:  (3036873, 101) (3036873, 92)
shapes for 3:  (4016784, 101) (4016784, 92)
shapes for 4:  (5022952, 101) (5022952, 92)
shapes for 5:  (5348200, 101) (5348200, 92)
shapes for 6:  (6203912, 101) (6203912, 92)
shapes for 7:  (6335560, 101) (6335560, 92)
shapes for 8:  (6140024, 101) (6140024, 92)
shapes for 9:  (6274576, 101) (6274576, 92)


## Calculate Feature Clipping Bounds, Means and Standard Deviations

In [3]:
# Stack partitions 0 through 7 into a single frame and collect the names of
# all columns whose name starts with "feature_".
train_read_paths = [
    train_path / Path(f"partition_id={part}/part-0.parquet") for part in range(8)
]
df = pl.concat(list(map(pl.read_parquet, train_read_paths)))
features = [col for col in df.columns if col.startswith('feature_')]

In [4]:
# Centre (0.5 quantile) and spread (standard deviation) of every feature,
# measured on the data before any clipping is applied.
median_before = [df.get_column(col).quantile(0.5) for col in features]
stds_before = [df.get_column(col).std() for col in features]

# Clipping window per feature: centre minus / plus three standard deviations.
outlier_low = [
    median_before[k] - 3 * stds_before[k] for k in range(len(features))
]
outlier_high = [
    median_before[k] + 3 * stds_before[k] for k in range(len(features))
]

In [5]:
# Clip each feature to its [lo, hi] window, one column per iteration so that
# only a single replacement column is materialised at a time.
for name, lo, hi in zip(features, outlier_low, outlier_high):
    if df[name].dtype == pl.Int8:
        # Keep the bounds inside [-127, 127] for Int8 columns.
        lo, hi = max(lo, -127), min(hi, 127)
    df = df.with_columns(df.get_column(name).clip(lo, hi))

In [6]:
# Per-feature mean and standard deviation of the clipped data.
means = [df.get_column(col).mean() for col in features]
stds = [df.get_column(col).std() for col in features]

In [7]:
# Persist one row per feature: its name, post-clipping mean and std, and the
# lo / hi bounds computed before clipping.
feature_stats = pl.DataFrame(
    {
        'name': features,
        'mean': means,
        'std': stds,
        'lo': outlier_low,
        'hi': outlier_high,
    }
)
feature_stats.write_csv('feature_data.csv')