In [1]:
import numpy as np # linear algebra
import polars as pl # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
import os
from pathlib import Path
import sys
import matplotlib as plt
from matplotlib import pyplot
import gc
import re

import time

In [2]:
# Location of the competition data and the training partitions to process.
base_path = Path("/kaggle/input/jane-street-real-time-market-data-forecasting/")
train_path = base_path / "train.parquet"

# Partition ids (the N in "partition_id=N") included in the statistics pass.
taken = list(range(8))

In [3]:
# Both patterns are applied with re.fullmatch against column names.
# They are raw strings because "\d" inside a normal string literal is an
# invalid escape sequence (a warning on current Python versions).

# Untouched model inputs: two-digit feature columns and lagged responders.
base_feature_filter = r"(feature_\d\d)|(responder_\d_lag)"

# A base column name, optionally followed by exactly one derived-feature
# suffix: _ao<N> (rolling mean), _diff, _std_<N>, or _<A>-<B> (average spread).
full_feature_filter = r"((feature_\d\d)|(responder_\d_lag))((_ao\d*)|(_diff)|(_std_\d*)|(_\d*-\d*)|$)"

In [4]:
def add_averages(group, features, windows=(5, 50)):
    """Append rolling means of the given columns to ``group``.

    For every window size ``w`` in ``windows`` and every column ``c`` in
    ``features`` a column named ``{c}_ao{w}`` is added, holding
    ``group[c].rolling_mean(w, min_periods=1)``.

    Args:
        group: Frame to extend (in this notebook: the rows of one symbol).
        features: Names of the columns to average.
        windows: Rolling window sizes; the default reproduces the original
            hard-coded 5 and 50.

    Returns:
        A frame with the new columns appended window by window (all features
        for the first window, then all features for the next). ``group``
        itself is returned when there is nothing to add.
    """
    rolling_means = [
        group[col].rolling_mean(window, min_periods=1).alias(f"{col}_ao{window}")
        for window in windows
        for col in features
    ]
    if not rolling_means:
        return group
    # A single with_columns call avoids rebuilding the frame once per column.
    return group.with_columns(rolling_means)

def add_first_diff(group, features):
    """Append the first difference of each listed column to ``group``.

    For every column ``c`` in ``features`` a column ``{c}_diff`` is added,
    holding ``group[c].diff()``.

    Args:
        group: Frame to extend (in this notebook: the rows of one symbol).
        features: Names of the columns to difference.

    Returns:
        A frame with the ``*_diff`` columns appended in ``features`` order;
        ``group`` itself when ``features`` is empty.
    """
    diffs = [group[col].diff().alias(f"{col}_diff") for col in features]
    if not diffs:
        return group
    # A single with_columns call avoids rebuilding the frame once per column.
    return group.with_columns(diffs)

def add_stds(group, features, windows=(10,)):
    """Append rolling standard deviations of the given columns to ``group``.

    For every window size ``w`` in ``windows`` and every column ``c`` in
    ``features`` a column ``{c}_std_{w}`` is added, holding
    ``group[c].rolling_std(window_size=w, min_periods=2)``.

    Args:
        group: Frame to extend (in this notebook: the rows of one symbol).
        features: Names of the columns to process.
        windows: Rolling window sizes; the default reproduces the original
            hard-coded single window of 10.

    Returns:
        A frame with the new columns appended window by window; ``group``
        itself when there is nothing to add.
    """
    rolling_stds = [
        group[col].rolling_std(window_size=window, min_periods=2).alias(f"{col}_std_{window}")
        for window in windows
        for col in features
    ]
    if not rolling_stds:
        return group
    # A single with_columns call avoids rebuilding the frame once per column.
    return group.with_columns(rolling_stds)

def add_lags(df):
    """Attach the previous day's closing responder values to every row.

    The last row of each (date_id, symbol_id) group is taken, its ``date_id``
    is shifted forward by one, and the result is left-joined back onto ``df``
    by (date_id, symbol_id). Columns whose name starts with ``responder``
    therefore reappear with a ``_lag`` suffix; rows without a matching
    previous-day entry in ``df`` get nulls from the left join.

    Args:
        df: Frame with ``date_id``, ``symbol_id`` and responder columns.

    Returns:
        ``df`` with the ``*_lag`` columns appended.
    """
    keys = ['date_id', 'symbol_id']
    responder_cols = [name for name in df.columns if name.startswith('responder')]

    # Final observation of every symbol on every day.
    final_rows = df.group_by(keys, maintain_order=True).last()

    # Adding 1 keeps the series name 'date_id', so with_columns overwrites the
    # key in place: day d's closing values are now keyed as day d + 1.
    next_day = final_rows['date_id'] + 1
    lag_table = final_rows.with_columns(next_day)[keys + responder_cols]

    return df.join(lag_table, keys, how='left', suffix='_lag')

def add_avg_diff(df, base_features):
    """Append spreads between each base column and its rolling means.

    For every column ``c`` in ``base_features`` two columns are added:

    * ``{c}_1-5``  = ``c`` minus ``{c}_ao5``
    * ``{c}_5-20`` = ``{c}_ao5`` minus ``{c}_ao50``

    NOTE: the ``_5-20`` suffix is historical; the value is the 5-vs-50 spread.
    The name is kept because it ends up in the exported feature names.

    Args:
        df: Frame that already contains ``{c}_ao5`` and ``{c}_ao50`` for
            every ``c`` in ``base_features``.
        base_features: Names of the base columns.

    Returns:
        A frame with the spread columns appended (two per base column, in
        ``base_features`` order); ``df`` itself when ``base_features`` is
        empty.
    """
    spreads = []
    for col in base_features:
        short_avg = df[f"{col}_ao5"]
        long_avg = df[f"{col}_ao50"]
        spreads.append((df[col] - short_avg).alias(f"{col}_1-5"))
        spreads.append((short_avg - long_avg).alias(f"{col}_5-20"))
    if not spreads:
        return df
    # A single with_columns call avoids rebuilding the frame once per column.
    return df.with_columns(spreads)

In [5]:
def add_features(df):
    """Build the full engineered feature set for one partition.

    Steps:
      1. ``add_lags`` joins the previous day's responder values.
      2. Base columns are selected with ``base_feature_filter``.
      3. Per ``symbol_id`` group: rolling means, first differences and rolling
         standard deviations of every base column are appended.
      4. ``add_avg_diff`` appends the spreads between the base columns and
         their rolling means.

    The elapsed time of steps 3 and 4 is printed.

    Args:
        df: Raw partition frame (needs ``date_id``, ``symbol_id`` and the
            columns matched by ``base_feature_filter`` after lagging).

    Returns:
        The frame produced by step 4. Rows come back in the order produced by
        ``map_groups`` over the ``symbol_id`` groups.
    """
    df = add_lags(df)
    base_features = [name for name in df.columns if re.fullmatch(base_feature_filter, name)]
    df_grouped = df.group_by('symbol_id', maintain_order=True)

    def per_symbol_features(group):
        # Same order as before: averages -> first differences -> rolling stds.
        group = add_averages(group, base_features)
        group = add_first_diff(group, base_features)
        return add_stds(group, base_features)

    # perf_counter is monotonic, so the reported durations cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    df_with_avg = df_grouped.map_groups(per_symbol_features)
    print(f"Time for average calculations: {time.perf_counter() - start_time}")

    start_time = time.perf_counter()
    df_with_diff = add_avg_diff(df_with_avg, base_features)
    print(f"Time for average differences: {time.perf_counter() - start_time}")

    # Drop the intermediates before returning to keep peak memory down.
    del df, df_grouped, df_with_avg
    gc.collect()
    return df_with_diff

In [6]:
# Per-partition accumulators; add_stats appends one float32 vector (one entry
# per feature) to each list on every call.
outliers_low, outliers_high, means, stds = [], [], [], []

def _aggregate_row(df, exprs):
    """Evaluate one scalar aggregation per expression; return a float32 vector.

    Null results (e.g. from an all-null column) become NaN in the cast.
    """
    if not exprs:
        return np.asarray([], dtype='float32')
    return np.asarray(df.select(exprs).row(0)).astype('float32')

def add_stats(df, features):
    """Record clipping bounds and clipped mean/std of ``features`` for ``df``.

    For every feature the bounds are ``median - 3 * std`` and
    ``median + 3 * std`` of the raw column. For ``Int8`` columns the bounds
    are limited to [-127, 127]. The mean and standard deviation are then
    computed on the column clipped to those bounds.

    Side effects: appends one vector each to the module-level lists
    ``outliers_low``, ``outliers_high``, ``means`` and ``stds``.

    Args:
        df: Frame containing every column named in ``features``.
        features: Column names, in the order the result vectors should use.

    Returns:
        None.
    """
    # All columns are aggregated in one select per statistic instead of one
    # Python-level Series call per column.
    median_before = _aggregate_row(df, [pl.col(name).median() for name in features])
    stds_before = _aggregate_row(df, [pl.col(name).std() for name in features])

    outlier_low = median_before - 3 * stds_before
    outlier_high = median_before + 3 * stds_before

    schema = df.schema  # looked up once rather than on every iteration
    for idx, name in enumerate(features):
        if schema[name] == pl.Int8:
            outlier_low[idx] = max(outlier_low[idx], -127)
            outlier_high[idx] = min(outlier_high[idx], 127)

    outliers_low.append(outlier_low)
    outliers_high.append(outlier_high)

    # Build each clip expression once and reuse it for both mean and std.
    clipped = [
        pl.col(name).clip(lo, hi)
        for name, lo, hi in zip(features, outlier_low, outlier_high)
    ]
    means.append(_aggregate_row(df, [expr.mean() for expr in clipped]))
    stds.append(_aggregate_row(df, [expr.std() for expr in clipped]))

In [7]:
# Feature list shared by all partitions; fixed from the first processed
# partition so every per-partition statistics vector has the same layout.
features = None

for partition_id in taken:
    print(f"Processing {partition_id}")
    path = train_path / f"partition_id={partition_id}" / "part-0.parquet"
    df = pl.read_parquet(path)
    print(f"Processing {df.shape[0]} rows")

    started = time.time()
    df = add_features(df)
    print(f"Time to add features: {time.time() - started}")

    if features is None:
        features = [name for name in df.columns if re.fullmatch(full_feature_filter, name)]

    started = time.time()
    add_stats(df, features)
    print(f"Time to calculate statistics: {time.time() - started}")

    # Release the partition before loading the next one.
    del df
    gc.collect()

Processing 0
Processing 1944210 rows
Time for average calculations: 17.19049596786499
Time for average differences: 1.5885727405548096
Time to add features: 19.183347463607788
Time to calculate statistics: 47.85246753692627
Processing 1
Processing 2804247 rows
Time for average calculations: 46.25683832168579
Time for average differences: 4.5553672313690186
Time to add features: 51.26441693305969
Time to calculate statistics: 93.6950318813324
Processing 2
Processing 3036873 rows
Time for average calculations: 37.2944598197937
Time for average differences: 3.4169065952301025
Time to add features: 41.21019434928894
Time to calculate statistics: 99.75748133659363
Processing 3
Processing 4016784 rows
Time for average calculations: 47.07568049430847
Time for average differences: 3.844787120819092
Time to add features: 51.619420289993286
Time to calculate statistics: 132.3491415977478
Processing 4
Processing 5022952 rows
Time for average calculations: 57.31792879104614
Time for average differ

In [8]:
# Collapse the per-partition vectors into one row per feature by taking the
# NaN-ignoring average across partitions, then export the table.
per_partition_stats = {
    'mean': means,
    'std': stds,
    'lo': outliers_low,
    'hi': outliers_high,
}
aggregate_stats = pl.DataFrame({
    'name': features,
    **{
        label: np.nanmean(np.asarray(values), axis=0)
        for label, values in per_partition_stats.items()
    },
})
aggregate_stats.write_csv('feature_data.csv')

In [9]:
# Show the per-feature summary table (name, mean, std, lo, hi) as the cell output.
aggregate_stats

name,mean,std,lo,hi
str,f32,f32,f32,f32
"""feature_00""",0.183073,0.906198,-2.534983,2.910047
"""feature_01""",-0.003469,1.017344,-3.061906,3.058188
"""feature_02""",0.18486,0.905231,-2.532217,2.907642
"""feature_03""",0.184406,0.905112,-2.531981,2.906703
"""feature_04""",-0.00382,1.00064,-3.010556,3.007247
…,…,…,…,…
"""responder_6_lag_5-20""",-0.000003,0.029853,-0.205327,0.205328
"""responder_7_lag_1-5""",-3.8383e-8,0.001672,-0.029296,0.029296
"""responder_7_lag_5-20""",-0.000002,0.014149,-0.098366,0.098366
"""responder_8_lag_1-5""",-7.7443e-7,0.006696,-0.116156,0.116156
