In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import polars as pl # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
import os
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gc
import re
import random

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Single source of truth for every random number generator used in this notebook.
SEED = 15234

np.random.seed(SEED)
random.seed(SEED)
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so assigning it here
# cannot change hashing in the already-running kernel; it is only inherited by
# processes launched afterwards. It is kept in sync with SEED for consistency.
os.environ['PYTHONHASHSEED'] = str(SEED)
tf.random.set_seed(SEED)

## Read Input

In [3]:
# Raw competition data and the preprocessed dataset that this notebook reads from.
base_path = Path("/kaggle/input/jane-street-real-time-market-data-forecasting/")
preprocess_path = Path('/kaggle/input/js-preprocessing/')
train_path = preprocess_path / "train/"


def _partition_files(partition_ids):
    """Return the parquet file path under train_path for each partition id given."""
    return [train_path / f"partition_id={pid}/part-0.parquet" for pid in partition_ids]


# Partitions 0-7 feed training, partition 8 validation and partition 9 the final test.
train_read_paths = _partition_files(range(8))
val_read_paths = _partition_files([8])
test_read_paths = _partition_files([9])

# Location of the per-feature statistics table.
data_base_path = Path("/kaggle/input/js-preprocessing/")
feature_data_path = data_base_path / "feature_data.csv"

# Name of the column the model is trained to predict.
target = "responder_6"

In [4]:
def _float_spec(*trailing_dims):
    """Build a float32 TensorSpec whose leading dimension is left unspecified."""
    return tf.TensorSpec(shape=(None, *trailing_dims), dtype=tf.float32)


class Config:
    """Static settings: the element signature of the raw datasets and the batch sizes."""

    # Signature handed to tf.data.Dataset.from_generator; every entry is float32.
    input_format = dict(
        date_id=_float_spec(),
        time_id=_float_spec(),
        symbol_id=_float_spec(),
        weight=_float_spec(),
        features=_float_spec(79),
        responders=_float_spec(9),
        lags=_float_spec(9),
        target=_float_spec(),
    )
    # Rows per batch for the training and validation pipelines respectively.
    train_batch_size = 32768
    val_batch_size = 512


config = Config()

In [5]:
# Column-name patterns, matched with re.fullmatch. Raw strings are used so that
# `\d` reaches the regex engine unchanged instead of being parsed as an (invalid)
# string escape, which Python warns about.
feature_format = r'feature_\d\d'
responder_format = r'responder_\d'
lag_format = r'responder_\d_lag'

def chunk_features(chunk):
    """Select the columns of `chunk` whose names fully match `feature_format`.

    `chunk` must expose `.columns` and support selection by a list of column names.
    """
    return chunk[[name for name in chunk.columns if re.fullmatch(feature_format, name)]]
def chunk_lags(chunk):
    """Select the columns of `chunk` whose names fully match `lag_format`.

    `chunk` must expose `.columns` and support selection by a list of column names.
    """
    return chunk[[name for name in chunk.columns if re.fullmatch(lag_format, name)]]

def from_files(paths):
    """Return a zero-argument generator function that reads `paths` one file at a time.

    Each parquet file is loaded with polars and yielded as a single dict holding
    the id/weight columns, the feature, responder and lag column groups (selected
    by name pattern) and the `target` column.
    """
    def read_partitions():
        for filepath in paths:
            chunk = pl.read_parquet(filepath)
            responder_columns = [
                name for name in chunk.columns if re.fullmatch(responder_format, name)
            ]
            # Scalar-per-row columns are copied through under their own names.
            record = {
                name: chunk[name]
                for name in ("date_id", "time_id", "symbol_id", "weight")
            }
            record["features"] = chunk_features(chunk)
            record["responders"] = chunk[responder_columns]
            record["lags"] = chunk_lags(chunk)
            record["target"] = chunk[target]
            yield record
    return read_partitions

def _raw_dataset(read_paths):
    """Wrap from_files(read_paths) in a tf.data.Dataset typed by config.input_format."""
    return tf.data.Dataset.from_generator(
        from_files(read_paths),
        output_signature=config.input_format,
    )


# One dataset element per parquet file.
# NOTE(review): prefetch is applied before cache here; the usual ordering is
# cache first, prefetch last — confirm this order is intentional.
train_raw = _raw_dataset(train_read_paths).prefetch(tf.data.AUTOTUNE).cache()

val_raw = _raw_dataset(val_read_paths).cache()

# The test split is not cached.
test_raw = _raw_dataset(test_read_paths)

In [6]:
# Display the element structure of the raw training dataset (bare last expression
# so the notebook renders it as the cell output).
train_raw.element_spec

{'date_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'time_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'symbol_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'weight': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'features': TensorSpec(shape=(None, 79), dtype=tf.float32, name=None),
 'responders': TensorSpec(shape=(None, 9), dtype=tf.float32, name=None),
 'lags': TensorSpec(shape=(None, 9), dtype=tf.float32, name=None),
 'target': TensorSpec(shape=(None,), dtype=tf.float32, name=None)}

## Filter, Clean, Normalize Data

In [7]:
# Statistics table read from feature_data.csv; its 'mean' and 'std' columns are
# pulled out as float32 arrays for use in cleaning and scaling the features.
# NOTE(review): presumably one row per feature, in feature-column order — confirm.
feature_data_df = pl.read_csv(feature_data_path)
means, stds = (
    np.asarray(feature_data_df[column]).astype(np.float32)
    for column in ('mean', 'std')
)

In [8]:
def clean_features(features):
    """Replace every NaN or infinite entry of `features` with the matching value from `means`."""
    is_usable = tf.math.is_finite(features)
    return tf.where(is_usable, features, means)

def normalize_features(features):
    """Centre `features` on `means` and divide by `stds`, with the divisor floored at 1.0.

    NOTE(review): because of the floor, features whose std is below 1 are only
    centred, not rescaled — confirm this is intended rather than a small epsilon.
    """
    centered = features - means
    scale = tf.math.maximum(1.0, stds)
    return centered / scale

def clean_lags(lags):
    """Replace every NaN or infinite entry of `lags` with 0.0."""
    is_usable = tf.math.is_finite(lags)
    return tf.where(is_usable, lags, 0.0)

def format_data(features, lags):
    """Build the model input dict: cleaned then normalised features, and cleaned lags.

    The keys ('feature', 'lags') are the names the Keras model looks inputs up by.
    """
    prepared_features = normalize_features(clean_features(features))
    prepared_lags = clean_lags(lags)
    return dict(feature=prepared_features, lags=prepared_lags)

# Number of rows held in the shuffle buffer.
# NOTE(review): this is smaller than config.train_batch_size, so rows are only
# mixed with their near neighbours in file order — confirm that is acceptable.
SHUFFLE_BUFFER_SIZE = 10000


def _to_model_inputs(element):
    """Turn one raw dataset element into the (inputs, target) pair passed to Keras."""
    return format_data(element['features'], element['lags']), element['target']


# Training pipeline: per-file elements -> individual rows -> shuffled -> batched.
# The trailing prefetch lets input preparation overlap with model execution.
train_ds = (
    train_raw
    .map(_to_model_inputs)
    .unbatch()
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(config.train_batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
# Validation pipeline: same mapping, no shuffling.
val_ds = (
    val_raw
    .map(_to_model_inputs)
    .unbatch()
    .batch(config.val_batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

## Train a Model

In [9]:
# Two named inputs; the names match the keys produced by format_data.
feat_layer = keras.Input(shape=(79,), name='feature')
lag_layer = keras.Input(shape=(9,), name='lags')
inp = keras.layers.Concatenate()([feat_layer, lag_layer])

# One row per hidden Dense layer:
# (dropout rate applied just before it or None, units, kernel regularizer or None).
# NOTE(review): the "l1l2" string builds the regularizer with Keras' default
# coefficients, which are not visible here — confirm the intended strengths.
hidden_stack = [
    (0.2, 128, "l1l2"),
    (0.1, 64, "l1l2"),
    (0.2, 32, "l1l2"),
    (None, 16, None),
    (None, 4, None),
]

x = inp
for drop_rate, width, regularizer in hidden_stack:
    if drop_rate is not None:
        x = layers.Dropout(rate=drop_rate)(x)
    x = layers.Dense(units=width, activation="silu", kernel_regularizer=regularizer)(x)

# Single linear output unit for the regression target.
x = layers.Dense(units=1)(x)
model = keras.Model(inputs=[feat_layer, lag_layer], outputs=x)

In [10]:
def r2_loss(y_true, y_pred):
    """Sum of squared residuals divided by the sum of squared targets."""
    residual_energy = tf.math.reduce_sum((y_true - y_pred) ** 2)
    target_energy = tf.math.reduce_sum(y_true ** 2)
    return residual_energy / target_energy

# Optimise mean squared error; MAE and R2 are reported as metrics only.
tracked_metrics = [keras.metrics.MeanAbsoluteError(), keras.metrics.R2Score()]
model.compile(
    optimizer="rmsprop",
    loss="mse",
    metrics=tracked_metrics,
)

In [11]:
# Two checkpoints: one only overwritten on improvement, one overwritten every time.
best_checkpoint = keras.callbacks.ModelCheckpoint('/kaggle/working/fitted.keras', save_best_only=True)
latest_checkpoint = keras.callbacks.ModelCheckpoint('/kaggle/working/intermediate.keras', save_best_only=False)
# Halve the learning rate after 3 epochs without sufficient improvement.
lr_on_plateau = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=3, min_delta=5e-5)
# Stop training after 8 epochs without improvement.
early_stop = keras.callbacks.EarlyStopping(patience=8)

callbacks = [best_checkpoint, latest_checkpoint, lr_on_plateau, early_stop]

# epochs=1000 is an upper bound; early stopping is expected to end the run sooner.
history = model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=1000,
    callbacks=callbacks,
)

Epoch 1/1000


I0000 00:00:1735172918.824839      66 service.cc:145] XLA service 0x79eae0008760 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1735172918.824917      66 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


      1/Unknown [1m14s[0m 14s/step - loss: 1.2133 - mean_absolute_error: 0.6799 - r2_score: -0.8946

I0000 00:00:1735172925.630767      66 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


   1060/Unknown [1m172s[0m 149ms/step - loss: 0.8451 - mean_absolute_error: 0.5997 - r2_score: -0.0031

  self.gen.throw(typ, value, traceback)


[1m1060/1060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 179ms/step - loss: 0.8451 - mean_absolute_error: 0.5997 - r2_score: -0.0031 - val_loss: 0.7436 - val_mean_absolute_error: 0.5554 - val_r2_score: 0.0117 - learning_rate: 0.0010
Epoch 2/1000
[1m1060/1060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 134ms/step - loss: 0.8347 - mean_absolute_error: 0.5961 - r2_score: 0.0101 - val_loss: 0.7426 - val_mean_absolute_error: 0.5552 - val_r2_score: 0.0129 - learning_rate: 0.0010
Epoch 3/1000
[1m1060/1060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 143ms/step - loss: 0.8333 - mean_absolute_error: 0.5957 - r2_score: 0.0118 - val_loss: 0.7423 - val_mean_absolute_error: 0.5551 - val_r2_score: 0.0133 - learning_rate: 0.0010
Epoch 4/1000
[1m1060/1060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 142ms/step - loss: 0.8329 - mean_absolute_error: 0.5956 - r2_score: 0.0123 - val_loss: 0.7422 - val_mean_absolute_error: 0.5550 - val_r2_score: 0.0135 - le

## Evaluate Model

In [12]:
def _as_eval_pair(part):
    """Turn one raw test element into the (inputs, target) pair used for evaluation."""
    return format_data(part['features'], part['lags']), part['target']


# Test pipeline: per-file elements -> individual rows -> fixed batches of 512.
test_ds = test_raw.map(_as_eval_pair).unbatch().batch(512)

In [13]:
# Reload the checkpoint written with save_best_only=True, replacing the in-memory
# model. r2_loss is registered so the file can be deserialised if it references it.
best_model_path = '/kaggle/working/fitted.keras'
model = keras.models.load_model(best_model_path, custom_objects=dict(r2_loss=r2_loss))

In [14]:
# Score the reloaded checkpoint on the test pipeline; the displayed list is the
# loss followed by the metrics given to model.compile (MAE, then R2).
model.evaluate(x=test_ds)

[1m12256/12256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2ms/step - loss: 0.6665 - mean_absolute_error: 0.5288 - r2_score: 0.0080


[0.6577849984169006, 0.5325245261192322, 0.007608771324157715]