In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import polars as pl # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
import os
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gc
import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Read Input

In [2]:
base_path = Path("/kaggle/input/jane-street-real-time-market-data-forecasting/")
preprocess_path = Path('/kaggle/input/js-preprocessing/')
train_path = preprocess_path / Path("train/")
train_read_paths = [train_path / Path(f"partition_id={i}/part-0.parquet") for i in [0, 1, 2, 3, 4, 5, 6, 7]]
val_read_paths = [train_path / Path(f"partition_id={i}/part-0.parquet") for i in [8]]
test_read_paths = [train_path / Path(f"partition_id={i}/part-0.parquet") for i in [9]]

data_base_path = Path("/kaggle/input/js-preprocessing/")
feature_data_path = data_base_path / Path("feature_data.csv")

target = "responder_6"

In [3]:
class Config:
    input_format = {
        "date_id": tf.TensorSpec(shape=(None,), dtype=tf.float32),
        "time_id": tf.TensorSpec(shape=(None,), dtype=tf.float32),
        "symbol_id": tf.TensorSpec(shape=(None,), dtype=tf.float32),
        "weight": tf.TensorSpec(shape=(None,), dtype=tf.float32),
        "features": tf.TensorSpec(shape=(None, 79), dtype=tf.float32),
        "responders": tf.TensorSpec(shape=(None, 9), dtype=tf.float32),
        "lags": tf.TensorSpec(shape=(None, 9), dtype=tf.float32),
        "target": tf.TensorSpec(shape=(None,), dtype=tf.float32)
    }
    train_batch_size = 32768

config = Config()

In [4]:
feature_format = 'feature_\d\d'
responder_format = 'responder_\d'
lag_format = 'responder_\d_lag'

def chunk_features(chunk):
    return chunk[[i for i in chunk.columns if re.fullmatch(feature_format, i)]]
def chunk_lags(chunk):
    return chunk[[i for i in chunk.columns if re.fullmatch(lag_format, i)]]

def from_files(paths):
    def to_ret():
        for filepath in paths:
            chunk = pl.read_parquet(filepath)
            yield {
                "date_id": chunk["date_id"],
                "time_id": chunk["time_id"],
                "symbol_id": chunk["symbol_id"],
                "weight": chunk["weight"],
                "features": chunk_features(chunk),
                "responders": chunk[[i for i in chunk.columns if re.fullmatch(responder_format, i)]],
                "lags": chunk_lags(chunk),
                "target": chunk[target]
            }
    return to_ret

train_raw = tf.data.Dataset.from_generator(
    from_files(train_read_paths),
    output_signature=config.input_format
).prefetch(tf.data.AUTOTUNE).cache()

val_raw = tf.data.Dataset.from_generator(
    from_files(val_read_paths),
    output_signature=config.input_format
).cache()

test_raw = tf.data.Dataset.from_generator(
    from_files(test_read_paths),
    output_signature=config.input_format
)

In [5]:
train_raw.element_spec

{'date_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'time_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'symbol_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'weight': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'features': TensorSpec(shape=(None, 79), dtype=tf.float32, name=None),
 'responders': TensorSpec(shape=(None, 9), dtype=tf.float32, name=None),
 'lags': TensorSpec(shape=(None, 9), dtype=tf.float32, name=None),
 'target': TensorSpec(shape=(None,), dtype=tf.float32, name=None)}

## Filter, Clean, Normalize Data

In [6]:
feature_data_df = pl.read_csv(feature_data_path)
means = np.asarray(feature_data_df['mean']).astype('float32')
stds = np.asarray(feature_data_df['std']).astype('float32')

In [7]:
def clean_features(features):
    return tf.where(
        tf.logical_or(tf.math.is_nan(features), tf.math.is_inf(features)), 
        means, 
        features)

def normalize_features(features):
    return (features - means) / tf.math.maximum(1.0, stds)

def clean_lags(lags):
    return tf.where(
        tf.logical_or(tf.math.is_nan(lags), tf.math.is_inf(lags)), 
        0.0, 
        lags)

def format_data(features, lags):
    return {
        'feature': normalize_features(clean_features(features)),
        'lags': clean_lags(lags)
    }

train_ds = train_raw.map(lambda i: (
                        format_data(i['features'], i['lags']),
                        i['target']
                        )).unbatch().shuffle(10000).batch(config.train_batch_size)
val_ds = val_raw.map(lambda i: (
    format_data(i['features'], i['lags']),
    i['target']
))

## Train a Model

In [8]:
feat_layer = keras.Input(shape=(79,), name='feature')
lag_layer = keras.Input(shape=(9,), name='lags')
inp = keras.layers.Concatenate()([feat_layer, lag_layer])
x = layers.Dropout(rate=0.2)(inp)
x = layers.Dense(units=40, activation="relu", kernel_regularizer="l1l2")(x)
x = layers.Dense(units=30, activation="relu", kernel_regularizer="l1l2")(x)
x = layers.Dropout(rate=0.2)(x)
x = layers.Dense(units=20, activation="relu", kernel_regularizer="l1l2")(x)
x = layers.Dense(units=10, activation="relu")(x)
x = layers.Dense(units=5, activation="relu")(x)
x = layers.Dense(units=1)(x)
model = keras.Model(inputs=[feat_layer, lag_layer], outputs=x)

In [9]:
def r2_loss(y_true, y_pred):
    return tf.math.reduce_sum((y_true - y_pred) ** 2) / tf.math.reduce_sum(y_true ** 2)

model.compile(optimizer="rmsprop", loss="mse", metrics=[keras.metrics.MeanAbsoluteError(), keras.metrics.R2Score()])

In [10]:
callbacks = [
    keras.callbacks.ModelCheckpoint('/kaggle/working/fitted.keras', save_best_only=True),
    keras.callbacks.ModelCheckpoint('/kaggle/working/intermediate.keras', save_best_only=False),
    keras.callbacks.LearningRateScheduler(lambda epoch, lr: lr * 0.97),
    keras.callbacks.EarlyStopping(patience=2)
]

history = model.fit(x=train_ds, validation_data=val_ds, epochs=15, callbacks = callbacks)

Epoch 1/15
   1059/Unknown [1m242s[0m 220ms/step - loss: 0.8585 - mean_absolute_error: 0.6091 - r2_score: -0.0202

  self.gen.throw(typ, value, traceback)


[1m1060/1060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 249ms/step - loss: 0.8585 - mean_absolute_error: 0.6091 - r2_score: -0.0202 - val_loss: 0.7475 - val_mean_absolute_error: 0.5565 - val_r2_score: 0.0065 - learning_rate: 9.7000e-04
Epoch 2/15
[1m1060/1060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 183ms/step - loss: 0.8360 - mean_absolute_error: 0.5964 - r2_score: 0.0086 - val_loss: 0.7462 - val_mean_absolute_error: 0.5561 - val_r2_score: 0.0082 - learning_rate: 9.4090e-04
Epoch 3/15
[1m1060/1060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 182ms/step - loss: 0.8346 - mean_absolute_error: 0.5960 - r2_score: 0.0103 - val_loss: 0.7457 - val_mean_absolute_error: 0.5560 - val_r2_score: 0.0088 - learning_rate: 9.1267e-04
Epoch 4/15
[1m1060/1060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 219ms/step - loss: 0.8339 - mean_absolute_error: 0.5958 - r2_score: 0.0111 - val_loss: 0.7452 - val_mean_absolute_error: 0.5558 - val_r2_score: 0.009

## Evaluate Model

In [11]:
test_ds = test_raw.map(lambda i: (
                        format_data(i['features'], i['lags']),
                        i['target']
                        ))

In [12]:
model = keras.models.load_model('/kaggle/working/fitted.keras', custom_objects={
    'r2_loss': r2_loss
})

In [13]:
model.evaluate(x=test_ds)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 20s/step - loss: 0.6585 - mean_absolute_error: 0.5329 - r2_score: 0.0066


[0.6584630012512207, 0.5328649878501892, 0.006585359573364258]