In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import polars as pl # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
import os
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Read Input

In [2]:
base_path = Path("/kaggle/input/jane-street-real-time-market-data-forecasting/")
train_path = base_path / Path("train.parquet")
train_read_paths = [train_path / Path(f"partition_id={i}/part-0.parquet") for i in [0, 1, 2, 3, 4, 5, 6, 7]]
val_read_paths = [train_path / Path(f"partition_id={i}/part-0.parquet") for i in [8]]
test_read_paths = [train_path / Path(f"partition_id={i}/part-0.parquet") for i in [9]]

load_means_from_file = False
means_path = "/kaggle/working/means.csv"
load_stds_from_file = False
stds_path = "/kaggle/working/stds.csv"
target = "responder_6"

In [3]:
class Config:
    input_format = {
        "date_id": tf.TensorSpec(shape=(None,), dtype=tf.float32),
        "time_id": tf.TensorSpec(shape=(None,), dtype=tf.float32),
        "symbol_id": tf.TensorSpec(shape=(None,), dtype=tf.float32),
        "weight": tf.TensorSpec(shape=(None,), dtype=tf.float32),
        "features": tf.TensorSpec(shape=(None, 79), dtype=tf.float32),
        "responders": tf.TensorSpec(shape=(None, 9), dtype=tf.float32),
        "target": tf.TensorSpec(shape=(None,), dtype=tf.float32)
    }
    train_batch_size = 32768

config = Config()

In [4]:
def chunk_features(chunk):
    return chunk[[i for i in chunk.columns if i.startswith("feature_")]]

def from_files(paths):
    def to_ret():
        for filepath in paths:
            chunk = pl.read_parquet(filepath)
            yield {
                "date_id": chunk["date_id"],
                "time_id": chunk["time_id"],
                "symbol_id": chunk["symbol_id"],
                "weight": chunk["weight"],
                "features": chunk_features(chunk),
                "responders": chunk[[i for i in chunk.columns if i.startswith("responder_")]],
                "target": chunk[target]
            }
    return to_ret

train_raw = tf.data.Dataset.from_generator(
    from_files(train_read_paths),
    output_signature=config.input_format
).prefetch(tf.data.AUTOTUNE).cache()

val_raw = tf.data.Dataset.from_generator(
    from_files(val_read_paths),
    output_signature=config.input_format
).cache()

test_raw = tf.data.Dataset.from_generator(
    from_files(test_read_paths),
    output_signature=config.input_format
)

In [5]:
train_raw.element_spec

{'date_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'time_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'symbol_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'weight': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'features': TensorSpec(shape=(None, 79), dtype=tf.float32, name=None),
 'responders': TensorSpec(shape=(None, 9), dtype=tf.float32, name=None),
 'target': TensorSpec(shape=(None,), dtype=tf.float32, name=None)}

## Filter Data

In [6]:
def get_mean(dataset):
    batched_ds = dataset.batch(10000)
    def step(acc, value):
        num_non_nan = tf.reduce_sum(tf.cast(tf.math.is_nan(value['features']) == False, tf.float32), axis=0)
        sum_non_nan = tf.reduce_sum(tf.where(tf.math.is_nan(value['features']), tf.zeros_like(value['features']), value['features']), axis=0)
        return (acc[0] + sum_non_nan, acc[1] + num_non_nan)
    
    sum_, rows = batched_ds.reduce((tf.zeros(shape=(79,)), tf.constant(0.0)), step)
    return sum_ / tf.maximum(rows, tf.ones_like(rows))

if load_means_from_file:
    means = np.asarray(pl.read_csv(means_path)).astype('float32').reshape((-1,))
else:
    means = get_mean(train_raw.unbatch())

In [7]:
def clean_data(features):
    return tf.where(
        tf.logical_or(tf.math.is_nan(features), tf.math.is_inf(features)), 
        means, 
        features)

train_vals = train_raw.map(lambda i: (
                        clean_data(i['features']),
                        i['target']
                        ))

In [8]:
def get_std(dataset):
    batched_ds = dataset.batch(10000)
    def step(acc, val):
        return (acc[0] + tf.math.reduce_std(val[0], axis=0), acc[1] + 1)
    sum_, samples = batched_ds.reduce((tf.zeros(shape=(79,)), tf.constant(0.0)), step)
    return sum_ / samples

if load_stds_from_file:
    stds = np.asarray(pl.read_csv(stds_path)).astype('float32').reshape((-1,))
else:
    stds = get_std(train_vals.unbatch())

In [9]:
def normalize_data(features):
    return (features - means) / tf.math.maximum(1.0, stds)

train_ds = train_vals.map(lambda feat, tar: (normalize_data(feat), tar)).unbatch().shuffle(10000).batch(config.train_batch_size)

## Train a Model

In [10]:
inp = keras.Input(shape=(79,))
x = layers.Dense(units=40, activation="relu")(inp)
x = layers.Dense(units=30, activation="relu")(x)
x = layers.Dense(units=20, activation="relu")(x)
x = layers.Dense(units=10, activation="relu")(x)
x = layers.Dense(units=1)(x)
model = keras.Model(inputs=inp, outputs=x)

In [11]:
def r2_loss(y_true, y_pred):
    return tf.math.reduce_sum((y_true - y_pred) ** 2) / tf.math.reduce_sum(y_true ** 2)

model.compile(optimizer="rmsprop", loss=r2_loss, metrics=[keras.metrics.MeanAbsoluteError(), keras.metrics.R2Score()])

In [12]:
history = model.fit(x=train_ds, epochs=3)

Epoch 1/3
[1m1060/1060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 143ms/step - loss: 1.0046 - mean_absolute_error: 0.6009 - r2_score: -0.0035
Epoch 2/3


  self.gen.throw(typ, value, traceback)


[1m1060/1060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 140ms/step - loss: 0.9866 - mean_absolute_error: 0.5951 - r2_score: 0.0132
Epoch 3/3
[1m1060/1060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 142ms/step - loss: 0.9849 - mean_absolute_error: 0.5946 - r2_score: 0.0149


## Save model & Data

In [13]:
pl.DataFrame({'means': np.asarray(means)}).write_csv('means.csv')

In [14]:
model.save('fitted.keras')

In [15]:
pl.DataFrame({'stds': np.asarray(stds)}).write_csv('stds.csv')

## Evaluate Model

In [16]:
test_ds = test_raw.map(lambda i: (
    normalize_data(clean_data(i['features'])),
    i['target']
))