In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import polars as pl # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
import os
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gc

# Input data files are available in the read-only "../input/" directory
# For example, walking that directory with os.walk (snippet omitted from this cell) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Read Input

In [2]:
# Root of the competition data; train.parquet is a directory holding one
# sub-directory per partition (partition_id=<n>/part-0.parquet).
base_path = Path("/kaggle/input/jane-street-real-time-market-data-forecasting/")
train_path = base_path / "train.parquet"

# Partitions 0-7 are used for training, 8 for validation and 9 for the final evaluation.
train_read_paths = [train_path / f"partition_id={i}/part-0.parquet" for i in range(8)]
val_read_paths = [train_path / "partition_id=8/part-0.parquet"]
test_read_paths = [train_path / "partition_id=9/part-0.parquet"]

# CSV with 'mean' and 'std' columns used later to impute and normalize the features.
data_base_path = Path("/kaggle/input/js-feature-exploration/")
feature_data_path = data_base_path / "feature_data.csv"

# Column the model is trained to predict.
target = "responder_6"

In [3]:
class Config:
    """Static settings shared by the input pipeline and the training loop."""

    # Signature of one generator element, passed to from_generator as
    # output_signature. Every field is declared float32 with an unknown
    # leading dimension; the scalar-per-row columns share one spec shape.
    input_format = {
        **{
            column: tf.TensorSpec(shape=(None,), dtype=tf.float32)
            for column in ("date_id", "time_id", "symbol_id", "weight")
        },
        "features": tf.TensorSpec(shape=(None, 79), dtype=tf.float32),
        "responders": tf.TensorSpec(shape=(None, 9), dtype=tf.float32),
        "target": tf.TensorSpec(shape=(None,), dtype=tf.float32),
    }

    # Number of rows per training mini-batch.
    train_batch_size = 16384


config = Config()

In [4]:
def chunk_features(chunk):
    """Return only the ``feature_*`` columns of ``chunk``, in their existing order."""
    feature_columns = list(filter(lambda name: name.startswith("feature_"), chunk.columns))
    return chunk[feature_columns]

def from_files(paths):
    """Build a zero-argument generator function that reads ``paths`` one by one.

    The returned callable yields one dict per parquet file; each dict holds the
    whole file's columns grouped under the keys declared in
    ``Config.input_format``. It is returned as a function (not a generator
    object) so it can be handed to ``tf.data.Dataset.from_generator``.
    """
    def read_partitions():
        for parquet_file in paths:
            frame = pl.read_parquet(parquet_file)
            responder_columns = [name for name in frame.columns if name.startswith("responder_")]

            # Scalar-per-row columns are forwarded unchanged.
            element = {key: frame[key] for key in ("date_id", "time_id", "symbol_id", "weight")}
            element["features"] = chunk_features(frame)
            element["responders"] = frame[responder_columns]
            # `target` is the module-level name of the responder being predicted.
            element["target"] = frame[target]
            yield element

    return read_partitions

def _raw_dataset(paths):
    """Wrap the per-file generator for ``paths`` in a ``tf.data.Dataset``.

    Each dataset element is one whole parquet file, structured as described by
    ``config.input_format``.
    """
    return tf.data.Dataset.from_generator(
        from_files(paths),
        output_signature=config.input_format,
    )


# Training data is re-read every epoch, so cache it after the first pass.
# prefetch goes last so it overlaps the consumer with whatever precedes it
# (the parquet reads on the first epoch, the cache afterwards).
train_raw = _raw_dataset(train_read_paths).cache().prefetch(tf.data.AUTOTUNE)

# Validation runs once per epoch, so it is cached as well.
val_raw = _raw_dataset(val_read_paths).cache()

# The test partition is only consumed once; no cache needed.
test_raw = _raw_dataset(test_read_paths)

In [5]:
# Display the per-field TensorSpecs of a dataset element as a sanity check against config.input_format.
train_raw.element_spec

{'date_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'time_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'symbol_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'weight': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'features': TensorSpec(shape=(None, 79), dtype=tf.float32, name=None),
 'responders': TensorSpec(shape=(None, 9), dtype=tf.float32, name=None),
 'target': TensorSpec(shape=(None,), dtype=tf.float32, name=None)}

## Filter, Clean, Normalize Data

In [6]:
# Precomputed per-feature statistics. The 'mean' and 'std' columns are pulled
# out as float32 vectors so they broadcast against the feature tensors.
# NOTE(review): presumably one row per feature, in feature-column order — confirm against the CSV.
feature_data_df = pl.read_csv(feature_data_path)
means, stds = (
    np.asarray(feature_data_df[stat]).astype('float32')
    for stat in ('mean', 'std')
)

In [7]:
def clean_data(features):
    """Replace every non-finite entry (NaN, +inf, -inf) of ``features``.

    The replacement is the matching entry of the module-level ``means``,
    broadcast against ``features``; finite entries pass through unchanged.
    """
    is_invalid = tf.logical_not(tf.math.is_finite(features))
    return tf.where(is_invalid, means, features)

def normalize_data(features):
    """Center ``features`` on ``means`` and divide by ``stds`` floored at 1.0.

    NOTE(review): because of the 1.0 floor, any feature whose std is below 1
    is only centered, not rescaled. Confirm this is intended rather than a
    small epsilon guard against division by zero.
    """
    centered = features - means
    scale = tf.math.maximum(1.0, stds)
    return centered / scale

def format_data(features):
    """Apply the full preprocessing chain: impute non-finite values, then normalize."""
    cleaned = clean_data(features)
    return normalize_data(cleaned)

# Number of rows held in the shuffle buffer when mixing training rows.
SHUFFLE_BUFFER_SIZE = 10000


def _to_model_inputs(element):
    """Turn a raw dataset element into the ``(features, target)`` pair fed to Keras."""
    return format_data(element['features']), element['target']


# Raw elements are whole files, so they are preprocessed as a block, split
# into individual rows, shuffled, and re-grouped into fixed-size mini-batches.
# The trailing prefetch lets the next batch be assembled while the current
# one is being trained on.
train_ds = (
    train_raw
    .map(_to_model_inputs)
    .unbatch()
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(config.train_batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation keeps each file as a single batch; only preprocessing is applied.
val_ds = val_raw.map(_to_model_inputs)

## Train a Model

In [8]:
# Funnel-shaped MLP: 79 inputs -> 40 -> 30 -> 20 -> 10 -> 5 (all ReLU) -> 1 linear output.
inp = keras.Input(shape=(79,))
x = inp
for width in (40, 30, 20, 10, 5):
    x = layers.Dense(units=width, activation="relu")(x)
x = layers.Dense(units=1)(x)
model = keras.Model(inputs=inp, outputs=x)

In [9]:
def r2_loss(y_true, y_pred):
    """Return ``sum((y_true - y_pred)^2) / sum(y_true^2)`` over the batch.

    This is ``1 - R^2`` with an uncentered baseline (the reference predictor
    is zero rather than the mean of ``y_true``), so lower is better and a
    value of 1.0 matches always predicting zero.

    Both inputs are flattened before subtracting. Without that, calling the
    function directly with ``y_true`` of shape ``(N,)`` and ``y_pred`` of shape
    ``(N, 1)`` would broadcast the difference to ``(N, N)`` and silently
    return a wrong value. Inputs that already share a shape give the same
    result as before.

    Args:
        y_true: Ground-truth targets, any shape holding N values.
        y_pred: Model predictions holding the same N values.

    Returns:
        Scalar tensor with the loss for the batch. The result is inf/NaN if
        every target in the batch is exactly zero.
    """
    y_pred = tf.reshape(tf.convert_to_tensor(y_pred), [-1])
    y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), [-1])
    residual_ss = tf.math.reduce_sum((y_true - y_pred) ** 2)
    total_ss = tf.math.reduce_sum(y_true ** 2)
    return residual_ss / total_ss

# Optimize the custom r2_loss; MAE and Keras' R2Score are tracked alongside it.
model.compile(
    optimizer="rmsprop",
    loss=r2_loss,
    metrics=[
        keras.metrics.MeanAbsoluteError(),
        keras.metrics.R2Score(),
    ],
)

In [10]:
callbacks = [
    # Keeps only the best model seen so far (default monitor: val_loss).
    keras.callbacks.ModelCheckpoint('/kaggle/working/fitted.keras', save_best_only=True),
    # Overwritten after every epoch, so the latest state is always on disk.
    keras.callbacks.ModelCheckpoint('/kaggle/working/intermediate.keras', save_best_only=False),
    # Multiplies the optimizer's current learning rate by 0.9 at the start of
    # every epoch (including the first). Because it reads the current value,
    # re-running fit continues decaying from where the last run stopped.
    keras.callbacks.LearningRateScheduler(lambda epoch, lr: lr * 0.9),
    # Stop after 2 epochs without val_loss improvement and roll the in-memory
    # model back to its best weights, so the evaluation that follows scores
    # the best epoch rather than the last one.
    keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True),
]

history = model.fit(x=train_ds, validation_data=val_ds, epochs=10, callbacks=callbacks)

Epoch 1/10
   2119/Unknown [1m180s[0m 81ms/step - loss: 0.9904 - mean_absolute_error: 0.5963 - r2_score: 0.0095

  self.gen.throw(typ, value, traceback)


[1m2119/2119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 90ms/step - loss: 0.9904 - mean_absolute_error: 0.5963 - r2_score: 0.0095 - val_loss: 0.9923 - val_mean_absolute_error: 0.5555 - val_r2_score: 0.0077 - learning_rate: 9.0000e-04
Epoch 2/10
[1m2119/2119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 77ms/step - loss: 0.9843 - mean_absolute_error: 0.5945 - r2_score: 0.0152 - val_loss: 0.9913 - val_mean_absolute_error: 0.5550 - val_r2_score: 0.0087 - learning_rate: 8.1000e-04
Epoch 3/10
[1m2119/2119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 76ms/step - loss: 0.9832 - mean_absolute_error: 0.5942 - r2_score: 0.0163 - val_loss: 0.9905 - val_mean_absolute_error: 0.5549 - val_r2_score: 0.0095 - learning_rate: 7.2900e-04
Epoch 4/10
[1m2119/2119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 76ms/step - loss: 0.9826 - mean_absolute_error: 0.5941 - r2_score: 0.0169 - val_loss: 0.9903 - val_mean_absolute_error: 0.5550 - val_r2_score: 0.0097 - l

## Evaluate Model

In [11]:
# Held-out partition, preprocessed through the same format_data helper as the
# training and validation sets so the three pipelines cannot drift apart.
test_ds = test_raw.map(lambda element: (
    format_data(element['features']),
    element['target']
))

In [12]:
# Score the held-out partition; the returned list is [loss, mean_absolute_error, r2_score].
model.evaluate(x=test_ds)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 17s/step - loss: 0.9957 - mean_absolute_error: 0.5343 - r2_score: 0.0043


[0.9956994652748108, 0.5342878699302673, 0.004279077053070068]