In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import polars as pl # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
import os
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Read Input

In [2]:
base_path = Path("/kaggle/input/jane-street-real-time-market-data-forecasting/")
train_path = base_path / Path("train.parquet")
train_read_paths = [train_path / Path(f"partition_id={i}/part-0.parquet") for i in [0, 1, 2, 3, 4, 5, 6, 7]]
val_read_paths = [train_path / Path(f"partition_id={i}/part-0.parquet") for i in [8]]
test_read_paths = [train_path / Path(f"partition_id={i}/part-0.parquet") for i in [9]]
target = "responder_6"

In [3]:
class Config:
    input_format = {
        "date_id": tf.TensorSpec(shape=(None,), dtype=tf.float32),
        "time_id": tf.TensorSpec(shape=(None,), dtype=tf.float32),
        "symbol_id": tf.TensorSpec(shape=(None,), dtype=tf.float32),
        "weight": tf.TensorSpec(shape=(None,), dtype=tf.float32),
        "features": tf.TensorSpec(shape=(None, 79), dtype=tf.float32),
        "responders": tf.TensorSpec(shape=(None, 9), dtype=tf.float32),
        "target": tf.TensorSpec(shape=(None,), dtype=tf.float32)
    }
    train_batch_size = 32

config = Config()

In [4]:
def chunk_features(chunk):
    return chunk[[i for i in chunk.columns if i.startswith("feature_")]]

def from_files(paths):
    def to_ret():
        for filepath in paths:
            chunk = pl.read_parquet(filepath)
            yield {
                "date_id": chunk["date_id"],
                "time_id": chunk["time_id"],
                "symbol_id": chunk["symbol_id"],
                "weight": chunk["weight"],
                "features": chunk_features(chunk),
                "responders": chunk[[i for i in chunk.columns if i.startswith("responder_")]],
                "target": chunk[target]
            }
    return to_ret

train_raw = tf.data.Dataset.from_generator(
    from_files(train_read_paths),
    output_signature=config.input_format
).prefetch(tf.data.AUTOTUNE).cache()

val_raw = tf.data.Dataset.from_generator(
    from_files(val_read_paths),
    output_signature=config.input_format
).cache()

test_raw = tf.data.Dataset.from_generator(
    from_files(test_read_paths),
    output_signature=config.input_format
)

In [5]:
train_raw.element_spec

{'date_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'time_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'symbol_id': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'weight': TensorSpec(shape=(None,), dtype=tf.float32, name=None),
 'features': TensorSpec(shape=(None, 79), dtype=tf.float32, name=None),
 'responders': TensorSpec(shape=(None, 9), dtype=tf.float32, name=None),
 'target': TensorSpec(shape=(None,), dtype=tf.float32, name=None)}

## Filter Data

In [6]:
def get_mean(dataset):
    batched_ds = dataset.batch(10000)
    def step(acc, value):
        num_non_nan = tf.reduce_sum(tf.cast(tf.math.is_nan(value['features']) == False, tf.float32), axis=0)
        sum_non_nan = tf.reduce_sum(tf.where(tf.math.is_nan(value['features']), tf.zeros_like(value['features']), value['features']), axis=0)
        return (acc[0] + sum_non_nan, acc[1] + num_non_nan)
    
    sum_, rows = batched_ds.reduce((tf.zeros(shape=(79,)), tf.constant(0.0)), step)
    print(rows)
    return sum_ / tf.maximum(rows, tf.ones_like(rows))

means = get_mean(train_raw.unbatch().shuffle(10000).take(1000000))

tf.Tensor(
[      0.       0.       0.       0.       0. 1000000. 1000000. 1000000.
  999155. 1000000. 1000000. 1000000. 1000000. 1000000. 1000000.  971538.
  999944.  995195.  999946.  999946. 1000000.       0. 1000000. 1000000.
 1000000. 1000000.       0.       0. 1000000. 1000000. 1000000.       0.
  988717.  988717. 1000000. 1000000. 1000000. 1000000. 1000000.  826738.
  973869.  943836.  826738.  973869.  943836.  888286.  888286.  999913.
 1000000. 1000000.  848707.  998817.  966886.  848707.  998817.  966886.
  999946.  999946.  988722. 1000000. 1000000. 1000000.  894922.  906767.
  904973.  888286.  888286. 1000000. 1000000. 1000000. 1000000. 1000000.
 1000000.  988722.  988722.  999984.  999984. 1000000. 1000000.], shape=(79,), dtype=float32)


In [7]:
def clean_data(features):
    return tf.where(
        tf.logical_or(tf.math.is_nan(features), tf.math.is_inf(features)), 
        means, 
        features)

train_ds = train_raw.map(lambda i: (
                        clean_data(i['features']),
                        i['target']
                        ))


train_ds = train_ds.unbatch().shuffle(10000).batch(config.train_batch_size)

In [8]:
train_ds

<_BatchDataset element_spec=(TensorSpec(shape=(None, 79), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.float32, name=None))>

## Train a Model

In [9]:
input = keras.Input(shape=(79,))
x = layers.Dense(units=1)(input)
model = keras.Model(inputs=input, outputs=x)

In [10]:
model.compile(optimizer="rmsprop", loss=keras.losses.MeanSquaredError(), metrics=[keras.metrics.MeanAbsoluteError()])

In [11]:
history = model.fit(x=train_ds, epochs=1)

I0000 00:00:1734580913.625708      66 service.cc:145] XLA service 0x5a33500c32a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1734580913.625800      66 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


    116/Unknown [1m4s[0m 1ms/step - loss: 461.7396 - mean_absolute_error: 13.0067

I0000 00:00:1734580913.995834      66 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1084774/1084774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1275s[0m 1ms/step - loss: 1.2805 - mean_absolute_error: 0.6354


  self.gen.throw(typ, value, traceback)


## First Submission!!

In [12]:
import kaggle_evaluation.jane_street_inference_server

In [13]:
# Replace this function with your inference code.
# You can return either a Pandas or Polars dataframe, though Polars is recommended.
# Each batch of predictions (except the very first) must be returned within 1 minute of the batch features being provided.
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Make a prediction."""
    # All the responders from the previous day are passed in at time_id == 0. We save them in a global variable for access at every time_id.
    # Use them as extra features, if you like.

    # Replace this section with your own predictions
    input_features = clean_data(chunk_features(test))
    output_y = np.asarray(model.predict(input_features))
    predictions = pl.DataFrame({
        "row_id": test['row_id'],
        "responder_6": output_y
    })

    
    

    if isinstance(predictions, pl.DataFrame):
        assert predictions.columns == ['row_id', 'responder_6']
    elif isinstance(predictions, pd.DataFrame):
        assert (predictions.columns == ['row_id', 'responder_6']).all()
    else:
        raise TypeError('The predict function must return a DataFrame')
    # Confirm has as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions

In [14]:
predict(pl.read_parquet(base_path / Path("test.parquet/date_id=0/part-0.parquet")), None).head()

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step


row_id,responder_6
i64,"array[f32, 1]"
0,[-0.071093]
1,[-0.071093]
2,[-0.071093]
3,[-0.071093]
4,[-0.071093]


In [15]:
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
