In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import polars as pl # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
import os
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import kaggle_evaluation.jane_street_inference_server

The evaluation API requires that you set up a server which will respond to inference requests. We have already defined the server; you just need to write the predict function. When we evaluate your submission on the hidden test set, the client defined in `jane_street_gateway` will run in a different container with direct access to the hidden test set and hand off the data timestep by timestep.



Your code will always have access to the published copies of the files.

In [2]:
# Location of the per-feature statistics table (one 'mean' and one 'std' column).
data_base_path = Path("/kaggle/input/js-feature-exploration/")
feature_data_path = data_base_path / "feature_data.csv"
feature_data_df = pl.read_csv(feature_data_path)

# Both statistics are kept as float32 arrays; the cleaning and normalisation
# helpers below read them as module-level constants.
means, stds = (
    np.asarray(feature_data_df[stat_name]).astype('float32')
    for stat_name in ('mean', 'std')
)

def r2_loss(y_true, y_pred):
    """Return the residual sum of squares divided by the total sum of squares.

    The total sum of squares is measured around the mean of all elements of
    ``y_true``. Lower is better; the ratio is 0 for a perfect prediction.
    """
    residual = y_true - y_pred
    centered = y_true - tf.reduce_mean(y_true)
    return tf.math.reduce_sum(residual ** 2) / tf.math.reduce_sum(centered ** 2)

# Column-name patterns, intended for use with re.fullmatch.
# Raw strings are used so that `\d` reaches the regex engine as a digit class
# instead of being parsed as an (invalid) string escape sequence, which newer
# Python versions warn about.
feature_format = r'feature_\d\d'
responder_format = r'responder_\d'
lag_format = r'responder_\d_lag'
def chunk_features(chunk):
    """Return only the columns of `chunk` whose names fully match `feature_format`."""
    selected = [name for name in chunk.columns if re.fullmatch(feature_format, name)]
    return chunk[selected]
def chunk_lags(chunk):
    """Return only the columns of `chunk` whose names fully match `lag_format`."""
    selected = [name for name in chunk.columns if re.fullmatch(lag_format, name)]
    return chunk[selected]

def clean_features(features):
    """Replace NaN and infinite entries of `features` with values from `means`."""
    is_invalid = tf.logical_or(tf.math.is_nan(features), tf.math.is_inf(features))
    return tf.where(is_invalid, means, features)
def normalize_features(features):
    """Subtract `means` and divide by `stds`, with the divisor floored at 1.0."""
    centered = features - means
    # Standard deviations below 1.0 are replaced by 1.0, so such features are
    # only centred, never scaled up.
    scale = tf.math.maximum(1.0, stds)
    return centered / scale
def clean_lags(lags):
    """Replace NaN and infinite entries of `lags` with 0.0."""
    is_invalid = tf.logical_or(tf.math.is_nan(lags), tf.math.is_inf(lags))
    return tf.where(is_invalid, 0.0, lags)

def format_data(features, lags):
    """Build the model input dict from raw feature and lag columns.

    Returns a dict with key 'feature' (cleaned, then normalised features) and
    key 'lags' (cleaned lags).
    """
    cleaned_features = clean_features(features)
    model_inputs = {}
    model_inputs['feature'] = normalize_features(cleaned_features)
    model_inputs['lags'] = clean_lags(lags)
    return model_inputs

In [3]:
# Two independent copies of the same fitted checkpoint are loaded: `old_model`
# and `new_model` start out identical but are separate objects.
_fitted_model_path = '/kaggle/input/js-prediction-training-1/fitted.keras'
_fitted_custom_objects = {'r2_loss': r2_loss}

old_model = keras.models.load_model(_fitted_model_path, custom_objects=_fitted_custom_objects)
new_model = keras.models.load_model(_fitted_model_path, custom_objects=_fitted_custom_objects)

In [4]:
# Pattern for the column names of the original data set: the id columns,
# `weight`, two-digit features and single-digit responders. It is a raw string
# so that `\d` is a regex digit class rather than an invalid string escape.
original_format = r"(date_id)|(time_id)|(symbol_id)|(weight)|(feature_\d\d)|(responder_\d)"

In [5]:
# Model input columns: feature_00 .. feature_78 followed by
# responder_0_lag .. responder_8_lag.
feature_names = (
    [f'feature_{idx:02d}' for idx in range(79)]
    + [f'responder_{idx}_lag' for idx in range(9)]
)

# Responder columns: responder_0 .. responder_8.
responder_names = [f'responder_{idx}' for idx in range(9)]


In [6]:
def train_online_model(past_df, num_iterations = 15, batch_size = 50000, target = 'responder_6'):
    """Reload the fitted checkpoint and fine-tune it on recently observed rows.

    Parameters
    ----------
    past_df : DataFrame or None
        Rows holding the feature columns, the lag columns and the `target`
        column. If it is None or empty the freshly loaded model is returned
        without any training.
    num_iterations : int
        Number of epochs passed to `model.fit`.
    batch_size : int
        Maximum number of rows sampled from `past_df`; the sample is fitted as
        one single batch.
    target : str
        Name of the label column. Defaults to 'responder_6', the column this
        notebook submits predictions for.

    Returns
    -------
    The loaded (and possibly fine-tuned) Keras model.
    """
    print("training new model")
    model = keras.models.load_model('/kaggle/input/js-prediction-training-1/fitted.keras', custom_objects={
        'r2_loss': r2_loss
    })
    model.optimizer.learning_rate.assign(1e-4)

    if past_df is None or past_df.shape[0] == 0:
        return model

    sample_size = min(past_df.shape[0], batch_size)
    train_df = past_df.sample(sample_size)

    x_train = format_data(chunk_features(train_df), chunk_lags(train_df))
    # The label column used to be read through an undefined global `target`,
    # which raised NameError on the first real training call. It is now an
    # explicit parameter, and the labels are handed to Keras as a NumPy array.
    y_train = train_df[target].to_numpy()
    model.fit(x=x_train, y=y_train, batch_size=sample_size, epochs=num_iterations)

    return model

In [7]:
# Key columns identifying one row: day, time step within the day, and symbol.
_row_key_cols = ['date_id', 'time_id', 'symbol_id']

# Column selections used by PastStorage: model inputs only, responders only,
# and both together (each followed by the key columns).
past_data_cols = feature_names + _row_key_cols
past_responder_cols = responder_names + _row_key_cols
past_df_cols = feature_names + responder_names + _row_key_cols

class PastStorage:
    """Buffer that pairs stored feature rows with responder values delivered later.

    Feature rows are collected in `last_day_data`. When a lags frame arrives,
    the collected rows are joined with it, rows with missing responders are
    dropped, and the result is appended to `past_df`. Both frames are capped
    at `max_size` rows (the most recent rows are kept).
    """

    def __init__(self, max_size = 100000):
        # Rows that already carry responder values.
        self.past_df = None
        # Feature rows collected since the last lags delivery.
        self.last_day_data = None
        # Row cap applied to both frames at the end of every `data_inc` call.
        self.max_size = max_size

    @staticmethod
    def reformat_lags(lags: pl.DataFrame | None):
        """Rename every column whose name contains `responder_<i>` to exactly `responder_<i>`.

        Handles i = 0..8. Returns None when `lags` is None.
        """
        if lags is None:
            return None

        last_day_ans = lags
        for i in range(9):
            for j in [col for col in last_day_ans.columns if re.search(f'responder_{i}', col)]:
                last_day_ans = last_day_ans.rename({j: f"responder_{i}"})
        return last_day_ans

    @staticmethod
    def append_to(df: pl.DataFrame | None, chunk: pl.DataFrame | None):
        """Concatenate `chunk` below `df`, treating None on either side as 'nothing'."""
        if chunk is None:
            return df
        if df is None:
            return chunk
        return pl.concat([df, chunk])

    def data_inc(self, test_full: pl.DataFrame, lags: pl.DataFrame | None):
        """Ingest one batch of feature rows and, if present, a lags frame.

        Parameters
        ----------
        test_full : pl.DataFrame or None
            Batch containing at least the `past_data_cols` columns; it is
            appended to `last_day_data`.
        lags : pl.DataFrame or None
            Responder values for previously stored rows. When given (and rows
            are pending), the pending rows are joined with it on
            date_id/time_id/symbol_id and moved into `past_df`.
        """
        # use the data from the previous day for today's information
        if lags is not None:
            if self.last_day_data is not None:
                last_day_ans = PastStorage.reformat_lags(lags)
                to_append = self.last_day_data.join(last_day_ans, ['date_id', 'time_id', 'symbol_id'], how='left')
                # Keep only rows in which every responder is present and not NaN.
                for i in range(9):
                    to_append = to_append.filter(to_append[f'responder_{i}'].is_not_nan() & to_append[f'responder_{i}'].is_not_null())

                self.past_df = PastStorage.append_to(self.past_df, to_append[past_df_cols])
                print("Check for past dataframe cleanness:", self.past_df.shape, self.past_df['responder_0'].is_null().sum())

                # The pending rows have been consumed; start a new collection.
                self.last_day_data = None

        # append the data to the previous day... should always be the same as last_day_id
        if test_full is not None:
            self.last_day_data = PastStorage.append_to(self.last_day_data, test_full[past_data_cols])

        # Enforce the row cap, keeping the most recently appended rows.
        if self.past_df is not None:
            self.past_df = self.past_df.tail(self.max_size)
        if self.last_day_data is not None:
            self.last_day_data = self.last_day_data.tail(self.max_size)

In [8]:
# Number of lag deliveries between two re-trainings of `new_model`.
days_per_update = 1
# Lag deliveries counted since the last re-training.
cur_day = 0

# Most recent lags frame handed to `predict` (None until the first one arrives).
lags_: pl.DataFrame | None = None
# Buffer of past rows used as online training data.
storage = PastStorage()

def reset_vars():
    """Reset the module-level inference state to its initial condition.

    Clears the cached lags, replaces the storage with an empty `PastStorage`,
    zeroes the day counter and reloads `new_model` from the fitted checkpoint.
    """
    # `new_model` must be declared global here: without it the reloaded model
    # was bound to a local name and the module-level model was never reset.
    global lags_, storage, cur_day, new_model
    lags_ = None
    storage = PastStorage()
    cur_day = 0
    new_model = keras.models.load_model('/kaggle/input/js-prediction-training-1/fitted.keras', custom_objects={
        'r2_loss': r2_loss
    })

def _align_to_row_ids(values, produced_ids, wanted_ids):
    """Reorder `values` (paired with `produced_ids`) so they follow `wanted_ids`.

    Returns `values` unchanged when both id sequences are already identical.
    """
    produced_ids = np.asarray(produced_ids)
    wanted_ids = np.asarray(wanted_ids)
    if np.array_equal(produced_ids, wanted_ids):
        return values
    position_of = {row_id: pos for pos, row_id in enumerate(produced_ids.tolist())}
    return values[[position_of[row_id] for row_id in wanted_ids.tolist()]]

# Replace this function with your inference code.
# You can return either a Pandas or Polars dataframe, though Polars is recommended.
# Each batch of predictions (except the very first) must be returned within 1 minute of the batch features being provided.
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Predict `responder_6` for one batch and update the online-learning state.

    Parameters
    ----------
    test : pl.DataFrame
        Batch to score; must contain `row_id`, the id columns and the feature
        columns.
    lags : pl.DataFrame or None
        Newly delivered lagged responders, or None. The last non-None frame is
        cached in `lags_` and reused for later batches.

    Returns
    -------
    pl.DataFrame
        Columns `row_id` and `responder_6`, one row per row of `test`. The
        prediction is the mean of `old_model` and `new_model`.

    Side effects: stores the batch in `storage`; whenever `lags` is given the
    day counter advances and, every `days_per_update` deliveries, `new_model`
    is re-trained from `storage.past_df`, which is then cleared.
    """
    global lags_, storage, old_model, new_model, cur_day, days_per_update

    if lags is not None:
        lags_ = lags

    if lags_ is not None:
        # Take the last row per (date_id, symbol_id) of the cached lags and
        # rename its responder columns to `responder_<i>_lag`.
        last_reading = lags_.group_by(('date_id', 'symbol_id'), maintain_order=True).last()
        for i in range(9):
            for j in [col for col in last_reading.columns if re.search(f'responder_{i}', col)]:
                last_reading = last_reading.rename({j: f"responder_{i}_lag"})

        selected_cols = ['date_id', 'symbol_id'] + [i for i in last_reading.columns if re.fullmatch(lag_format, i)]
        # Shift date_id forward by one so the values attach to the following day.
        join_to = last_reading.with_columns(last_reading['date_id'] + 1)[selected_cols]
        test_grouped = test.join(join_to, ['date_id', 'symbol_id'], how='left')
    else:
        # No lags seen yet: add all-null lag columns so the model input shape is unchanged.
        test_grouped = test
        for i in range(9):
            test_grouped = test_grouped.with_columns(pl.lit(None).cast(pl.Float32).alias(f"responder_{i}_lag"))

    input_features = format_data(chunk_features(test_grouped), chunk_lags(test_grouped))
    old_y = np.asarray(old_model.predict(input_features, verbose=0)).reshape((-1,))
    new_y = np.asarray(new_model.predict(input_features, verbose=0)).reshape((-1,))
    output_y = (old_y + new_y) / 2
    # The predictions follow the row order of the joined frame. polars does not
    # guarantee that a join keeps the left frame's order unless asked to, so
    # re-align explicitly before pairing the values with `test['row_id']`.
    output_y = _align_to_row_ids(
        output_y,
        test_grouped['row_id'].to_numpy(),
        test['row_id'].to_numpy(),
    )
    predictions = pl.DataFrame({
        "row_id": test['row_id'],
        "responder_6": output_y
    })

    storage.data_inc(test_grouped, lags)
    if lags is not None:
        cur_day += 1
        if cur_day == days_per_update:
            new_model = train_online_model(storage.past_df)
            storage.past_df = None
        cur_day %= days_per_update

    if isinstance(predictions, pl.DataFrame):
        assert predictions.columns == ['row_id', 'responder_6']
    elif isinstance(predictions, pd.DataFrame):
        assert (predictions.columns == ['row_id', 'responder_6']).all()
    else:
        raise TypeError('The predict function must return a DataFrame')
    # Confirm has as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions

When your notebook is run on the hidden test set, inference_server.serve must be called within 15 minutes of the notebook starting or the gateway will throw an error. If you need more than 15 minutes to load your model you can do so during the very first `predict` call, which does not have the usual 1 minute response deadline.

In [9]:
# Root directory of the competition data set (sample test and lags files).
base_path = Path(
    "/kaggle/input/jane-street-real-time-market-data-forecasting/"
)

In [10]:
# Smoke test: score the sample test file from a clean state, without any lags.
reset_vars()
predict(pl.read_parquet(base_path / "test.parquet"), lags=None).head()

row_id,responder_6
i64,f32
0,-0.005609
1,-0.005609
2,-0.005609
3,-0.005609
4,-0.005609


In [11]:
# Smoke test: score the sample test file from a clean state, this time with lags.
reset_vars()

# The sample lags are moved back one day (date_id - 1) before being passed in.
sample_lags = pl.read_parquet(base_path / 'lags.parquet').with_columns(pl.col('date_id') - 1)
predict(pl.read_parquet(base_path / "test.parquet"), lags=sample_lags).head()

training new model


row_id,responder_6
i64,f32
0,-0.015311
1,-0.019428
2,-0.023928
3,-0.006346
4,-0.009307


In [12]:
# Start serving from a clean state.
reset_vars()
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

# In a competition rerun the real gateway connects to the server; otherwise a
# local gateway replays the published sample files.
is_competition_rerun = bool(os.getenv('KAGGLE_IS_COMPETITION_RERUN'))
if not is_competition_rerun:
    local_data_paths = (
        '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
        '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
    )
    inference_server.run_local_gateway(local_data_paths)
else:
    inference_server.serve()

training new model
