In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found.
# The directory list yielded by os.walk is not needed here, hence the `_` placeholder.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [13]:
# Imports
import pandas as pd
import polars as pl
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
import joblib  # For loading preprocessing objects
import tensorflow as tf
import os

# ------------------------------
# 1. Custom Loss
# ------------------------------
def weighted_r2_loss(y_true, y_pred):
    """
    Weighted R^2 loss for the competition.

    Computes ``sum(w * (y_true - y_pred)^2) / sum(w * y_true^2)`` over the
    whole batch, i.e. one minus a zero-mean weighted R^2, so a lower value
    is better and 0 means a perfect fit.

    Note that the weights are taken as ``|y_true|``; no separate
    sample-weight tensor is passed in.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predictions, broadcast-compatible with ``y_true``.

    Returns:
        Scalar tensor with the loss. Returns 0 instead of NaN when the
        denominator is zero (for example, a batch whose targets are all 0).
    """
    weights = tf.abs(y_true)
    numerator = tf.reduce_sum(weights * tf.square(y_true - y_pred))
    denominator = tf.reduce_sum(weights * tf.square(y_true))
    # An all-zero target batch makes both sums zero; a plain division would
    # yield NaN and poison the gradients, so fall back to 0 in that case.
    return tf.math.divide_no_nan(numerator, denominator)

# ------------------------------
# 2. Load the Model
# ------------------------------
# Update this path to match your Kaggle dataset name
model_path = "/kaggle/input/mlp-model-h5/mlp_model_no_metrics.h5"
# custom_objects maps the name "weighted_r2_loss" to the function defined above so
# Keras can resolve it while deserializing the model.
# NOTE(review): presumably the model was compiled with this loss — confirm.
mlp_model = load_model(model_path, custom_objects={"weighted_r2_loss": weighted_r2_loss})

# ------------------------------
# 3. Load Preprocessing Objects
# ------------------------------
# Update this path to match your uploaded preprocessing dataset name in Kaggle
variance_threshold_path = "/kaggle/input/processing/variance_threshold.pkl"
scaler_path = "/kaggle/input/processing/scaler.pkl"
pca_path = "/kaggle/input/processing/pca.pkl"
expected_features_path = "/kaggle/input/expected/expected_features.pkl"

# Load preprocessing objects
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these files must
# come from a trusted source. The captured cell output links to scikit-learn's
# model-persistence limitations page three times — presumably a version mismatch
# between the environment that saved these objects and this one; confirm.
variance_threshold = joblib.load(variance_threshold_path)
scaler = joblib.load(scaler_path)
pca = joblib.load(pca_path)
# Column names used by preprocess_data() to select the model's input features.
expected_features = joblib.load(expected_features_path)

print("Preprocessing objects and expected features loaded.")

# ------------------------------
# 4. Preprocessing Function
# ------------------------------
def preprocess_data(df):
    """
    Preprocess the data to match the training pipeline.

    Steps, in order: select the ``expected_features`` columns, replace
    missing values with 0, then apply the loaded ``variance_threshold``,
    ``scaler`` and ``pca`` transformers.

    Args:
        df: pandas DataFrame containing at least the columns listed in
            ``expected_features``. It is not modified.

    Returns:
        The output of ``pca.transform`` for the rows of ``df``.

    Raises:
        KeyError: if ``df`` lacks any column named in ``expected_features``
            (raised by the pandas column selection).
    """
    # Select only the expected features and fill missing values without
    # mutating anything in place. The previous `fillna(0, inplace=True)` on
    # the column-selected frame triggered pandas' SettingWithCopyWarning.
    features = df[expected_features].fillna(0)

    # Apply preprocessing steps in the same order as the fitted pipeline objects
    features_high_variance = variance_threshold.transform(features.values)
    features_scaled = scaler.transform(features_high_variance)
    features_pca = pca.transform(features_scaled)

    return features_pca


# ------------------------------
# 5. Prediction Function
# ------------------------------
# Most recent lags frame handed to predict(); stays None until one is supplied
lags_ = None

def predict(test: pl.DataFrame, lags: pl.DataFrame | None = None) -> pl.DataFrame | pd.DataFrame:
    """
    Entry point called by the competition inference server for each batch.

    Args:
        test: polars frame with a ``row_id`` column plus the feature columns
            consumed by ``preprocess_data``.
        lags: optional polars frame of lagged data. When given, it is stored
            in the module-level ``lags_``; it is not otherwise used here.

    Returns:
        A polars frame with ``row_id`` and the flattened model output in a
        ``responder_6`` column.
    """
    global lags_
    if lags is not None:
        # Remember the latest lags so they remain available on later calls
        lags_ = lags

    # The fitted preprocessing objects operate on pandas/NumPy data
    model_inputs = preprocess_data(test.to_pandas())

    # Run the network on the prepared feature matrix
    raw_predictions = mlp_model.predict(model_inputs)

    # Pair each row_id with its prediction in the required output layout
    responder_column = pl.lit(raw_predictions.flatten()).alias("responder_6")
    return test.select(pl.col("row_id"), responder_column)

# ------------------------------
# 6. Inference Server Setup
# ------------------------------
import kaggle_evaluation.jane_street_inference_server as inference_server

# Initialize the inference server
# NOTE: this rebinds the name `inference_server` from the imported module to the
# JSInferenceServer instance; from here on the module is no longer reachable
# under that name.
inference_server = inference_server.JSInferenceServer(predict)

# KAGGLE_IS_COMPETITION_RERUN being set (to any non-empty value) selects serving mode.
if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    # Serve the model for the competition rerun
    inference_server.serve()
else:
    # Run locally for testing against the test and lags parquet files
    inference_server.run_local_gateway((
        "/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet",
        "/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet"
    ))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Preprocessing objects and expected features loaded.
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 129ms/step

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna(0, inplace=True)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
