# In [3]:
import os
import numpy as np
import pandas as pd
import polars as pl
import kaggle_evaluation.jane_street_inference_server

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Names of the 79 model input columns: "feature_00" through "feature_78".
feature_columns = [f"feature_{i:02d}" for i in range(79)]

# State shared across calls to predict() through its `global` declaration.
# Every slot starts unset:
#   lags_          - most recent non-None `lags` frame handed to predict()
#   rf_model       - fitted regressor, created on the first predict() call
#   median_values  - slot for per-feature median fill values
#   symbol_encoder - encoder fitted on the training symbol_id column
lags_ = rf_model = median_values = symbol_encoder = None

def _merge_lags(frame: pd.DataFrame, lag_frame: pd.DataFrame) -> pd.DataFrame:
    """Left-join ``lag_frame`` onto ``frame`` by (symbol_id, date_id).

    The result always has exactly one row per row of ``frame``, in the same
    order, so predictions can be mapped back to ``row_id`` one-to-one.

    Args:
        frame: Rows to enrich (training rows or one inference batch).
        lag_frame: Lagged values keyed by ``symbol_id`` and ``date_id``.

    Returns:
        ``frame`` with the non-key, non-clashing columns of ``lag_frame``
        appended; rows without a match get NaN in those columns.
    """
    keys = ["symbol_id", "date_id"]
    # A non-key column present on both sides would be renamed with _x/_y
    # suffixes by merge() and then be missing under its original name.
    # Keep the copy that already lives in `frame`.
    clashes = [
        col for col in lag_frame.columns
        if col in frame.columns and col not in keys
    ]
    # Duplicate keys on the right side would multiply rows of `frame`.
    # NOTE(review): keeps the last row per key - confirm which row should win
    # if the lag frame can legitimately hold several rows per key.
    lag_frame = lag_frame.drop(columns=clashes).drop_duplicates(
        subset=keys, keep="last"
    )
    return frame.merge(lag_frame, on=keys, how="left")


def _add_derived_features(frame: pd.DataFrame) -> pd.DataFrame:
    """Add ``symbol_id_encoded``, ``date_id_sin`` and ``date_id_cos`` in place.

    Requires the module-level ``symbol_encoder`` to be fitted. Symbols that
    were not seen during fitting (including missing ids) are encoded as -1
    instead of raising, which is what ``LabelEncoder.transform`` would do.

    Args:
        frame: Frame holding ``symbol_id`` and ``date_id`` columns.

    Returns:
        The same ``frame`` object, with the three columns added.
    """
    # classes_ is sorted and unique, so the position of a label in it is
    # exactly the code LabelEncoder assigns; get_indexer yields -1 otherwise.
    known_symbols = pd.Index(symbol_encoder.classes_)
    frame["symbol_id_encoded"] = known_symbols.get_indexer(frame["symbol_id"])

    # Cyclical encoding of date_id with a period of 365.
    angle = 2 * np.pi * frame["date_id"] / 365
    frame["date_id_sin"] = np.sin(angle)
    frame["date_id_cos"] = np.cos(angle)
    return frame


def _fit_model() -> None:
    """Train the random forest and publish everything inference needs.

    Reads the training and lag parquet files, builds the feature matrix,
    grid-searches a ``RandomForestRegressor`` with time-series CV and stores
    the results in module-level globals: ``rf_model``, ``symbol_encoder``,
    ``median_values``, ``lag_columns_`` and ``all_feature_columns_``.
    Nothing from the training frame itself is kept alive afterwards.
    """
    global rf_model, median_values, symbol_encoder
    global lag_columns_, all_feature_columns_

    train_path = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet"
    lags_path = "/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet"
    target_col = "responder_6"

    train_df = pd.read_parquet(train_path)
    lags_df = pd.read_parquet(lags_path)

    # Only columns that the merge really contributes count as lag features
    # (join keys and columns already present in the training frame do not).
    lag_cols = [
        col for col in lags_df.columns
        if col not in ("symbol_id", "date_id") and col not in train_df.columns
    ]

    train_df = _merge_lags(train_df, lags_df)
    train_df = train_df.dropna(subset=[target_col])

    symbol_encoder = LabelEncoder().fit(train_df["symbol_id"])
    train_df = _add_derived_features(train_df)

    columns = (
        feature_columns
        + lag_cols
        + ["symbol_id_encoded", "date_id_sin", "date_id_cos"]
    )

    # Learn the per-column medians once and reuse the very same values for
    # training and for every later inference batch. A column with no observed
    # value has a NaN median; fall back to 0 for it.
    imputer = SimpleImputer(strategy="median")
    imputer.fit(train_df[columns])
    medians = pd.Series(imputer.statistics_, index=columns).fillna(0.0)

    X = train_df[columns].fillna(medians)
    y = train_df[target_col]

    tscv = TimeSeriesSplit(n_splits=5)

    param_grid = {
        "n_estimators": [100, 200],
        "max_depth": [10, 20],
        "min_samples_leaf": [1, 2],
        # 1.0 (= all features) is what 'auto' used to mean for regressors;
        # the string 'auto' is rejected by scikit-learn >= 1.3.
        "max_features": [1.0, "sqrt"],
    }

    grid_search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42, n_jobs=-1),
        param_grid=param_grid,
        cv=tscv,
        scoring="neg_mean_squared_error",
        verbose=1,
    )
    grid_search.fit(X, y)

    lag_columns_ = lag_cols
    all_feature_columns_ = columns
    median_values = medians
    # Publish the model last: predict() uses `rf_model is None` as the
    # "not trained yet" flag, so a failure above leaves that flag intact.
    rf_model = grid_search.best_estimator_


def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Predict ``responder_6`` for one batch of test rows.

    On the first call the model is trained (see ``_fit_model``); the fitted
    model, encoder, feature list and median fill values are cached in module
    globals so that later calls only run inference.

    Args:
        test: Batch to score. Must contain ``row_id``, ``symbol_id`` and
            ``date_id``; feature columns missing from the batch are filled
            with the training medians.
        lags: Lagged values for the batch, or ``None``. The most recent
            non-``None`` frame is remembered and reused for later batches.

    Returns:
        A polars frame with columns ``row_id`` and ``responder_6``, one row
        per row of ``test``, in the same order.
    """
    global lags_

    if lags is not None:
        lags_ = lags

    if rf_model is None:
        _fit_model()

    test_df = test.to_pandas()

    if lags_ is not None:
        test_df = _merge_lags(test_df, lags_.to_pandas())
    else:
        # No lags have been supplied yet: use 0 as a placeholder lag value.
        for lag_col in lag_columns_:
            test_df[lag_col] = 0

    test_df = _add_derived_features(test_df)

    # reindex() tolerates model columns that are absent from this batch
    # (they become NaN) and fixes the column order the model was fitted on.
    test_features = test_df.reindex(columns=all_feature_columns_).fillna(median_values)

    predictions = pl.DataFrame({
        "row_id": test_df["row_id"].to_numpy(),
        "responder_6": rf_model.predict(test_features),
    })

    return predictions
    
# Wrap predict() in the competition's inference server object.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Rerun flag absent or empty: drive predict() through the local gateway,
    # feeding it the test and lags parquet files below.
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )
else:
    # Rerun flag set: hand control to the server.
    inference_server.serve()

# Cell output: ModuleNotFoundError: No module named 'kaggle_evaluation'