In [22]:
import sys
import os
import pandas as pd
import polars as pl
import pickle
import numpy as np
sys.path.append("/Users/kyleee/code/project/kaggle_competition")
from data.kaggle_evaluation import jane_street_inference_server

In [23]:
# 从文件当中读取模型
model_path = "/Users/kyleee/code/project/kaggle_competition/xgb_model.pkl"
with open(model_path, "rb") as f:
    result = pickle.load(f)
model = result['model']
display(model)

In [24]:
feature_nums = 79
feature_names = [f"feature_{i:02d}" for i in range(feature_nums)]
class CONFIG:
    seed = 42
    target_col = "responder_6"
    feature_cols = ["symbol_id", "time_id"] + [f"feature_{idx:02d}" for idx in range(79)]+ [f"responder_{idx}_lag_1" for idx in range(9)] 

In [25]:
lags_ : pl.DataFrame | None = None

# Replace this function with your inference code.
# You can return either a Pandas or Polars dataframe, though Polars is recommended.
# Each batch of predictions (except the very first) must be returned within 10 minutes of the batch features being provided.
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Make a prediction."""
    # All the responders from the previous day are passed in at time_id == 0. We save them in a global variable for access at every time_id.
    # Use them as extra features, if you like.
    global lags_
    if lags is not None:
        lags_ = lags
    predictions = test.select(
        'row_id',
        pl.lit(0.0).alias('responder_6'),
    )
    symbol_ids = test.select('symbol_id').to_numpy()[:, 0]
    if not lags is None:
        lags = lags.group_by(["date_id", "symbol_id"], maintain_order=True).last() # pick up last record of previous date
        test = test.join(lags, on=["date_id", "symbol_id"],  how="left")
    else:
        test = test.with_columns(
            (pl.lit(0.0).alias(f'responder_{idx}_lag_1') for idx in range(9))
        )
        
    print(test.select(CONFIG.feature_cols))
    pred = model.predict(test.select(CONFIG.feature_cols).to_pandas())
    
    predictions = test.select('row_id').with_columns(pl.Series(
            name = 'responder_6', 
            values = np.clip(pred, a_min = -5, a_max = 5),
            dtype  = pl.Float64,
        ))

    # The predict function must return a DataFrame
    assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
    # with columns 'row_id', 'responer_6'
    assert list(predictions.columns) == ['row_id', 'responder_6']
    # and as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions
    
inference_server = jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/Users/kyleee/code/project/kaggle_competition/data/test.parquet',
            '/Users/kyleee/code/project/kaggle_competition/data/lags.parquet',
        )
    )


shape: (39, 90)
┌───────────┬─────────┬────────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ symbol_id ┆ time_id ┆ feature_00 ┆ feature_0 ┆ … ┆ responder ┆ responder ┆ responder ┆ responder │
│ ---       ┆ ---     ┆ ---        ┆ 1         ┆   ┆ _5_lag_1  ┆ _6_lag_1  ┆ _7_lag_1  ┆ _8_lag_1  │
│ i8        ┆ i16     ┆ f32        ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│           ┆         ┆            ┆ f32       ┆   ┆ f32       ┆ f32       ┆ f32       ┆ f32       │
╞═══════════╪═════════╪════════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0         ┆ 0       ┆ 0.0        ┆ 0.0       ┆ … ┆ -0.036595 ┆ -1.305746 ┆ -0.795677 ┆ -0.143724 │
│ 1         ┆ 0       ┆ 0.0        ┆ -0.0      ┆ … ┆ -0.615652 ┆ -1.162801 ┆ -1.205924 ┆ -1.245934 │
│ 2         ┆ 0       ┆ 0.0        ┆ -0.0      ┆ … ┆ -0.378265 ┆ -1.57429  ┆ -1.863071 ┆ -0.027343 │
│ 3         ┆ 0       ┆ 0.0        ┆ 0.0       ┆ … ┆ -0.054984 ┆ 0.329152  