In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below /kaggle/input and print its full path.
input_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ============================================================
# 🧠 Hull Tactical Market Prediction — Stacked Ensemble v5
# Compatible with Kaggle Inference Server
# ============================================================

import os
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import kaggle_evaluation.default_inference_server

# ============================================================
# CONFIG
# ============================================================
# Directory holding the competition's train.csv / test.csv.
DATA_PATH = Path('/kaggle/input/hull-tactical-market-prediction/')

# convert_ret_to_signal maps a predicted return r to
# clip(r * SIGNAL_MULTIPLIER + 1, MIN_SIGNAL, MAX_SIGNAL).
MIN_SIGNAL = 0.0
MAX_SIGNAL = 2.0
SIGNAL_MULTIPLIER = 400.0

# Row offsets used when building "<col>_lag<k>" features.
LAG_PERIODS = (1, 5)
# Columns that receive lag features (U1/U2 are engineered in create_features).
LAG_CANDIDATES = ("S2", "E2", "I2", "P9", "U1", "U2")
# Raw columns kept as model inputs.
VARS_TO_KEEP_BASE = [
    "S2",
    "E2",
    "E3",
    "P9",
    "S1",
    "S5",
    "I2",
    "P8",
    "P10",
    "P12",
    "P13",
]

# ============================================================
# HELPERS
# ============================================================

def add_lag_features(df: pl.DataFrame) -> pl.DataFrame:
    """Append "<col>_lag<k>" columns for every LAG_CANDIDATES column present in `df`.

    One column is added per (candidate column, lag in LAG_PERIODS) pair; the
    frame is returned untouched when none of the candidate columns exist.

    NOTE(review): the shift is evaluated inside each `date_id` partition. If
    `date_id` is unique per row, every partition holds a single row and all
    lag values come out null -- confirm this partitioning is intended.
    """
    lag_exprs = [
        pl.col(name).shift(period).over('date_id').alias(f"{name}_lag{period}")
        for name in LAG_CANDIDATES
        if name in df.columns
        for period in LAG_PERIODS
    ]
    return df.with_columns(lag_exprs) if lag_exprs else df

def load_train() -> pl.DataFrame:
    """Read train.csv from DATA_PATH and prepare it for feature engineering.

    `market_forward_excess_returns` is renamed to `target`, every column except
    `date_id` is cast to Float64 (non-strict, so unparseable values become
    null), and the final 10 rows are dropped.
    """
    raw = pl.read_csv(DATA_PATH / "train.csv")
    renamed = raw.rename({'market_forward_excess_returns':'target'})
    as_float = renamed.with_columns(
        pl.exclude('date_id').cast(pl.Float64, strict=False)
    )
    # A negative count keeps everything except the last 10 rows.
    return as_float.head(-10)

def load_test() -> pl.DataFrame:
    """Read test.csv from DATA_PATH and prepare it for feature engineering.

    `lagged_forward_returns` is renamed to `target` so the frame shares that
    column name with the training data, and every column except `date_id` is
    cast to Float64 (non-strict, so unparseable values become null).
    """
    raw = pl.read_csv(DATA_PATH / "test.csv")
    renamed = raw.rename({'lagged_forward_returns':'target'})
    return renamed.with_columns(
        pl.exclude('date_id').cast(pl.Float64, strict=False)
    )

def create_features(df: pl.DataFrame) -> pl.DataFrame:
    """Build the model feature frame from raw competition columns.

    Steps:
      1. Add engineered columns `U1 = I2 - I1` and
         `U2 = M11 / mean(I2, I9, I7)`.
      2. Add lag columns via `add_lag_features`.
      3. Keep `date_id`, `target` (only if present) and the feature columns
         (VARS_TO_KEEP_BASE + U1/U2 + lag columns) that exist in `df`.
      4. Impute nulls in every feature with the column's EWMA (com=0.5), then
         fill whatever is still null with 0.0.

    Args:
        df: Frame containing at least `date_id`, `I1`, `I2`, `I7`, `I9`, `M11`.

    Returns:
        Frame with columns `date_id`, optional `target`, then the features,
        with no nulls left in any feature column.
    """
    # A zero denominator makes the ratio +/-inf (or NaN for 0/0). fill_null
    # does not touch those values, so they would leak into the feature matrix.
    # Turn non-finite ratios into null so they go through the same imputation
    # as ordinary missing values.
    u2_ratio = pl.col("M11") / ((pl.col("I2") + pl.col("I9") + pl.col("I7")) / 3)
    df = df.with_columns(
        (pl.col("I2") - pl.col("I1")).alias("U1"),
        pl.when(u2_ratio.is_finite()).then(u2_ratio).otherwise(None).alias("U2"),
    )
    df = add_lag_features(df)

    feature_candidates = VARS_TO_KEEP_BASE + ["U1", "U2"]
    lag_cols = [c for c in df.columns if any(c.endswith(f"_lag{p}") for p in LAG_PERIODS)]
    all_features = [c for c in feature_candidates + lag_cols if c in df.columns]

    selection_cols = ["date_id"]
    if 'target' in df.columns:
        selection_cols.append("target")
    selection_cols.extend(all_features)

    df = df.select(selection_cols)

    # Impute nulls with EWMA then fill 0. Both fills are chained on the original
    # column, which is equivalent to two consecutive with_columns passes.
    # with_columns replaces same-named columns in place, so column order is
    # already `selection_cols` and no trailing select is needed.
    return df.with_columns([
        pl.col(c).fill_null(pl.col(c).ewm_mean(com=0.5)).fill_null(0.0)
        for c in all_features
    ])

def join_train_test(train, test):
    """Stack `train` on top of `test`, keeping only columns present in both.

    Column order follows `train`.
    """
    test_columns = set(test.columns)
    shared = [name for name in train.columns if name in test_columns]
    return pl.concat(
        [frame.select(shared) for frame in (train, test)],
        how="vertical",
    )

def convert_ret_to_signal(arr: np.ndarray) -> np.ndarray:
    """Map predicted returns to signals.

    Computes `arr * SIGNAL_MULTIPLIER + 1` and clips the result to
    [MIN_SIGNAL, MAX_SIGNAL].
    """
    raw_signal = 1 + SIGNAL_MULTIPLIER * arr
    return np.clip(raw_signal, MIN_SIGNAL, MAX_SIGNAL)

# ============================================================
# LOAD + PROCESS
# ============================================================
# Engineer features once over train and test stacked together, then split the
# result back apart using each source frame's date_id values.
train, test = load_train(), load_test()
df = create_features(join_train_test(train, test))

train = df.filter(pl.col('date_id').is_in(train.get_column('date_id')))
test = df.filter(pl.col('date_id').is_in(test.get_column('date_id')))

# Model inputs: every column except the row key and the label.
FEATURES = [name for name in test.columns if name not in ('date_id', 'target')]

# ============================================================
# SCALE + CONVERT
# ============================================================
# Fit the scaler on training features only, then apply the same transform to
# both splits so the test data never influences the scaling statistics.
scaler = StandardScaler()
scaler.fit(train.select(FEATURES).to_numpy())
X_train, X_test = (
    scaler.transform(frame.select(FEATURES).to_numpy())
    for frame in (train, test)
)
y_train = train.get_column('target').to_numpy()

# ============================================================
# STACKED ENSEMBLE
# ============================================================
# Level-0 learners as (name, estimator) pairs; each contributes one column of
# predictions to the meta-model's input.
base_models = [
    ("ridge", Ridge(random_state=42, alpha=0.5)),
    (
        "lgbm",
        LGBMRegressor(
            random_state=42,
            n_estimators=600,
            learning_rate=0.03,
            num_leaves=64,
            subsample=0.8,
            colsample_bytree=0.8,
        ),
    ),
    (
        "xgb",
        XGBRegressor(
            random_state=42,
            n_estimators=700,
            learning_rate=0.03,
            max_depth=7,
            subsample=0.8,
            colsample_bytree=0.8,
        ),
    ),
    (
        "cat",
        CatBoostRegressor(
            random_seed=42,
            iterations=700,
            learning_rate=0.03,
            depth=7,
            verbose=0,
        ),
    ),
]

# Level-1 learner trained on the base models' out-of-fold predictions.
meta_model = Ridge(random_state=42, alpha=1.0)

# NOTE(review): shuffle=True assigns rows to folds at random; if rows are
# time-ordered this mixes past and future within a fold -- confirm intended.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# One row per sample, one column per base model.
meta_train = np.zeros((len(X_train), len(base_models)))
meta_test = np.zeros((len(X_test), len(base_models)))

print("\n--- Starting 5-Fold Stacked Training ---")

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train, y_train), 1):
    print(f"\nFold {fold}")
    X_tr, X_val = X_train[tr_idx], X_train[val_idx]
    y_tr, y_val = y_train[tr_idx], y_train[val_idx]

    # Test-set predictions of each base model for this fold only.
    fold_test_preds = np.zeros((X_test.shape[0], len(base_models)))

    for j, (name, model) in enumerate(base_models):
        model.fit(X_tr, y_tr)
        val_pred = model.predict(X_val)
        # Out-of-fold predictions become the meta-model's training inputs.
        meta_train[val_idx, j] = val_pred
        fold_test_preds[:, j] = model.predict(X_test)
        # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
        # taking the square root explicitly works on every version.
        rmse = np.sqrt(mean_squared_error(y_val, val_pred))
        print(f"{name:<6} RMSE: {rmse:.5f}")

    # Average the per-fold test predictions across folds.
    meta_test += fold_test_preds / kf.n_splits

print("\nTraining meta Ridge model...")
meta_model.fit(meta_train, y_train)
final_preds = meta_model.predict(meta_test)

# After the CV loop each estimator still holds the fit from the last fold only
# (a subset of the training rows). predict() reuses these estimator objects, so
# refit them on the full training set before serving.
print("\nRefitting base models on full training data...")
for name, model in base_models:
    model.fit(X_train, y_train)

# ============================================================
# FINAL PREDICTION FUNCTION
# ============================================================
def predict(test_chunk: pl.DataFrame) -> float:
    """Turn one raw test chunk into a clipped signal.

    The chunk is run through `create_features`, scaled with the fitted
    `scaler`, scored by every base model, and the stacked predictions are fed
    to `meta_model`. The result is converted with `convert_ret_to_signal`.

    Returns:
        The signal as a Python float. `.item()` requires exactly one element,
        so the chunk must yield a single prediction.
    """
    features = create_features(test_chunk).select(FEATURES).to_numpy()
    scaled = scaler.transform(features)

    # One column of predictions per base model, in base_models order.
    level0 = np.column_stack([
        estimator.predict(scaled) for _name, estimator in base_models
    ])
    signal = convert_ret_to_signal(meta_model.predict(level0))
    return signal.item()

# ============================================================
# LOCAL TEST
# ============================================================
# Wrap predict() in the Kaggle inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Outside a competition rerun: smoke-test predict() on the first test row.
    local_test_chunk = load_test().head(1)
    signal = predict(local_test_chunk)
    print(f"\nLocal Test Prediction Signal: {signal:.4f}")
else:
    # Competition rerun: hand control to the server.
    inference_server.serve()
