In [None]:
#=========================================================
#CELL 1 — IMPORTS
#=========================================================

import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from kaggle_evaluation.default_inference_server import DefaultInferenceServer



In [None]:
# Name of the regression target column in the Hull training data.
# The training frame itself is loaded in the next cell; the earlier read of a
# pre-merged CSV here was overwritten before use, so it has been dropped.
TARGET = "market_forward_excess_returns"



In [None]:
# ================================================
# LOAD HULL DATA
# ================================================
train = pd.read_csv("/kaggle/input/hull-tactical-market-prediction/train.csv")
test  = pd.read_csv("/kaggle/input/hull-tactical-market-prediction/test.csv")

# ================================================
# LOAD DAILY BREADTH FEATURES (FROM YOUR DATASET)
# ================================================
daily = pd.read_csv("/kaggle/input/dataset/sp500_breadth_features.csv")

daily["date"] = pd.to_datetime(daily["date"])
daily = daily.sort_values("date").reset_index(drop=True)
# The merge key is simply the row position after sorting by date.
# NOTE(review): this assumes row i of the breadth file corresponds to the
# competition's date_id == i -- confirm the two calendars really line up.
daily["date_id"] = daily.index

print("Breadth file columns:", daily.columns.tolist())
print("Min daily date_id:", daily.date_id.min())
print("Max daily date_id:", daily.date_id.max())


# ================================================
# MERGE USING date_id
# ================================================
train2 = train.merge(daily, on="date_id", how="left")
test2  = test.merge(daily, on="date_id", how="left")

# Forward-fill gaps, then zero whatever is still missing at the start.
# `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling and
# matches what predict() already uses.
train2 = train2.ffill().fillna(0)
test2  = test2.ffill().fillna(0)

# ================================================
# SAVE TO WORKING FOLDER (ALLOWED)
# ================================================
train2.to_csv("/kaggle/working/train_merged.csv", index=False)
test2.to_csv("/kaggle/working/test_merged.csv", index=False)

print("Merged train shape:", train2.shape)
print("Merged test shape:", test2.shape)


In [None]:
# ============================================================
# FEATURE ENGINEERING ON MERGED TRAIN
# ============================================================

# Work on a copy so the merged frame stays untouched.
df = train2.copy()

# Breadth indicators that receive 1-step and 2-step lagged copies.
lag_cols = ["breadth_up", "avg_ret_1d", "dispersion_1d", "avg_vol_20d"]

for col in lag_cols:
    if col not in df.columns:
        raise ValueError(f"Missing required feature column: {col}")
    for step in (1, 2):
        df[f"{col}_lag{step}"] = df[col].shift(step)

# One-step change ("momentum") of the dispersion series.
df["dispersion_mom"] = df["dispersion_1d"].diff()

# Volatility regime flag: 1 when avg_vol_20d is above its own 60-row
# rolling mean, 0 otherwise (including the warm-up rows where the mean is NaN).
vol_ma = df["avg_vol_20d"].rolling(60).mean()
df["regime"] = df["avg_vol_20d"].gt(vol_ma).astype(int)

# shift()/diff() leave NaNs in the leading rows; zero them.
df = df.fillna(0)

# Identifier, date and return columns must never be fed to the models.
exclude = [
    "date_id",
    "date",
    "forward_returns",
    "risk_free_rate",
    "market_forward_excess_returns",  # target
]

feature_cols = [name for name in df.columns if name not in exclude]

print("Number of features:", len(feature_cols))
print("Sample features:", feature_cols[:15])


In [None]:
# Walk-forward cross-validation: one LightGBM, one XGBoost and one CatBoost
# model is trained per fold and kept for the inference-time ensemble.
N_SPLITS = 8
EARLY_STOPPING_ROUNDS = 200

tscv = TimeSeriesSplit(n_splits=N_SPLITS)

models_lgb = []
models_xgb = []
models_cat = []

# Hyper-parameters are the same for every fold, so build them once.
lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.01,
    "num_leaves": 48,
    "feature_fraction": 0.75,
    "bagging_fraction": 0.75,
    "bagging_freq": 5,
    "min_data_in_leaf": 25,
    "verbose": -1,
}

for fold, (tr, va) in enumerate(tscv.split(df)):
    print(f"Training Fold {fold+1}/{N_SPLITS}")

    X_tr, y_tr = df.iloc[tr][feature_cols], df.iloc[tr][TARGET]
    X_va, y_va = df.iloc[va][feature_cols], df.iloc[va][TARGET]

    # LightGBM
    model_lgb = lgb.train(
        lgb_params,
        lgb.Dataset(X_tr, y_tr),
        valid_sets=[lgb.Dataset(X_va, y_va)],
        num_boost_round=3000,
        callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=False)]
    )
    models_lgb.append(model_lgb)

    # XGBoost
    # Previously the eval_set was scored but never acted upon, so all 2000
    # trees were always fitted. Early stopping now mirrors the LightGBM setup.
    model_xgb = XGBRegressor(
        n_estimators=2000,
        learning_rate=0.01,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.75,
        objective="reg:squarederror",
        eval_metric="rmse",
        tree_method="hist",
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    )
    model_xgb.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
    models_xgb.append(model_xgb)

    # CatBoost
    model_cat = CatBoostRegressor(
        iterations=2000,
        learning_rate=0.01,
        depth=6,
        loss_function="RMSE",
        verbose=False
    )
    model_cat.fit(X_tr, y_tr, eval_set=(X_va, y_va), verbose=False)
    models_cat.append(model_cat)


In [None]:
def ensemble_predict(df):
    """Two-model blend: 60% LightGBM fold average + 40% XGBoost fold average.

    Selects ``feature_cols`` from ``df``, zero-fills missing values and
    averages each model family's per-fold predictions before blending.
    The CatBoost models are not part of this blend.
    """
    features = df[feature_cols].fillna(0)

    def _fold_average(fold_models):
        # Mean prediction across the models trained on the different folds.
        return np.mean([model.predict(features) for model in fold_models], axis=0)

    lgb_part = _fold_average(models_lgb)
    xgb_part = _fold_average(models_xgb)
    return 0.6 * lgb_part + 0.4 * xgb_part

def ensemble_pred(X):
    """Three-model blend of the per-fold averages.

    Weights: 0.5 LightGBM, 0.3 XGBoost, 0.2 CatBoost. ``X`` is passed to the
    models as-is (no column selection or NaN filling happens here).
    """
    fold_means = [
        np.mean([model.predict(X) for model in family], axis=0)
        for family in (models_lgb, models_xgb, models_cat)
    ]
    lgb_part, xgb_part, cat_part = fold_means
    return 0.5*lgb_part + 0.3*xgb_part + 0.2*cat_part


In [None]:

# Breadth columns that get lag features (same list as the feature-engineering cell).
lag_cols = ["breadth_up", "avg_ret_1d", "dispersion_1d", "avg_vol_20d"]

# Baseline allocation, and the running state that adaptive_alloc() smooths
# against; the state starts out at the baseline.
BASE = 0.10
prev_alloc = BASE

def adaptive_alloc(raw_signal, regime):
    """Turn one raw model signal into a smoothed allocation in [0, 2].

    Steps: clamp the signal to +/-0.02, scale it (x5 when ``regime == 0``,
    x3 otherwise), add it to ``BASE``, shrink the result by 0.7 when
    ``regime == 1``, blend it with the previous allocation and clip.

    Side effect: reads and overwrites the module-level ``prev_alloc``, so the
    result depends on the order of calls.
    """
    global prev_alloc

    calm = regime == 0
    gain = 5.0 if calm else 3.0
    # Weight given to the fresh target; the remainder stays on the old allocation.
    blend = 0.15 if calm else 0.25

    target = BASE + np.clip(raw_signal, -0.02, 0.02) * gain
    if regime == 1:  # high volatility regime
        target *= 0.7

    smoothed = blend * target + (1 - blend) * prev_alloc
    prev_alloc = np.clip(smoothed, 0, 2)
    return prev_alloc


In [None]:
def approx_sharpe(r):
    """Mean of ``r`` divided by its standard deviation, scaled by sqrt(252).

    Returns 0 when the standard deviation is exactly zero.
    """
    spread = np.std(r)
    if spread == 0:
        return 0
    return np.mean(r) / spread * np.sqrt(252)

# Quick sanity score on the last N rows of the engineered training frame.
# NOTE(review): these rows were also available to the CV loop (at least as
# early-stopping validation data), so the score is likely optimistic.
N = 300
df_val = df.iloc[-N:].copy()

X_val = df_val[feature_cols].fillna(0)
pred_raw = ensemble_pred(X_val)

# Restart the smoothing state so the replay begins at the baseline.
# NOTE(review): prev_alloc keeps the final replay value afterwards, which is
# what the first predict() call will smooth against.
prev_alloc = BASE
allocs = np.array(
    [adaptive_alloc(raw, reg) for raw, reg in zip(pred_raw, df_val["regime"])]
)

# Strategy return approximation: un-allocated weight earns the risk-free
# rate, allocated weight earns the forward return.
cash_leg = df_val["risk_free_rate"] * (1 - allocs)
market_leg = allocs * df_val["forward_returns"]
strategy_returns = cash_leg + market_leg

score_local = approx_sharpe(strategy_returns)

print("=======================================")
print(" LOCAL SHARPE SCORE:", score_local)
print("=======================================")


In [None]:
def _breadth_feature_history(last_date_id):
    """Engineered breadth features for every date_id up to ``last_date_id``.

    Rebuilds the lag / momentum / regime columns on the full ``daily``
    history instead of on the incoming batch, so a row's lags and 60-row
    rolling mean come from earlier days rather than from whatever happens to
    be in the batch. date_ids past the end of ``daily`` are forward-filled,
    mirroring the ffill applied to the merged training frame.

    Returns a frame with a ``date_id`` column plus the raw and engineered
    breadth columns (the ``date`` column is dropped; it is not a feature).
    """
    first_id = int(daily["date_id"].min())
    last_id = max(int(last_date_id), int(daily["date_id"].max()))
    all_ids = pd.RangeIndex(first_id, last_id + 1, name="date_id")

    hist = (
        daily.drop(columns=["date"], errors="ignore")
        .set_index("date_id")
        .reindex(all_ids)
        .ffill()
        .fillna(0)
    )

    for c in lag_cols:
        hist[c + "_lag1"] = hist[c].shift(1)
        hist[c + "_lag2"] = hist[c].shift(2)

    hist["dispersion_mom"] = hist["dispersion_1d"].diff()
    vol_ma = hist["avg_vol_20d"].rolling(60).mean()
    hist["regime"] = (hist["avg_vol_20d"] > vol_ma).astype(int)

    return hist.fillna(0).reset_index()


def predict(test_df: pl.DataFrame) -> pl.DataFrame:
    """Inference entry point handed to the Kaggle inference server.

    Joins the batch with breadth features engineered on the full ``daily``
    history, scores it with the three-model ensemble and converts each raw
    prediction into an allocation via ``adaptive_alloc`` (which carries its
    smoothing state across calls in the global ``prev_alloc``).

    Args:
        test_df: batch of test rows; must contain a ``date_id`` column.

    Returns:
        ``test_df`` with an added ``prediction`` column (one allocation per row).
        NOTE(review): confirm the gateway accepts the full frame rather than
        a bare prediction column/value.
    """
    pdf = test_df.to_pandas()

    if pdf.empty:
        # Nothing to score; keep the output schema consistent.
        return test_df.with_columns(pl.Series("prediction", [], dtype=pl.Float64))

    # Features come from history, not from the batch: computing shift/diff/
    # rolling(60) on a short batch zeroes every lag and pins regime to 0.
    # The history is rebuilt on each call; it is small, so this stays cheap.
    history = _breadth_feature_history(pdf["date_id"].max())
    pdf2 = pdf.merge(history, on="date_id", how="left").ffill().fillna(0)

    # Grab the regime flags before narrowing to the model's feature columns,
    # so this does not depend on "regime" happening to be a feature.
    regimes = pdf2["regime"].to_numpy()

    # Columns the models were trained on but the batch lacks are zero-filled.
    for col in feature_cols:
        if col not in pdf2:
            pdf2[col] = 0
    X = pdf2[feature_cols]

    raw_preds = ensemble_pred(X)

    allocs = [float(adaptive_alloc(p, r)) for p, r in zip(raw_preds, regimes)]

    return test_df.with_columns(pl.Series("prediction", allocs))


In [None]:
import os

# Wrap predict() in the competition's inference server.
inference_server = DefaultInferenceServer(predict)

# Without the rerun flag, replay the local competition files through the
# gateway; when the flag is set, serve requests instead.
if not os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    inference_server.run_local_gateway(
        ("/kaggle/input/hull-tactical-market-prediction/",)
    )
else:
    inference_server.serve()
