# In [None]:
# High-score oriented tiny-data pipeline with learned exposure map and vol cap
# - Impute -> PowerTransformer(Yeo–Johnson) -> Standardize (fit on train, reuse at test)
# - Tiny-data ensemble for mean/variance via StackingRegressor with linear meta
# - Score s = mu / sqrt(var), learn monotone piecewise exposure g(s) on a time-ordered validation slice
# - Global volatility cap to <= 1.2x market vol (as per metric), positions clipped to [0, 2]
# - Kaggle default inference server integration

import os, warnings
warnings.filterwarnings("ignore")
import numpy as np, pandas as pd, polars as pl
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.ensemble import StackingRegressor

import kaggle_evaluation.default_inference_server 

# ---------------- Config ----------------
# Directory holding the competition files: train.csv is read from here and the
# same path is handed to the local gateway in main().
DATA_PATH = Path("/kaggle/input/hull-tactical-market-prediction/")
# Seed used for NumPy's global RNG and for the CV-based linear models.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Allowed range for a position; score_metric rejects values outside it and the
# pipeline clips to it.
MIN_INVESTMENT, MAX_INVESTMENT = 0.0, 2.0
# A column is treated as a feature when its name starts with one of these
# prefixes (see is_feature). "MOM" is already covered by "M".
PREFIXES = ("D","E","I","M","P","S","V","MOM")
# Source columns for which one-step and two-step lag features are built.
LAG_SRC_COLS = ["E1","V1","E3"]

# Fraction of the date-sorted training rows, taken from the end, that forms
# the validation slice.
VAL_FRAC = 0.15
# Number of quantile bins of the score s = mu / sqrt(var) in the exposure map.
N_BINS = 7
# Floor applied to predicted variance before the square root is taken.
EPS_VAR = 1e-6
# Cap on strategy volatility relative to market volatility.
TARGET_VOL_MULT = 1.2  # strategy vol <= 1.2 * market vol on validation

# ------------- Metric (official) -------------
class ParticipantVisibleError(Exception):
    """Raised by score_metric when a position lies outside the allowed range."""

def score_metric(solution: pd.DataFrame, submission: pd.DataFrame) -> float:
    """Return the volatility- and return-penalised annualised Sharpe ratio.

    Args:
        solution: frame with ``forward_returns`` and ``risk_free_rate`` columns.
        submission: frame with a ``prediction`` column holding the positions.

    Returns:
        The adjusted Sharpe ratio, capped at 1,000,000.

    Raises:
        ParticipantVisibleError: a position is above MAX_INVESTMENT or below
            MIN_INVESTMENT.
        ZeroDivisionError: the strategy returns have zero standard deviation.
    """
    annual_days = 252
    frame = solution.copy()
    frame["position"] = submission["prediction"].astype(float)

    if frame["position"].max() > MAX_INVESTMENT:
        raise ParticipantVisibleError(f'Position {frame["position"].max()} exceeds {MAX_INVESTMENT}')
    if frame["position"].min() < MIN_INVESTMENT:
        raise ParticipantVisibleError(f'Position {frame["position"].min()} below {MIN_INVESTMENT}')

    n_periods = len(frame)

    def geometric_mean(excess):
        # Per-period geometric mean of a series of excess returns.
        return (1.0 + excess).prod()**(1/n_periods) - 1.0

    # Blend of cash and market according to the position held.
    frame["strategy_returns"] = (
        frame["risk_free_rate"]*(1 - frame["position"]) + frame["forward_returns"]*frame["position"]
    )
    strat_mean = geometric_mean(frame["strategy_returns"] - frame["risk_free_rate"])
    strat_sd = frame["strategy_returns"].std()
    if strat_sd == 0:
        raise ZeroDivisionError
    raw_sharpe = strat_mean / strat_sd * np.sqrt(annual_days)

    mkt_mean = geometric_mean(frame["forward_returns"] - frame["risk_free_rate"])
    mkt_sd = frame["forward_returns"].std()

    # Annualised volatilities in percent.
    strat_vol = float(strat_sd * np.sqrt(annual_days) * 100)
    mkt_vol = float(mkt_sd * np.sqrt(annual_days) * 100)

    # Penalise only the part of the volatility ratio above 1.2.
    if mkt_vol > 0:
        vol_overshoot = max(0, strat_vol/mkt_vol - 1.2)
    else:
        vol_overshoot = 0
    vol_factor = 1 + vol_overshoot

    # Penalise under-performing the market (annualised, in percent), quadratically.
    shortfall = max(0, (mkt_mean - strat_mean) * 100 * annual_days)
    return_factor = 1 + (shortfall**2) / 100

    penalised = raw_sharpe / (vol_factor * return_factor)
    return min(float(penalised), 1_000_000.0)

# ------------- Globals for serving -------------
# Sorted feature column names found in the training frame (set by fit_preprocess).
feature_cols_global = None
# Fitted SimpleImputer, PowerTransformer and StandardScaler, applied in that
# order (set by fit_preprocess, reused by transform_preprocess).
imputer_global = None
pt_global = None
scaler_global = None
# Lag values carried across transform_preprocess calls, keyed by source column.
lag_state = {c: 0.0 for c in LAG_SRC_COLS}
# Fitted models for the excess return and the squared excess return (set by train_all).
mu_model = None
var_model = None
# Score bin edges and per-bin exposures learned by learn_exposure_and_vol.
bin_edges_global = None
bin_exposures_global = None
# Multiplier applied to exposures at prediction time; 1.0 leaves them unchanged.
vol_scale_global = 1.0

# ------------- Helpers -------------
def is_feature(c):
    """Return True when column name *c* starts with one of the PREFIXES."""
    return c.startswith(PREFIXES)

def list_feature_cols(df):
    """Return the feature column names of *df* as a sorted list."""
    return sorted(name for name in df.columns if is_feature(name))

def clip_outliers(df, cols, ql=0.01, qh=0.99):
    """Clip each listed column to its own [ql, qh] quantile range, in place.

    The quantiles are computed from *df* itself. The same frame object is
    returned after modification.
    """
    for name in cols:
        series = df[name]
        lower = series.quantile(ql)
        upper = series.quantile(qh)
        df[name] = series.clip(lower, upper)
    return df

def add_lags(df, cols):
    """Add ``<col>_lag1`` and ``<col>_lag2`` columns to *df* in place.

    A source column missing from *df* is first created as all zeros. Rows with
    no earlier value get 0.0. The same frame object is returned.
    """
    for name in cols:
        if name not in df.columns:
            df[name] = 0.0
        source = df[name]
        for step in (1, 2):
            df[f"{name}_lag{step}"] = source.shift(step).fillna(0.0)
    return df

# ------------- Preprocessing (fit) -------------
def fit_preprocess(train):
    """Fit the preprocessing chain on the training frame and build the targets.

    Sets the module globals ``feature_cols_global``, ``imputer_global``,
    ``pt_global`` and ``scaler_global``.

    Args:
        train: training frame with ``date_id``, ``forward_returns``,
            ``risk_free_rate`` and the feature columns.

    Returns:
        Tuple ``(X_all, y_mu, y_var, X_cols, train)``: the transformed design
        matrix, the excess-return target, the squared excess-return target,
        the ordered input column names, and the date-sorted processed frame.
    """
    global feature_cols_global, imputer_global, pt_global, scaler_global
    train = train.sort_values("date_id").reset_index(drop=True)
    feature_cols_global = list_feature_cols(train)

    # Excess return is measured against the risk-free rate itself. The column
    # is left untouched because the returned frame also feeds score_metric,
    # which expects the actual rate rather than its day-to-day change.
    train["excess_returns"] = train["forward_returns"] - train["risk_free_rate"]

    train = clip_outliers(train, feature_cols_global)
    train = add_lags(train, [c for c in LAG_SRC_COLS if c in train.columns])
    # NOTE: transform_preprocess always emits lag columns for every entry of
    # LAG_SRC_COLS, so those source columns are expected to exist in train.
    lag1_cols = [f"{c}_lag1" for c in LAG_SRC_COLS if f"{c}_lag1" in train.columns]
    lag2_cols = [f"{c}_lag2" for c in LAG_SRC_COLS if f"{c}_lag2" in train.columns]
    X_cols = feature_cols_global + lag1_cols + lag2_cols

    # Impute -> Yeo–Johnson -> Scale
    imputer_global = SimpleImputer(strategy="median")
    X_imp = imputer_global.fit_transform(train[X_cols])
    pt_global = PowerTransformer(method="yeo-johnson", standardize=False)
    X_pt = pt_global.fit_transform(X_imp)
    scaler_global = StandardScaler()
    X_all = scaler_global.fit_transform(X_pt)

    y_mu = train["excess_returns"].values.astype(float)
    y_var = y_mu**2
    return X_all, y_mu, y_var, X_cols, train

# ------------- Preprocessing (transform) -------------
def transform_preprocess(test_df):
    """Apply the fitted preprocessing chain to a batch of test rows.

    Rows are processed in ``date_id`` order. Lag features continue across
    calls: ``lag_state[c]`` holds the most recent value of source column ``c``
    and ``lag_state[f"{c}_prev2"]`` the value before it, so both ``*_lag1``
    and ``*_lag2`` stay meaningful when rows arrive one batch at a time.

    Args:
        test_df: pandas frame with ``date_id`` and (ideally) the feature
            columns; missing feature columns are filled with 0.0.

    Returns:
        The transformed feature matrix, rows in ``date_id`` order, with any
        non-finite entries replaced by 0.0.
    """
    global lag_state
    df = test_df.sort_values("date_id").reset_index(drop=True)
    for c in feature_cols_global:
        if c not in df.columns: df[c] = 0.0
    # NOTE: quantiles here come from the batch itself, not from training data.
    df = clip_outliers(df, feature_cols_global)

    n_rows = len(df)
    for c in LAG_SRC_COLS:
        if c not in df.columns: df[c] = 0.0
        prev1 = lag_state.get(c, 0.0)
        prev2 = lag_state.get(f"{c}_prev2", 0.0)
        # Prepend the two carried values so that shifting by one or two steps
        # reaches back into earlier batches instead of defaulting to zero.
        history = np.concatenate(([prev2, prev1], df[c].to_numpy(dtype=float)))
        lag1 = history[1:n_rows + 1]
        lag2 = history[:n_rows]
        df[f"{c}_lag1"] = np.where(np.isnan(lag1), 0.0, lag1)
        df[f"{c}_lag2"] = np.where(np.isnan(lag2), 0.0, lag2)
        if n_rows > 0:
            lag_state[f"{c}_prev2"] = float(history[-2])
            lag_state[c] = float(history[-1])

    lag_cols = [f"{c}_lag1" for c in LAG_SRC_COLS] + [f"{c}_lag2" for c in LAG_SRC_COLS]
    X_cols = feature_cols_global + lag_cols
    for c in X_cols:
        if c not in df.columns: df[c] = 0.0
    X_imp = imputer_global.transform(df[X_cols])
    X_pt = pt_global.transform(X_imp)
    Xt = scaler_global.transform(X_pt)
    if not np.isfinite(Xt).all():
        Xt = np.nan_to_num(Xt, nan=0.0, posinf=0.0, neginf=0.0)
    return Xt

# ------------- Models -------------
def build_models():
    """Create the unfitted mean and variance models.

    Both are StackingRegressors over Ridge, Lasso and ElasticNet base learners
    with a RidgeCV meta-learner and 5-fold stacking.

    Returns:
        Tuple ``(mu, var)`` of unfitted StackingRegressor instances.
    """
    alpha_grid = (1e-3, 1e-2, 1e-1, 1, 10)

    base = [
        ("ridge", RidgeCV(alphas=list(alpha_grid))),
        ("lasso", LassoCV(alphas=list(alpha_grid), max_iter=5000, random_state=RANDOM_STATE)),
        ("enet", ElasticNetCV(alphas=list(alpha_grid), l1_ratio=[0.2, 0.5, 0.8],
                              max_iter=5000, random_state=RANDOM_STATE)),
    ]
    meta = RidgeCV(alphas=list(alpha_grid))

    def stack():
        # Both stackers are built from the same base and meta estimator objects.
        return StackingRegressor(estimators=base, final_estimator=meta, cv=5, n_jobs=-1)

    mu = stack()
    var = stack()
    return mu, var

# ------------- Learn exposure map g(s) and vol cap -------------
def learn_exposure_and_vol(train_proc, X_all, mu, var):
    """Learn the monotone score-to-exposure map and the volatility scale.

    The last ``VAL_FRAC`` of the rows is the validation slice. Scores
    ``s = mu / sqrt(max(var, EPS_VAR))`` are cut into ``N_BINS`` quantile bins,
    and one exposure per bin is tuned by coordinate ascent on ``score_metric``
    while the exposures are kept non-decreasing across bins.

    Args:
        train_proc: processed training frame with ``date_id``,
            ``forward_returns`` and ``risk_free_rate``; rows align with X_all.
        X_all: transformed feature matrix for all training rows.
        mu: fitted model predicting the excess return.
        var: fitted model predicting the squared excess return.

    Returns:
        Tuple ``(edges, exps, vol_scale, best_score)``: the bin edges, the
        per-bin exposures, the multiplier that brings validation strategy
        volatility to at most ``TARGET_VOL_MULT`` times market volatility
        (1.0 when no scaling is needed), and the best validation metric found.
    """
    n = len(train_proc)
    split = int(n*(1.0 - VAL_FRAC))
    val_idx = np.arange(split, n)
    sol_val = train_proc.iloc[val_idx][["date_id","forward_returns","risk_free_rate"]].reset_index(drop=True)

    mu_val = mu.predict(X_all[val_idx])
    var_val = var.predict(X_all[val_idx])
    scores = mu_val / np.sqrt(np.maximum(var_val, EPS_VAR))

    edges = np.quantile(scores, np.linspace(0, 1, N_BINS+1))
    # Force strictly increasing edges so that tied quantiles still form bins.
    for i in range(1, len(edges)):
        if edges[i] <= edges[i-1]:
            edges[i] = edges[i-1] + 1e-9

    def assign(scores_, exps_, edges_):
        # Only interior edges are used, so out-of-range scores fall into the
        # first or last bin.
        idx = np.searchsorted(edges_[1:-1], scores_, side="right")
        return exps_[idx]

    def evaluate(exps_):
        pos = np.clip(assign(scores, exps_, edges), MIN_INVESTMENT, MAX_INVESTMENT)
        try:
            # score_metric works on its own copy of the solution frame.
            return score_metric(sol_val, pd.DataFrame({"prediction": pos}))
        except ZeroDivisionError:
            # A zero-variance strategy cannot be scored; treat it as the worst
            # candidate instead of aborting the search.
            return -np.inf

    exps = np.linspace(0.0, 0.8, N_BINS)
    best_score = -1e9

    # Coordinate ascent with monotone projection
    for _ in range(60):
        improved = False
        for k in range(N_BINS):
            grid = np.linspace(max(MIN_INVESTMENT, exps[k]-0.25), min(MAX_INVESTMENT, exps[k]+0.25), 9)
            local_best = None
            local_best_score = -1e9
            for v in grid:
                trial = exps.copy()
                trial[k] = v
                # Running maximum makes the exposures non-decreasing.
                trial = np.maximum.accumulate(trial)
                s = evaluate(trial)
                if s > local_best_score:
                    local_best_score, local_best = s, trial
            if local_best is not None and local_best_score > best_score:
                # Keep the whole projected vector: it is what was scored, so
                # the stored map stays monotone and matches best_score.
                best_score, exps = local_best_score, local_best
                improved = True
        if not improved:
            break

    # Volatility cap to <= 1.2x market vol
    pos_val = np.clip(assign(scores, exps, edges), MIN_INVESTMENT, MAX_INVESTMENT)
    tmp = sol_val.copy()
    tmp["position"] = pos_val
    strat_std = (tmp["risk_free_rate"]*(1 - tmp["position"]) + tmp["forward_returns"]*tmp["position"]).std() * np.sqrt(252)
    mkt_std = tmp["forward_returns"].std() * np.sqrt(252)
    vol_scale = 1.0
    if mkt_std > 0 and strat_std > TARGET_VOL_MULT * mkt_std:
        vol_scale = (TARGET_VOL_MULT * mkt_std) / strat_std

    return edges, exps, float(vol_scale), float(best_score)

# ------------- Train all -------------
def train_all():
    """Train the full pipeline and populate the serving globals.

    Reads ``train.csv``, fits the preprocessing chain, learns the exposure map
    on the trailing validation slice, and fits the serving models on all rows.
    Sets ``mu_model``, ``var_model``, ``bin_edges_global``,
    ``bin_exposures_global`` and ``vol_scale_global``.

    Returns:
        The validation metric reached by the learned exposure map.
    """
    global mu_model, var_model, bin_edges_global, bin_exposures_global, vol_scale_global
    train = pd.read_csv(DATA_PATH / "train.csv").dropna(subset=["forward_returns","risk_free_rate"])
    train = train.sort_values("date_id").reset_index(drop=True)
    X_all, y_mu, y_var, X_cols, train_proc = fit_preprocess(train)

    # Serving models use every available row.
    final_mu, final_var = build_models()
    final_mu.fit(X_all, y_mu)
    final_var.fit(X_all, y_var)

    # Tune the exposure map with models that have not seen the validation
    # slice, so the tuned scores are out-of-sample like those at serving time.
    # The split matches the one used inside learn_exposure_and_vol.
    split = int(len(train_proc)*(1.0 - VAL_FRAC))
    val_mu, val_var = final_mu, final_var
    if 5 <= split < len(train_proc):  # 5 rows are needed for the cv=5 stackers
        val_mu, val_var = build_models()
        val_mu.fit(X_all[:split], y_mu[:split])
        val_var.fit(X_all[:split], y_var[:split])

    edges, exps, vol_scale, val_score = learn_exposure_and_vol(train_proc, X_all, val_mu, val_var)

    mu_model, var_model = final_mu, final_var
    bin_edges_global, bin_exposures_global, vol_scale_global = edges, exps, vol_scale
    return float(val_score)

# ------------- Predict callback -------------
def make_predict():
    """Build the prediction callback handed to the inference server.

    The callback converts a polars batch to pandas, preprocesses it, scores
    each row as ``mu / sqrt(max(var, EPS_VAR))``, looks up the learned bin
    exposure, applies the volatility scale and clips to the allowed range.
    Predictions follow the ``date_id``-sorted row order produced by
    transform_preprocess.
    """
    def predict(test_pl: pl.DataFrame) -> pl.DataFrame:
        features = transform_preprocess(test_pl.to_pandas())
        mean_hat = mu_model.predict(features)
        var_hat = var_model.predict(features)
        ratio = mean_hat / np.sqrt(np.maximum(var_hat, EPS_VAR))
        bins = np.searchsorted(bin_edges_global[1:-1], ratio, side="right")
        exposure = np.clip(
            bin_exposures_global[bins] * vol_scale_global,
            MIN_INVESTMENT,
            MAX_INVESTMENT,
        )
        # When every position is the same, nudge them all up by 1e-6.
        if np.all(exposure == exposure[0]):
            exposure = np.clip(exposure + 1e-6, MIN_INVESTMENT, MAX_INVESTMENT)
        return pl.DataFrame({"prediction": exposure})

    return predict

# ------------- Main -------------
def main():
    """Train the pipeline, then start the Kaggle inference server.

    Serves requests when KAGGLE_IS_COMPETITION_RERUN is set; otherwise runs
    the local gateway against DATA_PATH.
    """
    train_all()  # the returned validation score is not used here
    server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(make_predict())
    if not os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
        server.run_local_gateway((str(DATA_PATH),))
        return
    server.serve()


if __name__ == "__main__":
    main()
