In [None]:
# | Component | Details |
# |-------|-------------|
# | **Vol-Constrained Sharpe Loss** | `-mean_ret / std_ret + λ × (std_ret - 0.4)²` |
# | **EMA (Exponential Moving Average)** | Starts at epoch 5 → **smooths weights**|
# | **Sharpe-Aware AdamW** | LR boosted by recent Sharpe improvement  |
# | **Online Vol Scaling (inference)** | 20-day rolling std |
# | **10-fold Walk-Forward CV** | 180-day validation windows |
# | **Model Averaging** | Mean of 10 EMA models |


In [None]:

import os, math, random, numpy as np, pandas as pd, polars as pl
from pathlib import Path
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import kaggle_evaluation.default_inference_server

# ==============================================================
#  1. DETERMINISTIC
# ==============================================================
def set_deterministic(seed: int = 42):
    """Seed every RNG the notebook relies on and request deterministic kernels.

    Seeds Python's ``random``, NumPy and PyTorch (CPU plus all CUDA devices),
    writes ``PYTHONHASHSEED`` into the environment, and switches cuDNN and
    PyTorch to their deterministic code paths.

    Args:
        seed: Value used for all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seeding order as always: random -> numpy -> torch CPU -> torch CUDA.
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True
    # warn_only=True: ops without a deterministic implementation warn instead of raising.
    torch.use_deterministic_algorithms(True, warn_only=True)

# Seed all RNGs once, before any model or data loader is created.
set_deterministic(42)

# --------------------------------------------------------------
#  CONFIG
# --------------------------------------------------------------
# Directory holding the competition's train.csv / test.csv.
DATA_PATH = Path('/kaggle/input/hull-tactical-market-prediction/')
# When True, the main script holds out date_id 8810..8989 as a local leaderboard set.
LB_RUN = True
# Number of walk-forward folds and the length (in rows) of each validation window.
N_FOLDS = 10
TEST_SIZE = 180
# Epochs trained per fold.
TRAIN_EPOCHS = 60
# Mini-batch size and worker count used by get_loader.
BATCH_SIZE = 1024
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_WORKERS = 0
# EMA decay factor and the first epoch (0-based) at which EMA updates begin.
EMA_DECAY = 0.995
EMA_START = 5
# Target used both in the loss's volatility penalty and when rescaling predictions.
TARGET_VOL = 0.4
# LR warm-up length in epochs.
# NOTE(review): confirm the optimizer construction actually reads this constant.
WARMUP = 5
# Weight of the (std - TARGET_VOL)^2 term in sharpe_loss.
VOL_PENALTY_LAMBDA = 0.01  # From NeurIPS 2024
# NOTE(review): not referenced by any code visible in this file.
LABEL_SMOOTHING = 0.1

# --------------------------------------------------------------
#  METRIC
# --------------------------------------------------------------
def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """Local replica of the competition's volatility-adjusted Sharpe metric.

    The strategy holds ``position`` (prediction clipped to [0, 2]) in the market
    and the remainder in the risk-free asset. Its Sharpe ratio is divided by two
    penalties: one for running more than 1.2x the market's volatility and one
    for under-performing the market's mean excess return.

    All return statistics are *daily* geometric means; annualisation happens
    exactly once (``sqrt(252)`` for the Sharpe ratio, ``* 252`` for the return
    gap). NOTE(review): this mirrors the formulation of the competition's
    published metric as I understand it - confirm against the official source.

    Args:
        solution: Frame with ``risk_free_rate`` and ``forward_returns`` columns.
            It is not modified.
        submission: Frame with a ``prediction`` column, row-aligned with
            ``solution`` by position.
        row_id_column_name: Unused; kept for signature compatibility with the
            Kaggle metric interface.

    Returns:
        The adjusted Sharpe ratio capped at 1,000,000, or 0.0 when it is
        undefined (fewer than two rows, or zero strategy/market volatility).
    """
    n_days = len(solution)
    if n_days < 2:
        # The sample standard deviation is undefined for fewer than two rows.
        return 0.0

    trading_days = 252
    sol = solution.copy()  # never mutate the caller's frame

    # Assign by position rather than by index label: label alignment would
    # silently produce NaN positions whenever the two frames' indexes differ.
    sol['position'] = np.clip(submission['prediction'].to_numpy(), 0, 2)

    rf = sol['risk_free_rate']
    sol['strategy_returns'] = rf * (1 - sol['position']) + sol['position'] * sol['forward_returns']

    strategy_std = sol['strategy_returns'].std()
    market_std = sol['forward_returns'].std()
    if strategy_std == 0 or market_std == 0:
        return 0.0

    # Daily geometric mean excess returns of the strategy and of the market.
    strategy_mean = (1 + (sol['strategy_returns'] - rf)).prod() ** (1 / n_days) - 1
    market_mean = (1 + (sol['forward_returns'] - rf)).prod() ** (1 / n_days) - 1

    sharpe = strategy_mean / strategy_std * np.sqrt(trading_days)

    # Annualised volatilities in percent; only their ratio matters.
    strategy_vol = strategy_std * np.sqrt(trading_days) * 100
    market_vol = market_std * np.sqrt(trading_days) * 100
    vol_penalty = 1 + max(0, strategy_vol / market_vol - 1.2)

    # Annualised shortfall versus the market, in percent. The operands are
    # daily means, so the 252 factor annualises them exactly once.
    ret_gap = max(0, (market_mean - strategy_mean) * 100 * trading_days)
    ret_penalty = 1 + (ret_gap ** 2) / 100

    return min(float(sharpe / (vol_penalty * ret_penalty)), 1_000_000)

# --------------------------------------------------------------
#  DATA
# --------------------------------------------------------------
def load_full_train() -> pl.DataFrame:
    """Read ``train.csv`` from ``DATA_PATH`` and normalise it for training.

    The ``market_forward_excess_returns`` column is renamed to ``target``,
    every column except ``date_id`` is cast to Float32, and rows are sorted by
    ``date_id``.
    """
    raw = pl.read_csv(DATA_PATH / "train.csv")
    renamed = raw.rename({'market_forward_excess_returns': 'target'})
    as_float = renamed.with_columns(pl.exclude('date_id').cast(pl.Float32))
    return as_float.sort('date_id')

def create_example_dataset(df: pl.DataFrame) -> pl.DataFrame:
    """Cast feature columns to Float32 and replace their nulls with 0.0.

    ``date_id`` and ``target`` (when present) are left untouched by both steps.
    """
    untouched = ['date_id', 'target']
    casted = df.with_columns(pl.exclude(untouched).cast(pl.Float32))
    null_fills = [
        pl.col(name).fill_null(0.0)
        for name in casted.columns
        if name not in untouched
    ]
    return casted.with_columns(null_fills)

def get_features(train_df, test_df) -> list[str]:
    """Return the feature columns shared by the train and test frames.

    Bookkeeping, label and lagged-label columns are excluded. The result
    follows the column order of ``train_df``. Building it from a bare set
    intersection would make the order depend on string hashing, which changes
    between interpreter runs and would silently permute the model's inputs.

    Args:
        train_df: Object exposing ``.columns`` (the training frame).
        test_df: Object exposing ``.columns`` (the test frame).

    Returns:
        Feature names present in both frames, in ``train_df`` column order.
    """
    train_excluded = {'date_id', 'target', 'forward_returns', 'risk_free_rate'}
    test_excluded = {'date_id', 'target', 'is_scored', 'lagged_risk_free_rate',
                     'lagged_market_forward_excess_returns'}
    test_candidates = set(test_df.columns) - test_excluded
    return [
        col for col in train_df.columns
        if col not in train_excluded and col in test_candidates
    ]

def prepare_training_data(df: pl.DataFrame, features: list[str]):
    """Extract float32 NumPy arrays for training from a polars frame.

    Returns:
        A tuple ``(X, y, fr)``: the feature matrix built from ``features``, the
        ``target`` column, and the ``forward_returns`` column (all zeros when
        that column is absent).
    """
    frame = df.with_columns(pl.col(features).cast(pl.Float32))

    feature_matrix = frame.select(features).to_numpy().astype(np.float32)
    targets = frame['target'].to_numpy().astype(np.float32)

    if 'forward_returns' in frame.columns:
        forward = frame.get_column('forward_returns').to_numpy().astype(np.float32)
    else:
        forward = np.zeros(len(targets), dtype=np.float32)

    return feature_matrix, targets, forward

# --------------------------------------------------------------
#  WALK-FORWARD
# --------------------------------------------------------------
def walk_forward_split(n_samples: int, n_folds=None, test_size=None):
    """Build expanding-window walk-forward splits over the last rows.

    The final ``n_folds * test_size`` rows are cut into consecutive validation
    windows; each fold trains on every row that precedes its window.

    Args:
        n_samples: Total number of (time-ordered) rows.
        n_folds: Number of folds; defaults to the module constant ``N_FOLDS``.
        test_size: Rows per validation window; defaults to ``TEST_SIZE``.

    Returns:
        A list of ``(train_idx, val_idx)`` NumPy index-array pairs, oldest
        fold first.

    Raises:
        ValueError: If ``n_folds`` or ``test_size`` is not positive, or if the
            data does not leave at least one training row for the first fold.
    """
    n_folds = N_FOLDS if n_folds is None else n_folds
    test_size = TEST_SIZE if test_size is None else test_size
    if n_folds < 1 or test_size < 1:
        raise ValueError('n_folds and test_size must be positive')

    need = n_folds * test_size
    # Strictly more rows than the validation span are required: with exactly
    # `need` rows the first fold would have an empty training set.
    if n_samples <= need:
        raise ValueError('Not enough data')

    first_val_start = n_samples - need
    splits = []
    for fold in range(n_folds):
        val_start = first_val_start + fold * test_size
        val_end = val_start + test_size
        splits.append((np.arange(0, val_start), np.arange(val_start, val_end)))
    return splits

# --------------------------------------------------------------
#  LOSS – VOL CONSTRAINT
# --------------------------------------------------------------
def sharpe_loss(y_pred, y_true, eps=1e-6, vol_lambda=None, target_vol=None):
    """Negative batch Sharpe ratio plus a quadratic volatility penalty.

    Predictions are clamped to [-3, 3], mapped to a position with
    ``2 * tanh(.)`` and clamped to [0, 2]. The loss is
    ``-mean / std + vol_lambda * (std - target_vol) ** 2`` of the per-sample
    strategy return ``position * y_true``.

    Args:
        y_pred: Raw model outputs for the batch.
        y_true: Realised returns for the batch (same shape as ``y_pred``).
        eps: Added to the standard deviation to avoid division by zero.
        vol_lambda: Penalty weight; defaults to ``VOL_PENALTY_LAMBDA``.
        target_vol: Target standard deviation; defaults to ``TARGET_VOL``.

    Returns:
        A scalar tensor. For batches with fewer than two samples, where the
        sample standard deviation is undefined, a zero that is still attached
        to the graph is returned so the batch contributes no gradient instead
        of turning every weight into NaN.
    """
    vol_lambda = VOL_PENALTY_LAMBDA if vol_lambda is None else vol_lambda
    target_vol = TARGET_VOL if target_vol is None else target_vol

    y_pred = y_pred.clamp(-3, 3)
    position = (torch.tanh(y_pred) * 2.0).clamp(0, 2)
    strategy_ret = position * y_true

    if strategy_ret.numel() < 2:
        # e.g. a trailing batch of size one: std() would be NaN.
        return strategy_ret.sum() * 0.0

    mean_ret = strategy_ret.mean()
    std_ret = strategy_ret.std() + eps
    neg_sharpe = -mean_ret / std_ret
    # Vol constraint
    vol_penalty = vol_lambda * (std_ret - target_vol) ** 2
    return neg_sharpe + vol_penalty

# --------------------------------------------------------------
#  EMA
# --------------------------------------------------------------
class EMA:
    """Exponential moving average of a model's trainable parameters.

    The average is seeded from the model's weights at the *first* ``update``
    call, not from the weights at construction time. Updates may begin many
    epochs after the EMA object is created; seeding from the construction-time
    snapshot would blend the randomly initialised weights into the average,
    and with a decay near 1 and few updates they would dominate it.

    The effective decay is warmed up as
    ``min(decay, (1 + k) / (10 + k))`` where ``k`` is the number of updates
    already performed, so the average is not pinned to its seed while only a
    handful of updates have been made.
    """

    def __init__(self, model, decay=0.995):
        self.decay = decay
        # Number of update() calls performed so far.
        self.num_updates = 0
        # Shadow copy, keyed by parameter name; re-seeded on the first update.
        self.shadow = {n: p.clone().detach() for n, p in model.named_parameters() if p.requires_grad}

    @torch.no_grad()
    def update(self, model):
        """Fold the model's current trainable parameters into the average."""
        if self.num_updates == 0:
            # First update: start the average from the current weights.
            for n, p in model.named_parameters():
                if p.requires_grad:
                    self.shadow[n] = p.detach().clone()
        else:
            decay = min(self.decay, (1.0 + self.num_updates) / (10.0 + self.num_updates))
            for n, p in model.named_parameters():
                if p.requires_grad:
                    self.shadow[n].mul_(decay).add_(p.detach(), alpha=1.0 - decay)
        self.num_updates += 1

    @torch.no_grad()
    def apply(self, model):
        """Copy the averaged weights into ``model``.

        Does nothing if ``update`` was never called, so a run that ends before
        averaging starts keeps its trained weights instead of being reset to
        the construction-time snapshot.
        """
        if self.num_updates == 0:
            return
        for n, p in model.named_parameters():
            if p.requires_grad:
                p.data.copy_(self.shadow[n])

# --------------------------------------------------------------
#  OPTIMIZER
# --------------------------------------------------------------
class SharpeAwareAdamW(optim.Optimizer):
    """AdamW with a warm-up + cosine schedule and a Sharpe-driven LR multiplier.

    The learning rate for an epoch is ``get_lr(epoch)`` (linear warm-up, then
    cosine decay to zero at ``T_max``) multiplied by a boost in [0.5, 3.0]
    derived from the relative change between the two most recent values passed
    to ``set_epoch_sharpe``.

    Weight decay is *decoupled* (applied directly to the weights, scaled by the
    learning rate), which is what distinguishes AdamW from Adam with an L2
    term added to the gradient. Betas are fixed at (0.9, 0.999), epsilon at 1e-8.

    The schedule and the boost are read from the first parameter group and
    applied to every group; ``weight_decay`` is honoured per group.
    """

    def __init__(self, params, lr=3e-3, warmup=5, T_max=60, weight_decay=1e-5):
        # `t` stores the most recent epoch seen by step(); it is the fallback
        # when step() is called without an explicit epoch.
        defaults = dict(lr=lr, warmup=warmup, T_max=T_max, weight_decay=weight_decay, t=0)
        super().__init__(params, defaults)
        # Each group owns its history list. A list placed in `defaults` would be
        # shared by reference across groups and receive duplicate appends.
        for group in self.param_groups:
            group['sharpe_hist'] = []

    def set_epoch_sharpe(self, sharpe):
        """Record the epoch's Sharpe value; only the latest five are kept."""
        for group in self.param_groups:
            hist = group.setdefault('sharpe_hist', [])
            hist.append(sharpe)
            del hist[:-5]

    def get_lr(self, epoch):
        """Return the scheduled (un-boosted) learning rate for ``epoch``."""
        group = self.param_groups[0]
        warmup, T_max, base_lr = group['warmup'], group['T_max'], group['lr']
        if epoch < warmup:
            return base_lr * (epoch + 1) / warmup
        span = T_max - warmup
        # Clamp so the cosine neither divides by zero (T_max == warmup) nor
        # climbs back up when training continues past T_max.
        progress = 1.0 if span <= 0 else min(1.0, (epoch - warmup) / span)
        return base_lr * 0.5 * (1 + math.cos(math.pi * progress))

    def _sharpe_boost(self):
        """LR multiplier from the relative change of the last two Sharpe values."""
        hist = self.param_groups[0].get('sharpe_hist', [])
        if len(hist) < 2:
            return 1.0
        improvement = (hist[-1] - hist[-2]) / (abs(hist[-2]) + 1e-8)
        return max(0.5, min(3.0, 1.0 + 3 * improvement))

    @torch.no_grad()
    def step(self, closure=None, epoch=None):
        """Perform one optimisation step.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss.
            epoch: Current epoch (0-based) used for the LR schedule. When
                omitted, the epoch from the previous call is reused (0 on the
                first call).

        Returns:
            The closure's loss, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        if epoch is None:
            epoch = self.param_groups[0]['t']

        # Schedule and boost do not depend on the group, so compute them once.
        lr = self.get_lr(epoch) * self._sharpe_boost()
        beta1, beta2 = 0.9, 0.999

        for group in self.param_groups:
            group['t'] = epoch
            weight_decay = group['weight_decay']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                state = self.state[p]
                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p)
                    state['exp_avg_sq'] = torch.zeros_like(p)
                state['step'] += 1
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                # Decoupled weight decay: shrink the weights directly rather
                # than adding the decay term to the gradient.
                if weight_decay != 0:
                    p.mul_(1 - lr * weight_decay)

                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                bias1 = 1 - beta1 ** state['step']
                bias2 = 1 - beta2 ** state['step']
                denom = (exp_avg_sq.sqrt() / math.sqrt(bias2)).add_(1e-8)
                step_size = lr / bias1
                p.addcdiv_(exp_avg, denom, value=-step_size)
        return loss

# --------------------------------------------------------------
#  MODEL
# --------------------------------------------------------------
class TradingNN(nn.Module):
    """Feed-forward network mapping a feature vector to one scalar output.

    Three hidden stages (256, 128, 64 units), each Linear -> BatchNorm1d ->
    GELU; the first two are followed by dropout. A final Linear layer produces
    a single value per sample.
    """

    def __init__(self, input_dim, dropout=0.3):
        super().__init__()
        # (width, followed_by_dropout) for each hidden stage, in order.
        hidden_stages = ((256, True), (128, True), (64, False))

        layers = []
        in_features = input_dim
        for width, with_dropout in hidden_stages:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.BatchNorm1d(width))
            layers.append(nn.GELU())
            if with_dropout:
                layers.append(nn.Dropout(dropout))
            in_features = width
        layers.append(nn.Linear(in_features, 1))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return one prediction per row of ``x`` (trailing unit dim removed)."""
        out = self.net(x)
        return out.squeeze(-1)

# --------------------------------------------------------------
#  DATASET
# --------------------------------------------------------------
class TradingDataset(Dataset):
    """In-memory dataset pairing feature rows with their targets."""

    def __init__(self, X, y):
        # from_numpy shares memory with the source arrays (no copy).
        self.X, self.y = torch.from_numpy(X), torch.from_numpy(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        target = self.y[idx]
        return features, target

def get_loader(X, y, shuffle=True):
    """Wrap ``X`` / ``y`` in a ``TradingDataset`` and return a ``DataLoader``.

    Batch size and worker count come from ``BATCH_SIZE`` and ``NUM_WORKERS``.
    Page-locked (pinned) host memory is requested only when CUDA is available:
    pinning has no benefit on a CPU-only host and makes PyTorch emit a warning.

    Args:
        X: Feature array (NumPy).
        y: Target array (NumPy).
        shuffle: Whether to reshuffle the data every epoch.
    """
    return DataLoader(
        TradingDataset(X, y),
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=NUM_WORKERS,
        pin_memory=torch.cuda.is_available(),
    )

# --------------------------------------------------------------
#  CV WITH FULL LOGGING
# --------------------------------------------------------------
def compute_cv_with_logging(X, y, fr, dates, splits):
    """Train one model per walk-forward fold and score it on its validation window.

    For every ``(train_idx, val_idx)`` pair a fresh ``TradingNN`` is trained
    for ``TRAIN_EPOCHS`` epochs with ``SharpeAwareAdamW`` and ``sharpe_loss``.
    From epoch ``EMA_START`` onward the weights are folded into an EMA once per
    epoch, and the EMA weights are copied into the model before validation.
    Validation predictions are rescaled so their standard deviation equals
    ``TARGET_VOL``, mapped to positions in [0, 2], and scored with ``score``
    using a zero risk-free rate.

    The optimizer's warm-up length and cosine horizon are taken from ``WARMUP``
    and ``TRAIN_EPOCHS`` so the LR schedule always spans the actual training
    length, rather than being duplicated as literals.

    Args:
        X: Feature matrix, rows in time order.
        y: Training targets aligned with ``X``.
        fr: Forward returns aligned with ``X`` (used only for scoring).
        dates: Date ids aligned with ``X`` (used only for logging).
        splits: Iterable of ``(train_idx, val_idx)`` index arrays.

    Returns:
        ``(fold_scores, all_models, all_vols)``: per-fold score, per-fold CPU
        copy of the model ``state_dict``, and per-fold standard deviation of
        the raw validation predictions.

    Raises:
        ValueError: If a fold has no training or no validation rows.
    """
    fold_scores = []
    all_models = []
    all_vols = []

    for fold, (tr_idx, val_idx) in enumerate(splits):
        if len(tr_idx) == 0 or len(val_idx) == 0:
            raise ValueError(f"Fold {fold + 1} has an empty train or validation set")

        print(f"\n{'='*80}")
        print(f"FOLD {fold+1:02d} | Val: {dates[val_idx.min()]}→{dates[val_idx.max()]} | Train: {len(tr_idx):,} | Val: {len(val_idx):,}")

        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]
        fr_val = fr[val_idx]

        # A distinct but reproducible seed per fold.
        set_deterministic(42 + fold)
        model = TradingNN(X.shape[1]).to(DEVICE)
        optimizer = SharpeAwareAdamW(model.parameters(), lr=3e-3, warmup=WARMUP,
                                     T_max=TRAIN_EPOCHS, weight_decay=1e-5)
        ema = EMA(model, decay=EMA_DECAY)
        train_loader = get_loader(X_tr, y_tr)
        val_loader = get_loader(X_val, y_val, shuffle=False)

        print(f"Training {TRAIN_EPOCHS} epochs | EMA from epoch {EMA_START}")
        for epoch in range(TRAIN_EPOCHS):
            model.train()
            epoch_loss = 0.0
            n_batches = 0
            grad_norm = 0.0
            for x, yt in train_loader:
                x, yt = x.to(DEVICE), yt.to(DEVICE)
                pred = model(x)
                loss = sharpe_loss(pred, yt)
                optimizer.zero_grad()
                loss.backward()
                # clip_grad_norm_ returns the total norm measured before clipping.
                grad_norm += torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0).item()
                optimizer.step(epoch=epoch)
                # Accumulate the negated loss, so larger is better.
                epoch_loss -= loss.item()
                n_batches += 1

            # Reported as "Sharpe": the negated mean batch loss, which also
            # contains the volatility penalty term.
            epoch_sharpe = epoch_loss / n_batches
            optimizer.set_epoch_sharpe(epoch_sharpe)
            grad_norm /= n_batches

            if epoch >= EMA_START:
                ema.update(model)

            if epoch < 5 or (epoch + 1) % 10 == 0:
                # Scheduled LR only; the Sharpe boost applied inside step() is not included.
                lr = optimizer.get_lr(epoch)
                print(f"  E{epoch+1:02d} | Sharpe: {epoch_sharpe:.6f} | LR: {lr:.2e} | Grad: {grad_norm:.4f}")

        ema.apply(model)

        # Val
        model.eval()
        with torch.inference_mode():
            val_pred = np.concatenate([model(x.to(DEVICE)).cpu().numpy() for x, _ in val_loader])
        pred_std = np.std(val_pred)
        # Rescale so the predictions' std equals TARGET_VOL before squashing.
        scale = TARGET_VOL / (pred_std + 1e-6)
        val_pred_scaled = val_pred * scale
        position = np.clip(np.tanh(val_pred_scaled) * 2, 0, 2)

        sol = pd.DataFrame({'risk_free_rate': np.zeros_like(fr_val), 'forward_returns': fr_val})
        sub = pd.DataFrame({'prediction': position})
        fold_score = score(sol, sub, '')

        fold_scores.append(fold_score)
        all_models.append({k: v.cpu().clone() for k, v in model.state_dict().items()})
        all_vols.append(pred_std)

        print(f"  Val Pred: μ={val_pred.mean():.6f} σ={pred_std:.6f} → scale={scale:.3f}")
        print(f"  Position: μ={position.mean():.6f} σ={position.std():.6f}")
        print(f"  FOLD SCORE: {fold_score:.6f}")

    print(f"\nCV SCORES: {[f'{s:.6f}' for s in fold_scores]}")
    print(f"CV MEAN: {np.mean(fold_scores):.6f} | STD: {np.std(fold_scores):.6f}")
    return fold_scores, all_models, all_vols

# --------------------------------------------------------------
#  MAIN
# --------------------------------------------------------------
# Load the data and determine which feature columns train and test share.
train_raw = load_full_train()
test_raw  = pl.read_csv(DATA_PATH / "test.csv")
FEATURES  = get_features(train_raw, test_raw)
print(f"Common features: {len(FEATURES)}")

df_all = create_example_dataset(train_raw)

# In LB_RUN mode, date_id 8810..8989 is removed from training and kept as a
# local hold-out set; otherwise every row is used for training.
held_out_df = None
if LB_RUN:
    train_df = df_all.filter(pl.col('date_id') < 8810)
    held_out_df = df_all.filter((pl.col('date_id') >= 8810) & (pl.col('date_id') <= 8989))
else:
    train_df = df_all

X, y, fr_arr = prepare_training_data(train_df, FEATURES)
date_arr = train_df['date_id'].to_numpy()
splits = walk_forward_split(len(X))

# CV
print("\n=== SOTA CV ===")
cv_scores, fold_models, fold_vols = compute_cv_with_logging(X, y, fr_arr, date_arr, splits)

# Final Model
# Element-wise mean of the fold models' floating-point state_dict tensors.
# Non-floating entries are skipped and therefore absent from `final_state`.
# NOTE(review): each fold is trained from a different seed, so this averages
# weights of independently initialised networks - confirm this is intended
# rather than averaging the folds' predictions.
print("\nAveraging all 10 EMA models...")
final_state = {}
first = fold_models[0]
for key in first.keys():
    if torch.is_floating_point(first[key]):
        stacked = torch.stack([m[key] for m in fold_models])
        final_state[key] = stacked.mean(0)

# Persist the averaged weights; the inference section reloads this file.
np.savez('final_model.npz', **{k: v.cpu().numpy() for k, v in final_state.items()})

# Local LB
# Score the averaged model on the hold-out window with the same
# rescale -> tanh -> clip pipeline used for the validation folds.
if held_out_df is not None:
    print("\n=== LOCAL LB ===")
    X_ho = held_out_df.select(FEATURES).to_numpy().astype(np.float32)
    fr_ho = held_out_df.get_column('forward_returns').to_numpy().astype(np.float32)
    # Targets are unused at prediction time, so a zero array stands in for y.
    ho_loader = get_loader(X_ho, np.zeros(len(X_ho), dtype=np.float32), shuffle=False)

    model = TradingNN(X.shape[1]).to(DEVICE)
    model.load_state_dict(final_state)
    model.eval()

    with torch.inference_mode():
        pred_ho = np.concatenate([model(x.to(DEVICE)).cpu().numpy() for x, _ in ho_loader])
    pred_std = np.std(pred_ho)
    scale = TARGET_VOL / (pred_std + 1e-6)
    pred_ho_scaled = pred_ho * scale
    position = np.clip(np.tanh(pred_ho_scaled) * 2, 0, 2)

    # The risk-free rate is set to zero for this local score.
    sol = pd.DataFrame({'risk_free_rate': np.zeros_like(fr_ho), 'forward_returns': fr_ho})
    sub = pd.DataFrame({'prediction': position})
    lb = score(sol, sub, '')
    print(f"LOCAL LB PRED STD: {pred_std:.6f} → scale={scale:.3f}")
    print(f"LOCAL LB SCORE = {lb:.6f}")

# --------------------------------------------------------------
#  INFERENCE – ONLINE VOL
# --------------------------------------------------------------
# Reopen the averaged weights written by np.savez; read by predict() below.
final_state_np = np.load('final_model.npz')

class InferenceState:
    """Rolling history of raw predictions used for online volatility scaling."""

    def __init__(self):
        # Most recent predictions, oldest first; capped at 50 entries.
        self.buffer = []

    def update_vol(self, pred):
        """Record ``pred`` and return the volatility estimate to scale by.

        Returns the standard deviation of the latest 20 predictions, or 1.0
        while fewer than 20 have been recorded.
        """
        history = self.buffer
        history.append(pred)
        if len(history) > 50:
            del history[0]

        if len(history) < 20:
            return 1.0
        return np.std(history[-20:])

state = InferenceState()

# Cache of ready-to-use inference models, keyed by input width. Building the
# network and reading the weights out of the lazily-loaded .npz archive on
# every call is pure overhead: the weights never change while serving.
_INFERENCE_MODELS = {}


def _get_inference_model(input_dim):
    """Return the eval-mode network with the saved weights, built once per width."""
    model = _INFERENCE_MODELS.get(input_dim)
    if model is None:
        model = TradingNN(input_dim).to(DEVICE)
        weights = {k: torch.from_numpy(v).to(DEVICE) for k, v in final_state_np.items()}
        model.load_state_dict(weights)
        model.eval()
        _INFERENCE_MODELS[input_dim] = model
    return model


def predict(test: pl.DataFrame) -> float:
    """Return the position in [0, 2] for the first row of ``test``.

    The row's features are run through the averaged model, and the raw output
    is divided by the rolling volatility estimate from ``state`` (scaled to
    ``TARGET_VOL``). The result is squashed with ``2 * tanh(.)`` and clipped
    to [0, 2].

    Side effect: every call appends the raw prediction to ``state``, so calls
    are expected to arrive in chronological order.
    """
    df = create_example_dataset(test)
    X_test = df.select(FEATURES).to_numpy().astype(np.float32)
    x_t = torch.from_numpy(X_test[:1]).to(DEVICE)

    model = _get_inference_model(X_test.shape[1])

    with torch.inference_mode():
        p = model(x_t).cpu().numpy()[0]

    vol = state.update_vol(p)
    p_scaled = p * (TARGET_VOL / (vol + 1e-6))
    return float(np.clip(np.tanh(p_scaled) * 2, 0, 2))

# Hand predict() to the competition's inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)
# When the KAGGLE_IS_COMPETITION_RERUN environment variable is set, serve
# requests; otherwise run the local gateway against the competition data
# directory.
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))