# =========================================================================================
### TITLE: Hull Tactical - Advanced Online Ensemble (XGB+LGBM+CAT)
### AUTHOR: AI Machine Learning Engineer
### DESCRIPTION: 
### This notebook implements a State-of-the-Art (SOTA) approach for financial time-series 
### forecasting. It utilizes an Online Learning strategy where the model retrains/updates 
### incrementally as new market data arrives via the API. This adapts to 'Concept Drift' 
### in financial markets.
###
### STRATEGY:
### 1. Data Processing: Polars for high-speed I/O, Pandas for model compatibility.
### 2. Feature Engineering: Lag features and rolling window statistics.
### 3. Model Architecture: Weighted ensemble of an online SGD linear model, LightGBM, and CatBoost.
### 4. Inference Strategy: "Walk-Forward" validation and retraining loop via Kaggle API.
### 5. Edit: added CatBoost to the ensemble.
### =========================================================================================

In [None]:
# =========================================================================================
# TITLE: Hull Tactical - Gen3 Hybrid SOTA (Linear + Boost + Volatility Scaling)
# AUTHOR: AI Machine Learning Engineer
# STRATEGY:
# 1. Hybrid Model: L2-regularized SGDRegressor (online learning) + LightGBM and CatBoost (non-linear patterns).
# 2. Advanced Features: Rolling Volatility & Momentum (RSI-like).
# 3. Volatility Targeting: Reduces bet size when market risk is high (The Gold Medal Key).
# =========================================================================================
import os
import warnings
import numpy as np
import pandas as pd
import polars as pl

from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import kaggle_evaluation.default_inference_server

warnings.filterwarnings("ignore")

In [None]:
# -----------------------------------------------------------------------------------------
# 1. CONFIGURATION
# -----------------------------------------------------------------------------------------
class Config:
    """Static hyper-parameters shared by training and inference."""

    # Random seed passed to every model for reproducibility.
    SEED = 42

    # --- Ensemble blend weights (one per base model) ---
    W_LGBM   = 0.4   # LightGBM
    W_CAT    = 0.3   # CatBoost
    W_LINEAR = 0.3   # SGDRegressor

    # --- Online learning ---
    # Constant learning rate (eta0) used by the SGDRegressor.
    SGD_LR = 0.001

    # --- Volatility targeting ---
    # Target daily volatility (0.5%).
    # NOTE(review): not referenced by the code visible in this file section.
    TARGET_VOL = 0.005
    # Upper bound on the position. Lowered to 1.2 with the intent of keeping
    # strategy volatility at roughly <= 1.2x market volatility.
    MAX_LEVERAGE = 1.2

    # --- Position sensitivity ---
    # Scale that converts (prediction / volatility) into a position offset.
    # Too large and volatility explodes; too small and the position stays
    # close to 1. Tune if needed.
    SIGNAL_SCALE = 5.0


In [None]:
# -----------------------------------------------------------------------------------------
# 2. FEATURE ENGINEERING
# -----------------------------------------------------------------------------------------
def _known_lag1(df: pd.DataFrame, col: str):
    """Return the one-step-lagged series of *col*, or None if it is unavailable.

    Two sources are combined:
      * ``df[col].shift(1)`` when *col* itself is present (training rows);
      * ``df['lagged_<col>']`` when that column is present (rows that only
        carry the already-lagged value).
    Where both exist, the explicit ``lagged_<col>`` value wins and the shifted
    value only fills its gaps.
    """
    shifted = df[col].shift(1) if col in df.columns else None
    provided_name = f'lagged_{col}'
    if provided_name not in df.columns:
        return shifted
    provided = df[provided_name]
    if shifted is None:
        return provided
    return provided.where(provided.notna(), shifted)


def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Add lag, volatility, momentum and z-score features to a copy of *df*.

    Args:
        df: Frame ordered oldest-to-newest. May contain ``forward_returns`` /
            ``risk_free_rate`` and/or their pre-lagged counterparts
            ``lagged_forward_returns`` / ``lagged_risk_free_rate``.

    Returns:
        A new DataFrame with the original columns plus ``lag_<col>_<k>``
        (k in 1, 2, 3, 5, 10), ``vol_5d``, ``vol_22d``, ``mom_5d``,
        ``mom_22d`` and ``zscore_22``. All NaNs are replaced by 0.0.

    When no ``lagged_*`` column is present the output is identical to plain
    ``shift(k)`` lags. When rows lack the raw column but carry ``lagged_<col>``
    (as happens when live rows are appended to training history), the lags and
    rolling statistics are built from the provided lagged value instead of
    silently degrading to zeros.
    """
    df = df.copy()

    # Columns to build lags from (the raw column may be missing on live rows).
    targets = ['forward_returns', 'risk_free_rate']
    lags = [1, 2, 3, 5, 10]

    # 1. Lag features: lag k is lag 1 shifted a further (k - 1) rows, which
    #    equals df[col].shift(k) whenever only the raw column is available.
    for col in targets:
        lag1 = _known_lag1(df, col)
        if lag1 is None:
            continue
        for lag in lags:
            df[f'lag_{col}_{lag}'] = lag1.shift(lag - 1)

    # 2. Volatility features (recent risk), based on the last known return.
    base_col = 'lag_forward_returns_1'
    # Safety net: if no return information exists at all, use a zero column.
    if base_col not in df.columns:
        df[base_col] = 0.0

    df['vol_5d']  = df[base_col].rolling(5).std()
    df['vol_22d'] = df[base_col].rolling(22).std()  # roughly one month

    # 3. Momentum features
    df['mom_5d']  = df[base_col].rolling(5).mean()
    df['mom_22d'] = df[base_col].rolling(22).mean()

    # 4. Z-score: how unusual the latest return is versus the 22-day window.
    #    The epsilon avoids division by zero when volatility is 0.
    df['zscore_22'] = (df[base_col] - df['mom_22d']) / (df['vol_22d'] + 1e-8)

    # 5. Missing values (warm-up rows of shifts/rolling windows) become 0.
    df = df.fillna(0.0)
    return df


In [None]:
# -----------------------------------------------------------------------------------------
# 3. DATA LOADING
# -----------------------------------------------------------------------------------------
def load_data(path: str) -> pd.DataFrame:
    """Read a CSV with Polars and return it as a pandas DataFrame.

    Every column except 'date_id' is cast to Float64 with ``strict=False``
    and its nulls are replaced by 0.0; 'date_id' is left as read.

    Args:
        path: Location of the CSV file.

    Returns:
        The converted pandas DataFrame.
    """
    print(f"Loading {path}...")
    frame = pl.read_csv(path)
    # One cast-and-fill expression per non-id column.
    numeric_exprs = [
        pl.col(name).cast(pl.Float64, strict=False).fill_null(0.0)
        for name in frame.columns
        if name != 'date_id'
    ]
    return frame.with_columns(numeric_exprs).to_pandas()


TRAIN_PATH = "/kaggle/input/hull-tactical-market-prediction/train.csv"
TARGET = "forward_returns"
# Columns that must never be used as model inputs (ids, flags, targets).
DROP_COLS = [
    'date_id', 'is_scored',
    'forward_returns', 'risk_free_rate',
    'market_forward_excess_returns'
]

# Load, drop the final 180 rows, build features, then discard the first 25
# rows where the lag features are not yet fully populated.
train_df = (
    feature_engineering(load_data(TRAIN_PATH).iloc[:-180])
    .iloc[25:]
    .reset_index(drop=True)
)

# Everything that is not an id/flag/target column is a model feature.
FEATURES = [name for name in train_df.columns if name not in DROP_COLS]
print(f"Features Created: {len(FEATURES)}")

X, y = train_df[FEATURES], train_df[TARGET]


In [None]:
# -----------------------------------------------------------------------------------------
# 4. HYBRID MODEL TRAINING
# -----------------------------------------------------------------------------------------

print("Training 3 base models (SGD, LGBM, CatBoost)...")

# Hyper-parameters for each base model, kept together for easy comparison.
_sgd_params = dict(
    loss='squared_error',
    penalty='l2',
    alpha=0.01,
    learning_rate='constant',
    eta0=Config.SGD_LR,
    random_state=Config.SEED,
)
_lgbm_params = dict(
    n_estimators=600,
    learning_rate=0.02,
    max_depth=5,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=Config.SEED,
    n_jobs=-1,
    verbose=-1,
)
_cat_params = dict(
    depth=6,
    learning_rate=0.03,
    iterations=800,
    loss_function='RMSE',
    random_seed=Config.SEED,
    verbose=False,
)

# 4-1. Linear model (SGDRegressor, supports online updates via partial_fit).
#      It is trained on standardized inputs; the fitted scaler is reused at
#      inference time.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

linear_model = SGDRegressor(**_sgd_params)
linear_model.fit(X_scaled, y)

# 4-2. LightGBM on the raw (unscaled) features.
lgbm_model = LGBMRegressor(**_lgbm_params)
lgbm_model.fit(X, y)

# 4-3. CatBoost on the raw (unscaled) features.
cat_model = CatBoostRegressor(**_cat_params)
cat_model.fit(X, y)

print("All base models trained.")

In [None]:
# -----------------------------------------------------------------------------------------
# 5. INFERENCE LOOP WITH VOLATILITY-AWARE SCALING
# -----------------------------------------------------------------------------------------
# Rolling window of recent rows used to compute lag/rolling features online.
GLOBAL_HISTORY = train_df.iloc[-50:].copy()
# Number of predict() calls served so far.
STEP = 0
# Scaled feature row from the previous predict() call. It is paired with the
# return revealed one step later for the online SGD update. None until the
# first call has completed.
PREV_X_SCALED = None


def predict(test_pl: pl.DataFrame) -> float:
    """Return the position (allocation) for the newest row supplied by the gateway.

    Steps performed on every call:
      1. Cast the incoming row to floats and append it to ``GLOBAL_HISTORY``.
      2. Update the SGD model with (previous step's features, the return
         revealed now in ``lagged_forward_returns``), if both are available.
      3. Blend the three base-model predictions with the ``Config`` weights.
      4. Convert prediction / volatility into a position and clip it.

    Args:
        test_pl: Polars frame holding the newest observation.

    Returns:
        Allocation as a float in ``[0.0, Config.MAX_LEVERAGE]``.

    Side effects:
        Mutates the module-level ``GLOBAL_HISTORY``, ``STEP`` and
        ``PREV_X_SCALED`` and may update ``linear_model`` in place.
    """
    global GLOBAL_HISTORY, STEP, PREV_X_SCALED

    # 1. Input handling: everything except the id column becomes Float64,
    #    with nulls replaced by 0.
    cols = [c for c in test_pl.columns if c != 'date_id']
    test_pl = test_pl.with_columns(
        [pl.col(c).cast(pl.Float64, strict=False).fill_null(0.0) for c in cols]
    )
    test_df_raw = test_pl.to_pandas()

    # 2. Online learning (SGD only). `lagged_forward_returns` on this row is
    #    the outcome of the PREVIOUS step, so it must be paired with the
    #    previous step's features, not the current ones. Pairing it with the
    #    current row would train the model to reproduce its own lag feature.
    #    Doing the update before predicting lets this step's forecast benefit.
    if (
        PREV_X_SCALED is not None
        and 'lagged_forward_returns' in test_df_raw.columns
        and len(test_df_raw) > 0
    ):
        realised = float(test_df_raw['lagged_forward_returns'].values[0])
        if np.isfinite(realised):
            linear_model.partial_fit(PREV_X_SCALED, [realised])

    # 3. Update history and rebuild features; only the newest row is scored.
    GLOBAL_HISTORY = pd.concat([GLOBAL_HISTORY, test_df_raw], axis=0, ignore_index=True)
    full_features = feature_engineering(GLOBAL_HISTORY)
    current_features = full_features.iloc[[-1]][FEATURES]

    # 4. Predictions from the three models, blended with fixed weights.
    curr_X_scaled = scaler.transform(current_features)
    pred_linear = linear_model.predict(curr_X_scaled)[0]
    pred_lgbm  = lgbm_model.predict(current_features)[0]
    pred_cat   = cat_model.predict(current_features)[0]

    raw_return_pred = (
        Config.W_LINEAR * pred_linear +
        Config.W_LGBM   * pred_lgbm   +
        Config.W_CAT    * pred_cat
    )

    # ---------------------------------------------------------------------
    # 5. Volatility-based position sizing (conservative variant)
    # ---------------------------------------------------------------------
    # Current market volatility (22-day rolling); fall back to 0.5% when the
    # feature is missing, non-finite or effectively zero.
    current_vol = current_features['vol_22d'].values[0] if 'vol_22d' in current_features.columns else 0.005
    if not np.isfinite(current_vol) or current_vol < 1e-6:
        current_vol = 0.005

    # Signal: predicted return divided by volatility (Sharpe-like quantity).
    signal = raw_return_pred / (current_vol + 1e-8)

    # Cap extreme signals to prevent excessive leverage.
    signal = np.clip(signal, -3.0, 3.0)

    # Base position: start at 1.0 (neutral) and move proportionally to signal.
    allocation = 1.0 + Config.SIGNAL_SCALE * signal

    # Crash protection: when 22-day momentum is strongly negative, do not go
    # above the neutral position.
    mom_22 = current_features['mom_22d'].values[0] if 'mom_22d' in current_features.columns else 0.0
    if mom_22 < -0.01 and allocation > 1.0:
        allocation = 1.0

    # Clip to [0, MAX_LEVERAGE]; the cap is intended to keep strategy
    # volatility at roughly <= MAX_LEVERAGE x market volatility.
    allocation = float(np.clip(allocation, 0.0, Config.MAX_LEVERAGE))

    # Remember this step's features for the next call's online update.
    PREV_X_SCALED = curr_X_scaled

    # History management: keep the frame bounded.
    if len(GLOBAL_HISTORY) > 200:
        GLOBAL_HISTORY = GLOBAL_HISTORY.iloc[-100:]

    STEP += 1
    return allocation

In [None]:
# -----------------------------------------------------------------------------------------
# 6. SERVER START
# -----------------------------------------------------------------------------------------
# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Outside a competition rerun (environment variable unset or empty) run the
# local gateway against the input directory; otherwise serve requests.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))
else:
    inference_server.serve()