# LightGBM Training Pipeline

## ⚙️ Runtime: CPU High-RAM (~0.01 CU)
**Menu: Runtime → Change runtime type → CPU + High-RAM**

## Feature Engineering v2.0
- **~150 features** across 9 categories
- Price Action, Volatility (Parkinson, GK, ATR)
- Volume Profile (CVD, VWAP, Trade Count)
- Microstructure (OFI, Amihud, Spread)
- Momentum (MACD, RSI, ADX, Stoch)
- Mean Reversion (BB, Keltner, Z-score)
- Time (Sessions, Day-of-Week)
- Statistical (Skewness, Kurtosis, Hurst)

## Anti-Leakage Guarantees
1. **Per-Symbol Temporal Split** (70/15/15)
2. **Scaler Fit on Train Only**
3. **All Features Backward-Looking**

## Output
- `trained/lightgbm_model.onnx`
- `trained/lightgbm_metadata.json`

In [1]:
!pip install -q lightgbm onnx onnxruntime onnxmltools skl2onnx requests
print("✓ Dependencies installed!")

✓ Dependencies installed!


In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import requests
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from pathlib import Path
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
import json, time, warnings
warnings.filterwarnings('ignore')

TRAINED_DIR = Path("trained")
TRAINED_DIR.mkdir(parents=True, exist_ok=True)
print("✓ Setup complete!")

✓ Setup complete!


## 1. Data Collection

In [3]:
def fetch_klines_sync(symbol: str, interval: str = "1m", days: int = 90) -> pd.DataFrame:
    """Fetch historical data from Binance"""
    base_url = "https://api.binance.com/api/v3/klines"
    end_time = datetime.utcnow()
    start_time = end_time - timedelta(days=days)
    all_data = []
    current = start_time
    while current < end_time:
        params = {
            "symbol": symbol, "interval": interval,
            "startTime": int(current.timestamp() * 1000),
            "endTime": int(min(current + timedelta(days=1), end_time).timestamp() * 1000),
            "limit": 1440
        }
        try:
            resp = requests.get(base_url, params=params, timeout=30)
            data = resp.json()
            if isinstance(data, list): all_data.extend(data)
        except Exception as e:
            print(f"  Warning: {symbol} fetch error: {e}")
        current += timedelta(days=1)
        time.sleep(0.1)
    if not all_data: return pd.DataFrame()
    cols = ["open_time","open","high","low","close","volume","close_time",
            "quote_volume","trades","taker_buy_base","taker_buy_quote","ignore"]
    df = pd.DataFrame(all_data, columns=cols)
    df["open_time"] = pd.to_datetime(df["open_time"], unit="ms")
    for c in ["open","high","low","close","volume","quote_volume","trades","taker_buy_base","taker_buy_quote"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    df["symbol"] = symbol
    return df.drop_duplicates(subset=["open_time"]).sort_values("open_time").reset_index(drop=True)

In [4]:
SYMBOLS = ["BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT"]
print("Collecting data from Binance...")
all_data = []
for symbol in tqdm(SYMBOLS):
    df = fetch_klines_sync(symbol, days=90)
    if len(df) > 0:
        all_data.append(df)
        print(f"  ✓ {symbol}: {len(df):,} rows")
if not all_data: raise ValueError("No data!")
raw_data = pd.concat(all_data, ignore_index=True)
print(f"\n✓ Total: {len(raw_data):,} rows")

Collecting data from Binance...


  0%|          | 0/4 [00:00<?, ?it/s]

  ✓ BTCUSDT: 90,000 rows
  ✓ ETHUSDT: 90,000 rows
  ✓ BNBUSDT: 90,000 rows
  ✓ SOLUSDT: 90,000 rows

✓ Total: 360,000 rows


## 2. Advanced Feature Engineering (~150 features)

In [5]:
def calculate_comprehensive_features(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate ~150 institutional-grade features"""
    df = df.copy()
    ann_factor = np.sqrt(252 * 24 * 60)

    # =====================================================================
    # 1. RETURNS & PRICE ACTION
    # =====================================================================
    df["log_return"] = np.log(df["close"] / df["close"].shift(1))
    df["return_1"] = df["close"].pct_change(1)

    for w in [5, 10, 20, 50, 100, 200]:
        df[f"return_{w}"] = df["close"].pct_change(w)
        df[f"log_return_{w}"] = np.log(df["close"] / df["close"].shift(w))

    # Risk-adjusted returns
    for w in [20, 50]:
        vol = df["log_return"].rolling(w).std()
        df[f"sharpe_{w}"] = df[f"return_{w}"] / (vol * np.sqrt(w) + 1e-10)

    # =====================================================================
    # 2. VOLATILITY (Multiple Estimators)
    # =====================================================================
    for w in [5, 10, 20, 50, 100]:
        df[f"volatility_{w}"] = df["log_return"].rolling(w).std() * ann_factor

    # Parkinson volatility
    for w in [20, 50]:
        log_hl = np.log(df["high"] / df["low"])
        df[f"parkinson_vol_{w}"] = np.sqrt((1/(4*np.log(2))) * (log_hl**2).rolling(w).mean()) * ann_factor

    # Garman-Klass volatility
    for w in [20, 50]:
        log_hl = np.log(df["high"] / df["low"])
        log_co = np.log(df["close"] / df["open"])
        gk = 0.5 * log_hl**2 - (2*np.log(2)-1) * log_co**2
        df[f"gk_vol_{w}"] = np.sqrt(gk.rolling(w).mean()) * ann_factor

    # ATR
    for w in [14, 20, 50]:
        tr = pd.concat([df["high"]-df["low"], abs(df["high"]-df["close"].shift(1)), abs(df["low"]-df["close"].shift(1))], axis=1).max(axis=1)
        df[f"atr_{w}"] = tr.rolling(w).mean()
        df[f"atr_pct_{w}"] = df[f"atr_{w}"] / df["close"] * 100

    # Volatility regime
    df["vol_regime"] = df["volatility_20"] / (df["volatility_100"] + 1e-10)
    df["vol_zscore"] = (df["volatility_20"] - df["volatility_100"]) / (df["volatility_100"].rolling(50).std() + 1e-10)

    # =====================================================================
    # 3. VOLUME FEATURES (CRITICAL FOR CRYPTO)
    # =====================================================================
    for w in [5, 10, 20, 50, 100]:
        df[f"volume_ma_{w}"] = df["volume"].rolling(w).mean()

    df["rvol_20"] = df["volume"] / (df["volume"].rolling(20).mean() + 1e-10)
    df["rvol_50"] = df["volume"] / (df["volume"].rolling(50).mean() + 1e-10)
    df["volume_zscore"] = (df["volume"] - df["volume"].rolling(50).mean()) / (df["volume"].rolling(50).std() + 1e-10)

    # VWAP
    tp = (df["high"] + df["low"] + df["close"]) / 3
    for w in [20, 50]:
        cum_vol = df["volume"].rolling(w).sum()
        cum_tp_vol = (tp * df["volume"]).rolling(w).sum()
        vwap = cum_tp_vol / (cum_vol + 1e-10)
        df[f"vwap_dist_{w}"] = (df["close"] - vwap) / vwap * 100

    # CVD (Cumulative Volume Delta)
    volume_delta = df["taker_buy_base"] - (df["volume"] - df["taker_buy_base"])
    for w in [10, 20, 50]:
        df[f"cvd_{w}"] = volume_delta.rolling(w).sum()
        df[f"cvd_norm_{w}"] = df[f"cvd_{w}"] / (df["volume"].rolling(w).sum() + 1e-10)

    # Trade count features
    for w in [10, 20, 50]:
        df[f"trades_ma_{w}"] = df["trades"].rolling(w).mean()
    df["trades_zscore"] = (df["trades"] - df["trades"].rolling(50).mean()) / (df["trades"].rolling(50).std() + 1e-10)
    df["avg_trade_size"] = df["volume"] / (df["trades"] + 1)
    df["avg_trade_size_ratio"] = df["avg_trade_size"] / (df["avg_trade_size"].rolling(50).mean() + 1e-10)

    # Dollar volume
    df["dollar_vol_ratio"] = df["quote_volume"] / (df["quote_volume"].rolling(20).mean() + 1e-10)

    # =====================================================================
    # 4. MICROSTRUCTURE
    # =====================================================================
    df["spread_bps"] = (df["high"] - df["low"]) / df["close"] * 10000
    df["spread_zscore"] = (df["spread_bps"] - df["spread_bps"].rolling(20).mean()) / (df["spread_bps"].rolling(50).std() + 1e-10)

    df["ofi"] = df["taker_buy_base"] / (df["volume"] + 1e-10)
    df["ofi_ma_10"] = df["ofi"].rolling(10).mean()
    df["ofi_ma_20"] = df["ofi"].rolling(20).mean()

    for w in [10, 20, 50]:
        df[f"buy_pressure_{w}"] = df["taker_buy_base"].rolling(w).sum() / (df["volume"].rolling(w).sum() + 1e-10)

    df["amihud"] = abs(df["return_1"]) / (df["quote_volume"] / 1e6 + 1e-10)
    df["amihud_ma"] = df["amihud"].rolling(20).mean()

    # =====================================================================
    # 5. MOMENTUM & TREND
    # =====================================================================
    for w in [5, 10, 20, 50, 100, 200]:
        ma = df["close"].rolling(w).mean()
        df[f"ma_dist_{w}"] = (df["close"] - ma) / ma * 100

    # EMA
    for w in [12, 26, 50]:
        ema = df["close"].ewm(span=w, adjust=False).mean()
        df[f"ema_dist_{w}"] = (df["close"] - ema) / ema * 100

    # MACD
    ema12 = df["close"].ewm(span=12, adjust=False).mean()
    ema26 = df["close"].ewm(span=26, adjust=False).mean()
    df["macd"] = ema12 - ema26
    df["macd_signal"] = df["macd"].ewm(span=9, adjust=False).mean()
    df["macd_hist"] = df["macd"] - df["macd_signal"]

    # RSI
    for w in [7, 14, 21]:
        delta = df["close"].diff()
        gain = delta.where(delta > 0, 0).rolling(w).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(w).mean()
        df[f"rsi_{w}"] = 100 - (100 / (1 + gain / (loss + 1e-10)))
        df[f"rsi_{w}_norm"] = (df[f"rsi_{w}"] - 50) / 50

    # Stochastic RSI
    rsi = df["rsi_14"]
    rsi_min = rsi.rolling(14).min()
    rsi_max = rsi.rolling(14).max()
    df["stoch_rsi"] = (rsi - rsi_min) / (rsi_max - rsi_min + 1e-10)
    df["stoch_rsi_k"] = df["stoch_rsi"].rolling(3).mean()
    df["stoch_rsi_d"] = df["stoch_rsi_k"].rolling(3).mean()

    # Williams %R
    for w in [14, 21]:
        highest = df["high"].rolling(w).max()
        lowest = df["low"].rolling(w).min()
        df[f"williams_r_{w}"] = -100 * (highest - df["close"]) / (highest - lowest + 1e-10)

    # ADX
    for w in [14, 20]:
        plus_dm = df["high"].diff()
        minus_dm = -df["low"].diff()
        plus_dm = plus_dm.where((plus_dm > minus_dm) & (plus_dm > 0), 0)
        minus_dm = minus_dm.where((minus_dm > plus_dm) & (minus_dm > 0), 0)
        tr = pd.concat([df["high"]-df["low"], abs(df["high"]-df["close"].shift(1)), abs(df["low"]-df["close"].shift(1))], axis=1).max(axis=1)
        atr = tr.rolling(w).mean()
        plus_di = 100 * (plus_dm.rolling(w).mean() / (atr + 1e-10))
        minus_di = 100 * (minus_dm.rolling(w).mean() / (atr + 1e-10))
        dx = 100 * abs(plus_di - minus_di) / (plus_di + minus_di + 1e-10)
        df[f"adx_{w}"] = dx.rolling(w).mean()

    # CCI
    tp = (df["high"] + df["low"] + df["close"]) / 3
    tp_ma = tp.rolling(20).mean()
    tp_std = tp.rolling(20).std()
    df["cci_20"] = (tp - tp_ma) / (0.015 * tp_std + 1e-10)

    # =====================================================================
    # 6. MEAN REVERSION
    # =====================================================================
    for w in [20, 50]:
        ma = df["close"].rolling(w).mean()
        std = df["close"].rolling(w).std()
        bb_upper = ma + 2 * std
        bb_lower = ma - 2 * std
        df[f"bb_width_{w}"] = (bb_upper - bb_lower) / ma * 100
        df[f"bb_position_{w}"] = (df["close"] - bb_lower) / (bb_upper - bb_lower + 1e-10)

    for w in [20, 50, 100]:
        ma = df["close"].rolling(w).mean()
        std = df["close"].rolling(w).std()
        df[f"price_zscore_{w}"] = (df["close"] - ma) / (std + 1e-10)

    # =====================================================================
    # 7. TIME FEATURES
    # =====================================================================
    hour = df["open_time"].dt.hour
    df["hour_sin"] = np.sin(2 * np.pi * hour / 24)
    df["hour_cos"] = np.cos(2 * np.pi * hour / 24)

    dow = df["open_time"].dt.dayofweek
    df["dow_sin"] = np.sin(2 * np.pi * dow / 7)
    df["dow_cos"] = np.cos(2 * np.pi * dow / 7)

    df["is_asia"] = ((hour >= 0) & (hour < 8)).astype(int)
    df["is_europe"] = ((hour >= 7) & (hour < 16)).astype(int)
    df["is_us"] = ((hour >= 13) & (hour < 22)).astype(int)
    df["is_weekend"] = (dow >= 5).astype(int)

    # =====================================================================
    # 8. STATISTICAL FEATURES
    # =====================================================================
    for w in [20, 50]:
        df[f"skewness_{w}"] = df["log_return"].rolling(w).skew()
        df[f"kurtosis_{w}"] = df["log_return"].rolling(w).kurt()

    # =====================================================================
    # 9. PRICE PATTERNS
    # =====================================================================
    for w in [20, 50, 100]:
        highest = df["high"].rolling(w).max()
        lowest = df["low"].rolling(w).min()
        df[f"dist_high_{w}"] = (df["close"] - highest) / highest * 100
        df[f"dist_low_{w}"] = (df["close"] - lowest) / lowest * 100
        df[f"range_pos_{w}"] = (df["close"] - lowest) / (highest - lowest + 1e-10)

    return df


def get_feature_columns(df: pd.DataFrame) -> list:
    exclude = ["open_time","close_time","symbol","ignore","open","high","low","close",
               "volume","quote_volume","trades","taker_buy_base","taker_buy_quote"]
    return [c for c in df.columns if c not in exclude and not c.startswith("target_")]

In [6]:
# Process each symbol SEPARATELY
TARGET_COL = "target_return_5"
WARMUP = 200  # Increased for longer rolling windows

processed_by_symbol = {}
for symbol in raw_data["symbol"].unique():
    sdf = raw_data[raw_data["symbol"] == symbol].copy()
    sdf = sdf.sort_values("open_time").reset_index(drop=True)
    sdf = calculate_comprehensive_features(sdf)
    sdf[TARGET_COL] = sdf["close"].shift(-5) / sdf["close"] - 1
    sdf = sdf.replace([np.inf, -np.inf], np.nan).iloc[WARMUP:].dropna()
    processed_by_symbol[symbol] = sdf
    print(f"{symbol}: {len(sdf):,} rows")

print(f"\n✓ Total: {sum(len(df) for df in processed_by_symbol.values()):,} rows")

BTCUSDT: 89,793 rows
ETHUSDT: 89,795 rows
BNBUSDT: 89,795 rows
SOLUSDT: 89,795 rows

✓ Total: 359,178 rows


## 3. Per-Symbol Temporal Split

In [7]:
TRAIN_RATIO, VAL_RATIO = 0.70, 0.15
train_dfs, val_dfs, test_dfs = [], [], []

for symbol, sdf in processed_by_symbol.items():
    sdf = sdf.sort_values("open_time").reset_index(drop=True)
    n = len(sdf)
    train_end = int(n * TRAIN_RATIO)
    val_end = int(n * (TRAIN_RATIO + VAL_RATIO))
    train_dfs.append(sdf.iloc[:train_end])
    val_dfs.append(sdf.iloc[train_end:val_end])
    test_dfs.append(sdf.iloc[val_end:])
    print(f"{symbol}: Train={train_end:,} | Val={val_end-train_end:,} | Test={n-val_end:,}")

train_df = pd.concat(train_dfs, ignore_index=True)
val_df = pd.concat(val_dfs, ignore_index=True)
test_df = pd.concat(test_dfs, ignore_index=True)
print(f"\n✓ Split: {len(train_df):,} / {len(val_df):,} / {len(test_df):,}")

BTCUSDT: Train=62,855 | Val=13,469 | Test=13,469
ETHUSDT: Train=62,856 | Val=13,469 | Test=13,470
BNBUSDT: Train=62,856 | Val=13,469 | Test=13,470
SOLUSDT: Train=62,856 | Val=13,469 | Test=13,470

✓ Split: 251,423 / 53,876 / 53,879


In [8]:
feature_cols = get_feature_columns(train_df)
print(f"✓ {len(feature_cols)} features")

# Prepare data
X_train_raw = train_df[feature_cols].values
y_train = train_df[TARGET_COL].values
X_val_raw = val_df[feature_cols].values
y_val = val_df[TARGET_COL].values
X_test_raw = test_df[feature_cols].values
y_test = test_df[TARGET_COL].values

# Scale (fit on train ONLY)
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)
print(f"✓ Train: {X_train.shape} | Val: {X_val.shape} | Test: {X_test.shape}")

✓ 120 features
✓ Train: (251423, 120) | Val: (53876, 120) | Test: (53879, 120)


## 4. Leakage Validation

In [9]:
print("=" * 60)
print("LEAKAGE VALIDATION")
print("=" * 60)

# Check 1: Max correlation
max_corr = max(abs(np.corrcoef(X_train_raw[:, i], y_train)[0, 1])
               for i in range(len(feature_cols))
               if not np.isnan(np.corrcoef(X_train_raw[:, i], y_train)[0, 1]))
print(f"[1] Max feature-target correlation: {max_corr:.4f}" + (" ✓" if max_corr < 0.5 else " ⚠️"))

# Check 2: Simple model accuracy
clf = RandomForestClassifier(n_estimators=20, max_depth=3, random_state=42)
clf.fit(X_train[:5000], (y_train[:5000] > 0).astype(int))
test_acc = clf.score(X_test[:2000], (y_test[:2000] > 0).astype(int))
print(f"[2] Simple model accuracy: {test_acc:.2%}" + (" ✓" if test_acc < 0.58 else " ⚠️"))

print("\n✓ LEAKAGE VALIDATION PASSED!")

LEAKAGE VALIDATION
[1] Max feature-target correlation: 0.0233 ✓
[2] Simple model accuracy: 51.90% ✓

✓ LEAKAGE VALIDATION PASSED!


## 5. Training

In [10]:
print("=" * 60)
print("TRAINING LIGHTGBM")
print("=" * 60)

lgb_params = {
    "objective": "regression",
    "metric": "mse",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1,
    "seed": 42
}

train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

start_time = time.time()
model = lgb.train(
    lgb_params, train_data, num_boost_round=2000,
    valid_sets=[train_data, val_data], valid_names=["train", "val"],
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)]
)
print(f"\n✓ Training time: {time.time() - start_time:.1f}s")
print(f"✓ Best iteration: {model.best_iteration}")

TRAINING LIGHTGBM
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2]	train's l2: 6.78948e-06	val's l2: 4.67957e-06

✓ Training time: 2.8s
✓ Best iteration: 2


## 6. Evaluation

In [11]:
# =====================================================================
# COMPREHENSIVE EVALUATION WITH OVERFITTING DETECTION
# =====================================================================
print("=" * 70)
print("COMPREHENSIVE MODEL EVALUATION")
print("=" * 70)

# Predictions on all sets
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
y_pred_test = model.predict(X_test)

def comprehensive_metrics(y_true, y_pred, set_name):
    """Calculate comprehensive metrics"""
    # Regression metrics
    mse = np.mean((y_true - y_pred) ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_true - y_pred))

    # R-squared
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    r2 = 1 - (ss_res / (ss_tot + 1e-10))

    # Information Coefficient (Spearman correlation)
    from scipy.stats import spearmanr
    ic, _ = spearmanr(y_true, y_pred)

    # Direction accuracy
    direction_acc = np.mean(np.sign(y_true) == np.sign(y_pred))

    # Trading metrics
    strategy_returns = y_true * np.sign(y_pred)
    sharpe = (np.mean(strategy_returns) / (np.std(strategy_returns) + 1e-10)) * np.sqrt(252*24*60)

    # Sortino (downside deviation)
    downside = strategy_returns[strategy_returns < 0]
    downside_std = np.std(downside) if len(downside) > 0 else 1e-10
    sortino = (np.mean(strategy_returns) / (downside_std + 1e-10)) * np.sqrt(252*24*60)

    # Maximum Drawdown
    cumulative = np.cumsum(strategy_returns)
    running_max = np.maximum.accumulate(cumulative)
    drawdowns = cumulative - running_max
    max_dd = np.min(drawdowns)

    # Profit Factor
    profits = strategy_returns[strategy_returns > 0].sum()
    losses = abs(strategy_returns[strategy_returns < 0].sum())
    profit_factor = profits / (losses + 1e-10)

    # Win Rate by quintile
    pred_quintiles = pd.qcut(y_pred, q=5, labels=False, duplicates='drop')
    quintile_returns = pd.DataFrame({'pred_q': pred_quintiles, 'ret': y_true}).groupby('pred_q')['ret'].mean()

    print(f"\n{set_name} METRICS:")
    print(f"  MSE:              {mse:.8f}")
    print(f"  RMSE:             {rmse:.8f}")
    print(f"  MAE:              {mae:.8f}")
    print(f"  R²:               {r2:.6f}")
    print(f"  IC (Spearman):    {ic:.6f}")
    print(f"  Direction Acc:    {direction_acc:.4%}")
    print(f"  Sharpe Ratio:     {sharpe:.4f}")
    print(f"  Sortino Ratio:    {sortino:.4f}")
    print(f"  Max Drawdown:     {max_dd:.6f}")
    print(f"  Profit Factor:    {profit_factor:.4f}")

    return {
        'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2, 'ic': ic,
        'direction_acc': direction_acc, 'sharpe': sharpe, 'sortino': sortino,
        'max_dd': max_dd, 'profit_factor': profit_factor
    }

train_metrics = comprehensive_metrics(y_train, y_pred_train, "TRAIN")
val_metrics = comprehensive_metrics(y_val, y_pred_val, "VALIDATION")
test_metrics = comprehensive_metrics(y_test, y_pred_test, "TEST")

# =====================================================================
# OVERFITTING DETECTION
# =====================================================================
print("\n" + "=" * 70)
print("OVERFITTING ANALYSIS")
print("=" * 70)

# Generalization gaps
train_val_gap = train_metrics['sharpe'] - val_metrics['sharpe']
val_test_gap = val_metrics['sharpe'] - test_metrics['sharpe']
train_test_gap = train_metrics['sharpe'] - test_metrics['sharpe']

print(f"\nSharpe Gaps:")
print(f"  Train-Val:  {train_val_gap:+.4f}" + (" ⚠️ OVERFITTING" if train_val_gap > 2 else " ✓"))
print(f"  Val-Test:   {val_test_gap:+.4f}" + (" ⚠️ OVERFITTING" if val_test_gap > 1 else " ✓"))
print(f"  Train-Test: {train_test_gap:+.4f}" + (" ⚠️ OVERFITTING" if train_test_gap > 3 else " ✓"))

# Direction accuracy gaps
dir_gap = train_metrics['direction_acc'] - test_metrics['direction_acc']
print(f"\nDirection Accuracy Gap: {dir_gap:+.4%}" + (" ⚠️ OVERFITTING" if dir_gap > 0.05 else " ✓"))

# R² comparison
r2_gap = train_metrics['r2'] - test_metrics['r2']
print(f"R² Gap: {r2_gap:+.6f}" + (" ⚠️ OVERFITTING" if r2_gap > 0.1 else " ✓"))

# Overall verdict
overfitting_score = sum([
    train_val_gap > 2,
    val_test_gap > 1,
    train_test_gap > 3,
    dir_gap > 0.05,
    r2_gap > 0.1
])

print(f"\n{'='*70}")
if overfitting_score == 0:
    print("✓ NO OVERFITTING DETECTED - Model generalizes well!")
elif overfitting_score <= 2:
    print("⚠️ MILD OVERFITTING - Consider regularization")
else:
    print("❌ SEVERE OVERFITTING - Model needs significant adjustment!")
print(f"{'='*70}")

# =====================================================================
# DATA LEAKAGE VALIDATION (COMPREHENSIVE)
# =====================================================================
print("\n" + "=" * 70)
print("DATA LEAKAGE VALIDATION")
print("=" * 70)

# 1. Temporal ordering check (FIXED: use <= for boundary - same timestamp can be different symbols)
train_max_time = train_df["open_time"].max()
val_min_time = val_df["open_time"].min()
test_min_time = test_df["open_time"].min()

print(f"\n[1] Temporal Ordering:")
print(f"    Train max: {train_max_time}")
print(f"    Val min:   {val_min_time}")
print(f"    Test min:  {test_min_time}")
# Use <= because boundary rows at same timestamp belong to different symbols (no overlap)
temporal_ok = train_max_time <= val_min_time and val_min_time <= test_min_time
print(f"    Status:    {'✓ CORRECT' if temporal_ok else '❌ LEAKAGE!'}")

# 2. Feature-target correlation check
print(f"\n[2] Feature-Target Correlations:")
high_corr_features = []
for i, col in enumerate(feature_cols):
    corr = np.corrcoef(X_train_raw[:, i], y_train)[0, 1]
    if not np.isnan(corr) and abs(corr) > 0.3:
        high_corr_features.append((col, corr))

if high_corr_features:
    print("    ⚠️ High correlation features (potential leakage):")
    for feat, corr in sorted(high_corr_features, key=lambda x: abs(x[1]), reverse=True)[:5]:
        print(f"       {feat}: {corr:.4f}")
else:
    print("    ✓ No suspicious correlations detected")

# 3. Performance sanity check
print(f"\n[3] Performance Sanity Check:")
if test_metrics['direction_acc'] > 0.55:
    print(f"    ⚠️ Direction accuracy {test_metrics['direction_acc']:.2%} > 55% - verify no leakage")
else:
    print(f"    ✓ Direction accuracy {test_metrics['direction_acc']:.2%} is realistic")

if abs(test_metrics['sharpe']) > 3:
    print(f"    ⚠️ Sharpe {test_metrics['sharpe']:.2f} > 3 - unusually high")
else:
    print(f"    ✓ Sharpe {test_metrics['sharpe']:.2f} is realistic")

# 4. Feature stationarity (quick check)
print(f"\n[4] Feature Distribution Stability:")
stable_count = 0
for i in range(min(10, len(feature_cols))):
    train_mean = np.mean(X_train_raw[:, i])
    test_mean = np.mean(X_test_raw[:, i])
    diff = abs(train_mean - test_mean) / (abs(train_mean) + 1e-10)
    if diff < 0.5:
        stable_count += 1
print(f"    {stable_count}/10 features stable across train/test ✓")

print(f"\n{'='*70}")
print("✓ DATA LEAKAGE VALIDATION COMPLETE")
print(f"{'='*70}")

COMPREHENSIVE MODEL EVALUATION

TRAIN METRICS:
  MSE:              0.00000679
  RMSE:             0.00260566
  MAE:              0.00152536
  R²:               0.005918
  IC (Spearman):    0.049571
  Direction Acc:    50.5097%
  Sharpe Ratio:     15.5127
  Sortino Ratio:    21.5511
  Max Drawdown:     -1.855314
  Profit Factor:    1.0922

VALIDATION METRICS:
  MSE:              0.00000468
  RMSE:             0.00216323
  MAE:              0.00119106
  R²:               -0.000521
  IC (Spearman):    0.011924
  Direction Acc:    49.5787%
  Sharpe Ratio:     -5.3851
  Sortino Ratio:    -6.5998
  Max Drawdown:     -1.508752
  Profit Factor:    0.9680

TEST METRICS:
  MSE:              0.00000446
  RMSE:             0.00211081
  MAE:              0.00115184
  R²:               0.000514
  IC (Spearman):    0.009474
  Direction Acc:    50.0084%
  Sharpe Ratio:     1.3329
  Sortino Ratio:    1.6059
  Max Drawdown:     -0.544645
  Profit Factor:    1.0081

OVERFITTING ANALYSIS

Sharpe Gaps:
  T

In [12]:
# Top 20 features
importance = pd.DataFrame({
    "feature": feature_cols,
    "importance": model.feature_importance(importance_type="gain")
}).sort_values("importance", ascending=False)

print("\nTOP 20 FEATURES:")
print(importance.head(20).to_string(index=False))


TOP 20 FEATURES:
         feature  importance
     skewness_20    0.015088
      return_200    0.012080
       ofi_ma_10    0.009447
price_zscore_100    0.006702
        hour_sin    0.005231
    volume_ma_50    0.004503
  volatility_100    0.002998
    trades_ma_20    0.002782
    trades_ma_10    0.002589
         dow_sin    0.002088
       ofi_ma_20    0.002069
     cvd_norm_10    0.001899
     skewness_50    0.001865
   volume_ma_100    0.001851
     ma_dist_200    0.001793
       gk_vol_20    0.001600
          atr_14    0.001485
parkinson_vol_20    0.001369
     volume_ma_5    0.001250
          cvd_10    0.001239


## 7. Export ONNX

In [13]:
import onnx
from onnxmltools import convert_lightgbm
from onnxmltools.convert.common.data_types import FloatTensorType

# Use opset 15 (compatible with Colab's onnx version)
initial_types = [("input", FloatTensorType([None, len(feature_cols)]))]
onnx_model = convert_lightgbm(model, initial_types=initial_types, target_opset=15)

# Save directly to trained/ directory
onnx_path = TRAINED_DIR / "lightgbm_model.onnx"
onnx.save_model(onnx_model, str(onnx_path))
onnx.checker.check_model(onnx.load(str(onnx_path)))
print(f"✓ ONNX saved: {onnx_path}")

# Save comprehensive metadata
metadata = {
    "model_type": "lightgbm",
    "num_features": len(feature_cols),
    "feature_names": feature_cols,
    "onnx_opset": 15,
    "train_metrics": train_metrics,
    "val_metrics": val_metrics,
    "test_metrics": test_metrics,
    "overfitting_analysis": {
        "train_val_sharpe_gap": float(train_val_gap),
        "val_test_sharpe_gap": float(val_test_gap),
        "train_test_sharpe_gap": float(train_test_gap),
        "direction_acc_gap": float(dir_gap),
        "r2_gap": float(r2_gap),
        "overfitting_score": int(overfitting_score)
    },
    "anti_leakage": {
        "temporal_ordering_valid": bool(temporal_ok),
        "train_max_time": str(train_max_time),
        "val_min_time": str(val_min_time),
        "test_min_time": str(test_min_time)
    }
}
with open(TRAINED_DIR / "lightgbm_metadata.json", "w") as f:
    json.dump(metadata, f, indent=2, default=str)

print("\n✓ LIGHTGBM TRAINING COMPLETE!")

✓ ONNX saved: trained/lightgbm_model.onnx

✓ LIGHTGBM TRAINING COMPLETE!
