# XGBoost Training Pipeline (GPU)

## ⚙️ Runtime: T4 GPU (~0.28 CU)
**Menu: Runtime → Change runtime type → T4 GPU**

## Anti-Leakage Guarantees
1. **Per-Symbol Temporal Split** - Each symbol split independently (70/15/15)
2. **Scaler Fit on Train Only**
3. **Backward-Looking Features**

## Output
- `trained/xgboost_model.onnx`
- `trained/xgboost_metadata.json`

In [1]:
!nvidia-smi
!pip install -q xgboost onnx onnxruntime-gpu onnxmltools requests
print("✓ Dependencies installed!")

Sun Dec 21 23:48:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import requests
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from pathlib import Path
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
import json, time, warnings
warnings.filterwarnings('ignore')

TRAINED_DIR = Path("trained")
TRAINED_DIR.mkdir(parents=True, exist_ok=True)
print("✓ Setup complete!")

In [3]:
def fetch_klines_sync(symbol, days=90):
    base_url = "https://api.binance.com/api/v3/klines"
    end_time = datetime.utcnow()
    start_time = end_time - timedelta(days=days)
    all_data = []
    current = start_time
    while current < end_time:
        params = {"symbol": symbol, "interval": "1m",
                  "startTime": int(current.timestamp()*1000),
                  "endTime": int(min(current+timedelta(days=1), end_time).timestamp()*1000), "limit": 1440}
        try:
            resp = requests.get(base_url, params=params, timeout=30)
            data = resp.json()
            if isinstance(data, list): all_data.extend(data)
        except: pass
        current += timedelta(days=1)
        time.sleep(0.1)
    if not all_data: return pd.DataFrame()
    cols = ["open_time","open","high","low","close","volume","close_time","quote_volume","trades","taker_buy_base","taker_buy_quote","ignore"]
    df = pd.DataFrame(all_data, columns=cols)
    df["open_time"] = pd.to_datetime(df["open_time"], unit="ms")
    for c in ["open","high","low","close","volume","quote_volume","taker_buy_base","taker_buy_quote","trades"]: df[c] = pd.to_numeric(df[c], errors="coerce")
    df["symbol"] = symbol
    return df.drop_duplicates(subset=["open_time"]).sort_values("open_time").reset_index(drop=True)

def calculate_comprehensive_features(df):
    """Calculate ~150 institutional-grade crypto features"""
    df = df.copy()
    ann_factor = np.sqrt(252 * 24 * 60)

    # 1. RETURNS & PRICE ACTION
    df["log_return"] = np.log(df["close"] / df["close"].shift(1))
    df["return_1"] = df["close"].pct_change(1)
    for w in [5, 10, 20, 50, 100, 200]:
        df[f"return_{w}"] = df["close"].pct_change(w)
    for w in [20, 50]:
        vol = df["log_return"].rolling(w).std()
        df[f"sharpe_{w}"] = df[f"return_{w}"] / (vol * np.sqrt(w) + 1e-10)

    # 2. VOLATILITY (multiple estimators)
    for w in [5, 10, 20, 50, 100]:
        df[f"volatility_{w}"] = df["log_return"].rolling(w).std() * ann_factor
    for w in [20, 50]:
        log_hl = np.log(df["high"] / df["low"])
        df[f"parkinson_vol_{w}"] = np.sqrt((1/(4*np.log(2))) * (log_hl**2).rolling(w).mean()) * ann_factor
        log_co = np.log(df["close"] / df["open"])
        gk = 0.5 * log_hl**2 - (2*np.log(2) - 1) * log_co**2
        df[f"gk_vol_{w}"] = np.sqrt(gk.rolling(w).mean().abs()) * ann_factor
    for w in [14, 20, 50]:
        tr = pd.concat([df["high"] - df["low"], abs(df["high"] - df["close"].shift(1)), abs(df["low"] - df["close"].shift(1))], axis=1).max(axis=1)
        df[f"atr_{w}"] = tr.rolling(w).mean()
        df[f"atr_pct_{w}"] = df[f"atr_{w}"] / df["close"] * 100
    df["vol_regime"] = df["volatility_20"] / (df["volatility_100"] + 1e-10)

    # 3. VOLUME (CVD, VWAP, trades)
    for w in [5, 10, 20, 50]:
        df[f"volume_ma_{w}"] = df["volume"].rolling(w).mean()
    df["rvol_20"] = df["volume"] / (df["volume"].rolling(20).mean() + 1e-10)
    df["volume_zscore"] = (df["volume"] - df["volume"].rolling(50).mean()) / (df["volume"].rolling(50).std() + 1e-10)
    typical_price = (df["high"] + df["low"] + df["close"]) / 3
    for w in [20, 50]:
        cum_vol = df["volume"].rolling(w).sum()
        cum_tp_vol = (typical_price * df["volume"]).rolling(w).sum()
        df[f"vwap_dist_{w}"] = (df["close"] - cum_tp_vol/(cum_vol+1e-10)) / (cum_tp_vol/(cum_vol+1e-10)+1e-10) * 100
    volume_delta = df["taker_buy_base"] - (df["volume"] - df["taker_buy_base"])
    for w in [10, 20, 50]:
        df[f"cvd_{w}"] = volume_delta.rolling(w).sum()
        df[f"cvd_norm_{w}"] = df[f"cvd_{w}"] / (df["volume"].rolling(w).sum() + 1e-10)
    df["dollar_vol_ratio"] = df["quote_volume"] / (df["quote_volume"].rolling(20).mean() + 1e-10)
    if "trades" in df.columns:
        df["trades"] = pd.to_numeric(df["trades"], errors="coerce")
        for w in [10, 20]:
            df[f"trades_ma_{w}"] = df["trades"].rolling(w).mean()
        df["trades_zscore"] = (df["trades"] - df["trades"].rolling(50).mean()) / (df["trades"].rolling(50).std() + 1e-10)
        df["avg_trade_size"] = df["volume"] / (df["trades"] + 1)

    # 4. MICROSTRUCTURE
    df["spread_bps"] = (df["high"] - df["low"]) / df["close"] * 10000
    df["ofi"] = df["taker_buy_base"] / (df["volume"] + 1e-10)
    for w in [10, 20, 50]:
        df[f"buy_pressure_{w}"] = df["taker_buy_base"].rolling(w).sum() / (df["volume"].rolling(w).sum() + 1e-10)
    df["amihud"] = abs(df["return_1"]) / (df["quote_volume"] / 1e6 + 1e-10)

    # 5. MOMENTUM (MACD, RSI, ADX, etc.)
    for w in [5, 10, 20, 50, 100]:
        df[f"ma_dist_{w}"] = (df["close"] - df["close"].rolling(w).mean()) / df["close"].rolling(w).mean() * 100
    ema12 = df["close"].ewm(span=12, adjust=False).mean()
    ema26 = df["close"].ewm(span=26, adjust=False).mean()
    df["macd"] = ema12 - ema26
    df["macd_signal"] = df["macd"].ewm(span=9, adjust=False).mean()
    df["macd_hist"] = df["macd"] - df["macd_signal"]
    for w in [7, 14, 21]:
        delta = df["close"].diff()
        gain = delta.where(delta > 0, 0).rolling(w).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(w).mean()
        df[f"rsi_{w}"] = 100 - (100 / (1 + gain/(loss+1e-10)))
        df[f"rsi_{w}_norm"] = (df[f"rsi_{w}"] - 50) / 50
    rsi14 = df["rsi_14"]
    rsi_min, rsi_max = rsi14.rolling(14).min(), rsi14.rolling(14).max()
    df["stoch_rsi"] = (rsi14 - rsi_min) / (rsi_max - rsi_min + 1e-10)
    for w in [14, 21]:
        highest, lowest = df["high"].rolling(w).max(), df["low"].rolling(w).min()
        df[f"williams_r_{w}"] = -100 * (highest - df["close"]) / (highest - lowest + 1e-10)
    for w in [14, 20]:
        plus_dm = df["high"].diff().where(lambda x: x > 0, 0)
        minus_dm = (-df["low"].diff()).where(lambda x: x > 0, 0)
        tr = pd.concat([df["high"]-df["low"], abs(df["high"]-df["close"].shift(1)), abs(df["low"]-df["close"].shift(1))], axis=1).max(axis=1)
        atr = tr.rolling(w).mean()
        plus_di = 100 * (plus_dm.rolling(w).mean() / (atr + 1e-10))
        minus_di = 100 * (minus_dm.rolling(w).mean() / (atr + 1e-10))
        df[f"adx_{w}"] = (100 * abs(plus_di - minus_di) / (plus_di + minus_di + 1e-10)).rolling(w).mean()
    tp = (df["high"] + df["low"] + df["close"]) / 3
    df["cci_20"] = (tp - tp.rolling(20).mean()) / (0.015 * tp.rolling(20).std() + 1e-10)

    # 6. MEAN REVERSION (Bollinger, z-scores)
    for w in [20, 50]:
        ma, std = df["close"].rolling(w).mean(), df["close"].rolling(w).std()
        df[f"bb_width_{w}"] = (4 * std) / ma * 100
        df[f"bb_position_{w}"] = (df["close"] - (ma - 2*std)) / (4*std + 1e-10)
        df[f"price_zscore_{w}"] = (df["close"] - ma) / (std + 1e-10)

    # 7. TIME FEATURES
    hour = df["open_time"].dt.hour
    dow = df["open_time"].dt.dayofweek
    df["hour_sin"] = np.sin(2 * np.pi * hour / 24)
    df["hour_cos"] = np.cos(2 * np.pi * hour / 24)
    df["dow_sin"] = np.sin(2 * np.pi * dow / 7)
    df["dow_cos"] = np.cos(2 * np.pi * dow / 7)
    df["is_asia"] = ((hour >= 0) & (hour < 8)).astype(int)
    df["is_europe"] = ((hour >= 7) & (hour < 16)).astype(int)
    df["is_us"] = ((hour >= 13) & (hour < 22)).astype(int)
    df["is_weekend"] = (dow >= 5).astype(int)

    # 8. STATISTICAL
    for w in [20, 50]:
        df[f"skewness_{w}"] = df["log_return"].rolling(w).skew()
        df[f"kurtosis_{w}"] = df["log_return"].rolling(w).kurt()

    # 9. PRICE PATTERNS
    for w in [20, 50, 100]:
        highest, lowest = df["high"].rolling(w).max(), df["low"].rolling(w).min()
        df[f"dist_from_high_{w}"] = (df["close"] - highest) / highest * 100
        df[f"dist_from_low_{w}"] = (df["close"] - lowest) / lowest * 100
        df[f"range_position_{w}"] = (df["close"] - lowest) / (highest - lowest + 1e-10)

    return df

def get_feature_columns(df):
    exclude = ["open_time","close_time","symbol","ignore","open","high","low","close","volume","quote_volume","trades","taker_buy_base","taker_buy_quote"]
    return [c for c in df.columns if c not in exclude and not c.startswith("target_")]

In [4]:
SYMBOLS = ["BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT"]
print("Collecting data...")
all_data = []
for sym in tqdm(SYMBOLS):
    df = fetch_klines_sync(sym, days=90)
    if len(df) > 0:
        all_data.append(df)
        print(f"  ✓ {sym}: {len(df):,} rows")
    else:
        print(f"  ✗ {sym}: FAILED")

if not all_data: raise ValueError("No data!")
raw_data = pd.concat(all_data, ignore_index=True)
print(f"\n✓ Total: {len(raw_data):,} rows")

Collecting data...


  0%|          | 0/4 [00:00<?, ?it/s]

  ✓ BTCUSDT: 90,000 rows
  ✓ ETHUSDT: 90,000 rows
  ✓ BNBUSDT: 90,000 rows
  ✓ SOLUSDT: 90,000 rows

✓ Total: 360,000 rows


In [5]:
# Per-symbol split with comprehensive features
TARGET_COL = "target_return_5"
train_dfs, val_dfs, test_dfs = [], [], []

for sym in raw_data["symbol"].unique():
    sdf = raw_data[raw_data["symbol"]==sym].copy().sort_values("open_time")
    sdf = calculate_comprehensive_features(sdf)  # ~150 features
    sdf[TARGET_COL] = sdf["close"].shift(-5)/sdf["close"] - 1
    sdf = sdf.replace([np.inf,-np.inf], np.nan).iloc[200:].dropna()  # Extended warmup
    n = len(sdf)
    train_end, val_end = int(n*0.70), int(n*0.85)
    train_dfs.append(sdf.iloc[:train_end])
    val_dfs.append(sdf.iloc[train_end:val_end])
    test_dfs.append(sdf.iloc[val_end:])
    print(f"{sym}: {len(sdf):,} rows")

train_df = pd.concat(train_dfs, ignore_index=True)
val_df = pd.concat(val_dfs, ignore_index=True)
test_df = pd.concat(test_dfs, ignore_index=True)
print(f"\n✓ Split: {len(train_df):,}/{len(val_df):,}/{len(test_df):,}")
print(f"✓ Features: {len(get_feature_columns(train_df))}")

BTCUSDT: 89,793 rows
ETHUSDT: 89,795 rows
BNBUSDT: 89,795 rows
SOLUSDT: 89,795 rows

✓ Split: 251,423/53,876/53,879
✓ Features: 98


In [6]:
# Prepare data
feature_cols = get_feature_columns(train_df)
scaler = RobustScaler()
X_train = scaler.fit_transform(train_df[feature_cols].values)
X_val = scaler.transform(val_df[feature_cols].values)
X_test = scaler.transform(test_df[feature_cols].values)
y_train = train_df[TARGET_COL].values
y_val = val_df[TARGET_COL].values
y_test = test_df[TARGET_COL].values
print(f"Features: {len(feature_cols)} | Train: {X_train.shape}")

Features: 98 | Train: (251423, 98)


In [7]:
print("=" * 60)
print("TRAINING XGBOOST (GPU)")
print("=" * 60)

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "tree_method": "hist",
    "device": "cuda",
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.9,
    "seed": 42
}

start_time = time.time()
model = xgb.train(params, dtrain, num_boost_round=2000,
                  evals=[(dtrain, "train"), (dval, "val")],
                  early_stopping_rounds=100, verbose_eval=200)
train_time = time.time() - start_time
print(f"\n✓ Training time: {train_time:.1f}s")

TRAINING XGBOOST (GPU)
[0]	train-rmse:0.00260	val-rmse:0.00209
[100]	train-rmse:0.00230	val-rmse:0.00211

✓ Training time: 1.3s


In [None]:
# =====================================================================
# COMPREHENSIVE EVALUATION WITH OVERFITTING DETECTION
# =====================================================================
print("=" * 70)
print("COMPREHENSIVE MODEL EVALUATION")
print("=" * 70)

from scipy.stats import spearmanr

# Create DMatrix for test set (FIXED: was using undefined 'dtest')
dtest = xgb.DMatrix(X_test, label=y_test)

# Predictions on all sets
y_pred_train = model.predict(xgb.DMatrix(X_train))
y_pred_val = model.predict(xgb.DMatrix(X_val))
y_pred_test = model.predict(dtest)

def comprehensive_metrics(y_true, y_pred, set_name):
    """Calculate comprehensive metrics"""
    mse = np.mean((y_true - y_pred) ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_true - y_pred))
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    r2 = 1 - (ss_res / (ss_tot + 1e-10))
    ic, _ = spearmanr(y_true, y_pred)
    direction_acc = np.mean(np.sign(y_true) == np.sign(y_pred))
    strategy_returns = y_true * np.sign(y_pred)
    sharpe = (np.mean(strategy_returns) / (np.std(strategy_returns) + 1e-10)) * np.sqrt(252*24*60)
    downside = strategy_returns[strategy_returns < 0]
    downside_std = np.std(downside) if len(downside) > 0 else 1e-10
    sortino = (np.mean(strategy_returns) / (downside_std + 1e-10)) * np.sqrt(252*24*60)
    cumulative = np.cumsum(strategy_returns)
    running_max = np.maximum.accumulate(cumulative)
    max_dd = np.min(cumulative - running_max)
    profits = strategy_returns[strategy_returns > 0].sum()
    losses = abs(strategy_returns[strategy_returns < 0].sum())
    profit_factor = profits / (losses + 1e-10)

    print(f"\n{set_name} METRICS:")
    print(f"  MSE: {mse:.8f} | RMSE: {rmse:.8f} | MAE: {mae:.8f}")
    print(f"  R²: {r2:.6f} | IC: {ic:.6f} | Dir Acc: {direction_acc:.4%}")
    print(f"  Sharpe: {sharpe:.4f} | Sortino: {sortino:.4f} | MaxDD: {max_dd:.6f}")

    return {'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2, 'ic': ic,
            'direction_acc': direction_acc, 'sharpe': sharpe, 'sortino': sortino,
            'max_dd': max_dd, 'profit_factor': profit_factor}

train_metrics = comprehensive_metrics(y_train, y_pred_train, "TRAIN")
val_metrics = comprehensive_metrics(y_val, y_pred_val, "VALIDATION")
test_metrics = comprehensive_metrics(y_test, y_pred_test, "TEST")

# OVERFITTING DETECTION
print("\n" + "=" * 70)
print("OVERFITTING ANALYSIS")
print("=" * 70)

train_val_gap = train_metrics['sharpe'] - val_metrics['sharpe']
val_test_gap = val_metrics['sharpe'] - test_metrics['sharpe']
train_test_gap = train_metrics['sharpe'] - test_metrics['sharpe']
dir_gap = train_metrics['direction_acc'] - test_metrics['direction_acc']
r2_gap = train_metrics['r2'] - test_metrics['r2']

print(f"\nSharpe Gaps: Train-Val={train_val_gap:+.2f} | Val-Test={val_test_gap:+.2f} | Train-Test={train_test_gap:+.2f}")
print(f"Dir Acc Gap: {dir_gap:+.4%} | R² Gap: {r2_gap:+.6f}")

overfitting_score = sum([train_val_gap > 2, val_test_gap > 1, train_test_gap > 3, dir_gap > 0.05, r2_gap > 0.1])
if overfitting_score == 0: print("\n✓ NO OVERFITTING DETECTED")
elif overfitting_score <= 2: print("\n⚠️ MILD OVERFITTING")
else: print("\n❌ SEVERE OVERFITTING")

# DATA LEAKAGE VALIDATION
print("\n" + "=" * 70)
print("DATA LEAKAGE VALIDATION")
print("=" * 70)
train_max = train_df["open_time"].max()
val_min = val_df["open_time"].min()
test_min = test_df["open_time"].min()
# Use <= for boundary check (rows at same timestamp are different symbols)
temporal_ok = train_max <= val_min and val_min <= test_min
print(f"Temporal Order: Train<=Val<=Test = {'✓ CORRECT' if temporal_ok else '❌ LEAKAGE'}")
print(f"Dir Acc {test_metrics['direction_acc']:.2%}: {'✓ Realistic' if test_metrics['direction_acc'] < 0.55 else '⚠️ High'}")
print(f"Sharpe {test_metrics['sharpe']:.2f}: {'✓ Realistic' if abs(test_metrics['sharpe']) < 3 else '⚠️ High'}")

In [None]:
# Export ONNX with opset 15
import onnx
from onnxmltools import convert_xgboost
from onnxmltools.convert.common.data_types import FloatTensorType

initial_types = [("input", FloatTensorType([None, len(feature_cols)]))]
onnx_model = convert_xgboost(model, initial_types=initial_types, target_opset=15)

# Save directly to trained/ directory
onnx_path = TRAINED_DIR / "xgboost_model.onnx"
onnx.save_model(onnx_model, str(onnx_path))
onnx.checker.check_model(onnx.load(str(onnx_path)))
print(f"✓ ONNX saved: {onnx_path}")

metadata = {
    "model_type": "xgboost", "num_features": len(feature_cols), "onnx_opset": 15,
    "train_metrics": train_metrics, "val_metrics": val_metrics, "test_metrics": test_metrics,
    "overfitting_score": int(overfitting_score),
    "temporal_ordering_valid": bool(temporal_ok)
}
with open(TRAINED_DIR / "xgboost_metadata.json", "w") as f:
    json.dump(metadata, f, indent=2, default=str)
print("\n✓ XGBOOST TRAINING COMPLETE!")