# D4PG + EVT Training Pipeline

## ⚙️ Runtime: L4 GPU (~12 CU)
**Menu: Runtime → Change runtime type → L4 GPU**

## Anti-Leakage Guarantees
1. **Per-Symbol Temporal Split** - Each symbol split independently
2. **RL Trained on Train Data ONLY** - No information from val/test
3. **EVT Risk Model** - Trained on training period returns only
4. **Realistic Episode Structure** - Proper environment reset

## Output
- `trained/d4pg_actor.onnx`
- `trained/d4pg_actor.pt`
- `trained/d4pg_metadata.json`

## Note
Training takes 3-4 hours due to 200 episodes x 250k+ steps per episode

In [None]:
# Show the attached GPU through the NVIDIA CLI (IPython shell escape; only valid in a notebook).
!nvidia-smi
import torch
# Report the PyTorch version and whether this build can see a CUDA device.
print(f"PyTorch: {torch.__version__}, CUDA: {torch.cuda.is_available()}")
# Only query the device name when CUDA is actually available.
if torch.cuda.is_available(): print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Quietly install the packages this notebook uses (IPython shell escape; only valid in a notebook).
!pip install -q torch onnx onnxruntime-gpu pandas numpy scikit-learn scipy requests tqdm
print("‚úì Dependencies installed!")

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import RobustScaler
from scipy import stats
from collections import deque
from pathlib import Path
import json, time, random, warnings
warnings.filterwarnings('ignore')

# Directory that receives every artifact written by this notebook (created if missing).
TRAINED_DIR = Path("trained")
TRAINED_DIR.mkdir(parents=True, exist_ok=True)
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {DEVICE}")

In [None]:
import requests
from datetime import datetime, timedelta
from tqdm.notebook import tqdm

def fetch_klines_sync(symbol, days=90):
    """Download 1-minute Binance spot klines for ``symbol`` over the last ``days`` days.

    The history is requested page by page. Each page spans 1000 minutes because
    the klines endpoint returns at most 1000 rows per request; asking for a full
    day (1440 rows) in one call silently loses the tail of every day.

    Failed pages are skipped (best effort) and reported once at the end instead
    of being swallowed silently.

    Args:
        symbol: Binance symbol, e.g. ``"BTCUSDT"``.
        days: Length of the look-back window in days.

    Returns:
        DataFrame with one row per kline, sorted by ``open_time`` (naive UTC
        timestamps) and de-duplicated, plus a ``symbol`` column. An empty
        DataFrame when nothing could be downloaded.
    """
    # Local import: the file-level import only brings in datetime/timedelta.
    from datetime import timezone

    base_url = "https://api.binance.com/api/v3/klines"
    page_size = 1000  # documented maximum for the `limit` parameter
    page_span = timedelta(minutes=page_size)

    # Timezone-aware "now": calling .timestamp() on a naive utcnow() value would
    # be interpreted as local time and shift the window on non-UTC hosts.
    end_time = datetime.now(timezone.utc)
    start_time = end_time - timedelta(days=days)

    all_data = []
    failed_pages = 0
    current = start_time
    while current < end_time:
        page_end = min(current + page_span, end_time)
        params = {
            "symbol": symbol,
            "interval": "1m",
            "startTime": int(current.timestamp() * 1000),
            # -1 ms so the candle opening exactly at page_end belongs to the next page
            "endTime": int(page_end.timestamp() * 1000) - 1,
            "limit": page_size,
        }
        try:
            resp = requests.get(base_url, params=params, timeout=30)
            data = resp.json()
        except (requests.RequestException, ValueError):
            # Network problem or non-JSON body: skip this page, keep going.
            failed_pages += 1
        else:
            if isinstance(data, list):
                all_data.extend(data)
            else:
                # API error payloads are JSON objects, not lists.
                failed_pages += 1
        current = page_end
        time.sleep(0.1)  # stay well below the public rate limit

    if failed_pages:
        print(f"  ! {symbol}: {failed_pages} kline request(s) failed or returned an error payload")
    if not all_data:
        return pd.DataFrame()

    cols = ["open_time", "open", "high", "low", "close", "volume", "close_time",
            "quote_volume", "trades", "taker_buy_base", "taker_buy_quote", "ignore"]
    df = pd.DataFrame(all_data, columns=cols)
    df["open_time"] = pd.to_datetime(df["open_time"], unit="ms")
    numeric_cols = ["open", "high", "low", "close", "volume", "quote_volume",
                    "taker_buy_base", "taker_buy_quote"]
    for c in numeric_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    df["symbol"] = symbol
    return df.drop_duplicates(subset=["open_time"]).sort_values("open_time")

def calculate_comprehensive_features(df):
    """Return a copy of ``df`` with 94 derived feature columns appended.

    The input must provide ``open``, ``high``, ``low``, ``close``, ``volume``,
    ``quote_volume``, ``taker_buy_base`` and a datetime-typed ``open_time``
    column (the ``.dt`` accessor is used on it). The input frame itself is not
    modified.

    All rolling windows are counted in rows, so leading rows contain NaN until
    the longest window (200 rows) has filled; callers are expected to drop them.
    Columns are appended in the order they are computed below; downstream code
    derives the feature order from the column order, so do not reorder sections.
    """
    df = df.copy()
    # Volatility scaling factor: sqrt(252 days * 24 h * 60 min).
    # NOTE(review): presumably assumes one row per minute - confirm against the caller.
    ann_factor = np.sqrt(252 * 24 * 60)

    # 1. RETURNS & PRICE ACTION
    # One-row log and simple returns, multi-row simple returns, and a
    # return / (rolling volatility * sqrt(window)) ratio.
    df["log_return"] = np.log(df["close"] / df["close"].shift(1))
    df["return_1"] = df["close"].pct_change(1)
    for w in [5, 10, 20, 50, 100, 200]:
        df[f"return_{w}"] = df["close"].pct_change(w)
    for w in [20, 50]:
        vol = df["log_return"].rolling(w).std()
        df[f"sharpe_{w}"] = df[f"return_{w}"] / (vol * np.sqrt(w) + 1e-10)

    # 2. VOLATILITY (multiple estimators)
    # Close-to-close rolling standard deviation, scaled by ann_factor.
    for w in [5, 10, 20, 50, 100]:
        df[f"volatility_{w}"] = df["log_return"].rolling(w).std() * ann_factor
    for w in [20, 50]:
        # Range-based (Parkinson) estimator from log(high/low).
        log_hl = np.log(df["high"] / df["low"])
        df[f"parkinson_vol_{w}"] = np.sqrt((1/(4*np.log(2))) * (log_hl**2).rolling(w).mean()) * ann_factor
        # Garman-Klass-style estimator; abs() keeps the sqrt argument non-negative.
        log_co = np.log(df["close"] / df["open"])
        gk = 0.5 * log_hl**2 - (2*np.log(2) - 1) * log_co**2
        df[f"gk_vol_{w}"] = np.sqrt(gk.rolling(w).mean().abs()) * ann_factor
    for w in [14, 20, 50]:
        # True range = max(high-low, |high-prev close|, |low-prev close|); ATR is its simple rolling mean.
        tr = pd.concat([df["high"] - df["low"], abs(df["high"] - df["close"].shift(1)), abs(df["low"] - df["close"].shift(1))], axis=1).max(axis=1)
        df[f"atr_{w}"] = tr.rolling(w).mean()
        df[f"atr_pct_{w}"] = df[f"atr_{w}"] / df["close"] * 100
    # Short-window volatility relative to long-window volatility.
    df["vol_regime"] = df["volatility_20"] / (df["volatility_100"] + 1e-10)

    # 3. VOLUME (CVD, VWAP, trades)
    for w in [5, 10, 20, 50]:
        df[f"volume_ma_{w}"] = df["volume"].rolling(w).mean()
    # Volume relative to its 20-row mean, and as a 50-row z-score.
    df["rvol_20"] = df["volume"] / (df["volume"].rolling(20).mean() + 1e-10)
    df["volume_zscore"] = (df["volume"] - df["volume"].rolling(50).mean()) / (df["volume"].rolling(50).std() + 1e-10)
    typical_price = (df["high"] + df["low"] + df["close"]) / 3
    for w in [20, 50]:
        # Percent distance of close from the rolling volume-weighted typical price.
        cum_vol = df["volume"].rolling(w).sum()
        cum_tp_vol = (typical_price * df["volume"]).rolling(w).sum()
        df[f"vwap_dist_{w}"] = (df["close"] - cum_tp_vol/(cum_vol+1e-10)) / (cum_tp_vol/(cum_vol+1e-10)+1e-10) * 100
    # Taker-buy volume minus the remaining volume (buy volume - sell volume per row).
    volume_delta = df["taker_buy_base"] - (df["volume"] - df["taker_buy_base"])
    for w in [10, 20, 50]:
        # Rolling sum of the volume delta, raw and as a fraction of rolling volume.
        df[f"cvd_{w}"] = volume_delta.rolling(w).sum()
        df[f"cvd_norm_{w}"] = df[f"cvd_{w}"] / (df["volume"].rolling(w).sum() + 1e-10)
    df["dollar_vol_ratio"] = df["quote_volume"] / (df["quote_volume"].rolling(20).mean() + 1e-10)

    # 4. MICROSTRUCTURE
    # High-low range in basis points of close (a bar-range proxy; no order-book data is used here).
    df["spread_bps"] = (df["high"] - df["low"]) / df["close"] * 10000
    # Taker-buy share of volume for the row, and over rolling windows.
    df["ofi"] = df["taker_buy_base"] / (df["volume"] + 1e-10)
    for w in [10, 20, 50]:
        df[f"buy_pressure_{w}"] = df["taker_buy_base"].rolling(w).sum() / (df["volume"].rolling(w).sum() + 1e-10)
    # Absolute return per million units of quote volume.
    df["amihud"] = abs(df["return_1"]) / (df["quote_volume"] / 1e6 + 1e-10)

    # 5. MOMENTUM (MACD, RSI, ADX, etc.)
    # Percent distance of close from its simple moving average.
    for w in [5, 10, 20, 50, 100]:
        df[f"ma_dist_{w}"] = (df["close"] - df["close"].rolling(w).mean()) / df["close"].rolling(w).mean() * 100
    # MACD with spans 12/26 and a span-9 signal line.
    ema12 = df["close"].ewm(span=12, adjust=False).mean()
    ema26 = df["close"].ewm(span=26, adjust=False).mean()
    df["macd"] = ema12 - ema26
    df["macd_signal"] = df["macd"].ewm(span=9, adjust=False).mean()
    df["macd_hist"] = df["macd"] - df["macd_signal"]
    for w in [7, 14, 21]:
        # RSI built from simple rolling means of gains and losses (no exponential smoothing).
        delta = df["close"].diff()
        gain = delta.where(delta > 0, 0).rolling(w).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(w).mean()
        df[f"rsi_{w}"] = 100 - (100 / (1 + gain/(loss+1e-10)))
        # Rescale RSI from [0, 100] to [-1, 1].
        df[f"rsi_{w}_norm"] = (df[f"rsi_{w}"] - 50) / 50
    # Position of RSI(14) inside its own 14-row min/max range.
    rsi14 = df["rsi_14"]
    rsi_min, rsi_max = rsi14.rolling(14).min(), rsi14.rolling(14).max()
    df["stoch_rsi"] = (rsi14 - rsi_min) / (rsi_max - rsi_min + 1e-10)
    for w in [14, 21]:
        highest, lowest = df["high"].rolling(w).max(), df["low"].rolling(w).min()
        df[f"williams_r_{w}"] = -100 * (highest - df["close"]) / (highest - lowest + 1e-10)
    for w in [14, 20]:
        # Directional movement: positive part of the high diff and of the negated low diff.
        # The two are not compared against each other before being averaged.
        plus_dm = df["high"].diff().where(lambda x: x > 0, 0)
        minus_dm = (-df["low"].diff()).where(lambda x: x > 0, 0)
        tr = pd.concat([df["high"]-df["low"], abs(df["high"]-df["close"].shift(1)), abs(df["low"]-df["close"].shift(1))], axis=1).max(axis=1)
        atr = tr.rolling(w).mean()
        plus_di = 100 * (plus_dm.rolling(w).mean() / (atr + 1e-10))
        minus_di = 100 * (minus_dm.rolling(w).mean() / (atr + 1e-10))
        # ADX = simple rolling mean of the directional index.
        df[f"adx_{w}"] = (100 * abs(plus_di - minus_di) / (plus_di + minus_di + 1e-10)).rolling(w).mean()
    # CCI variant: deviation of typical price from its mean, scaled by 0.015 * rolling std.
    tp = (df["high"] + df["low"] + df["close"]) / 3
    df["cci_20"] = (tp - tp.rolling(20).mean()) / (0.015 * tp.rolling(20).std() + 1e-10)

    # 6. MEAN REVERSION (Bollinger, z-scores)
    for w in [20, 50]:
        ma, std = df["close"].rolling(w).mean(), df["close"].rolling(w).std()
        # Bands are ma +/- 2*std: width as percent of ma, and close's position between the bands.
        df[f"bb_width_{w}"] = (4 * std) / ma * 100
        df[f"bb_position_{w}"] = (df["close"] - (ma - 2*std)) / (4*std + 1e-10)
        df[f"price_zscore_{w}"] = (df["close"] - ma) / (std + 1e-10)

    # 7. TIME FEATURES
    # Cyclical encodings of hour-of-day and day-of-week taken from open_time.
    hour = df["open_time"].dt.hour
    dow = df["open_time"].dt.dayofweek
    df["hour_sin"] = np.sin(2 * np.pi * hour / 24)
    df["hour_cos"] = np.cos(2 * np.pi * hour / 24)
    df["dow_sin"] = np.sin(2 * np.pi * dow / 7)
    df["dow_cos"] = np.cos(2 * np.pi * dow / 7)
    # Session flags by hour range; the ranges overlap.
    # NOTE(review): looks like these expect open_time in UTC - confirm.
    df["is_asia"] = ((hour >= 0) & (hour < 8)).astype(int)
    df["is_europe"] = ((hour >= 7) & (hour < 16)).astype(int)
    df["is_us"] = ((hour >= 13) & (hour < 22)).astype(int)
    df["is_weekend"] = (dow >= 5).astype(int)

    # 8. STATISTICAL
    # Rolling skewness and kurtosis of log returns.
    for w in [20, 50]:
        df[f"skewness_{w}"] = df["log_return"].rolling(w).skew()
        df[f"kurtosis_{w}"] = df["log_return"].rolling(w).kurt()

    # 9. PRICE PATTERNS
    # Percent distance from the rolling high/low and position inside the rolling range.
    for w in [20, 50, 100]:
        highest, lowest = df["high"].rolling(w).max(), df["low"].rolling(w).min()
        df[f"dist_from_high_{w}"] = (df["close"] - highest) / highest * 100
        df[f"dist_from_low_{w}"] = (df["close"] - lowest) / lowest * 100
        df[f"range_position_{w}"] = (df["close"] - lowest) / (highest - lowest + 1e-10)

    return df

def get_feature_columns(df):
    """List the model-input columns of ``df`` in their existing column order.

    Raw kline fields, identifiers, and any column whose name starts with
    ``target_`` are left out; everything else is treated as a feature.
    """
    non_features = {
        "open_time", "close_time", "symbol", "ignore",
        "open", "high", "low", "close", "volume",
        "quote_volume", "trades", "taker_buy_base", "taker_buy_quote",
        "hour", "day_of_week",
    }
    selected = []
    for name in df.columns:
        if name in non_features or name.startswith("target_"):
            continue
        selected.append(name)
    return selected

In [None]:
SYMBOLS = ["BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT"]
print("Collecting data...")
all_data = []
for sym in tqdm(SYMBOLS):
    df = fetch_klines_sync(sym, days=90)
    if len(df) > 0:
        all_data.append(df)
        print(f"  ‚úì {sym}: {len(df):,} rows")

if not all_data: raise ValueError("No data collected!")
raw_data = pd.concat(all_data, ignore_index=True)
print(f"\n‚úì Total: {len(raw_data):,} rows")

# Per-symbol split - RL uses ONLY the first 70% of each symbol's history.
# Features are computed per symbol so rolling windows never span two assets.
train_dfs = []
for sym in raw_data["symbol"].unique():
    sdf = raw_data[raw_data["symbol"]==sym].copy().sort_values("open_time").reset_index(drop=True)
    sdf = calculate_comprehensive_features(sdf)
    # Drop the 200-row warmup (longest rolling window) and any row still holding NaN/inf.
    sdf = sdf.replace([np.inf,-np.inf], np.nan).iloc[200:].dropna()
    n = len(sdf)
    train_end = int(n * 0.70)
    train_dfs.append(sdf.iloc[:train_end])
    print(f"{sym}: {train_end:,} train rows")

# Keep every symbol's rows contiguous and in time order. Sorting the combined
# frame by open_time would interleave the symbols minute by minute, and any
# consumer that walks the rows sequentially (the trading environment computes
# returns between consecutive rows) would see BTC -> ETH -> BNB -> SOL price
# jumps as if they were market moves.
train_df = pd.concat(train_dfs, ignore_index=True)
print(f"\n‚úì Total train: {len(train_df):,}")
print(f"‚úì Features: {len(get_feature_columns(train_df))}")

In [None]:
# Prepare RL data (TRAIN ONLY)
# Feature order is whatever column order train_df has; the scaler and the
# actor's input layout both depend on it.
feature_cols = get_feature_columns(train_df)
ohlcv_cols = ["open", "high", "low", "close", "volume"]

# The scaler is fitted on training rows only, so no validation/test statistics leak in.
scaler = RobustScaler()
rl_features = scaler.fit_transform(train_df[feature_cols].values)
# Unscaled prices for the environment; it reads column index 3 (close).
rl_ohlcv = train_df[ohlcv_cols].values

print(f"RL Features: {rl_features.shape}")
print(f"RL OHLCV: {rl_ohlcv.shape}")

In [None]:
# EVT Risk Model
class EVTRiskModel:
    """Peaks-over-threshold tail-risk model.

    Losses (negated negative returns) above a percentile threshold are fitted
    with a Generalized Pareto Distribution (location fixed at 0). VaR and CVaR
    are then extrapolated from the fitted tail.

    Attributes set by ``update``:
        losses: every loss seen so far (positive numbers).
        n_obs: number of finite returns seen so far, gains included. The tail
            probability is measured against this count, so ``var(0.99)`` is
            the loss exceeded by 1% of *all* observations rather than 1% of
            the losing ones.
        threshold, shape, scale: current threshold and GPD parameters
            (``None`` until enough data has been seen).
    """
    def __init__(self, threshold_percentile=95.0):
        self.threshold_percentile = threshold_percentile
        self.losses = []
        self.n_obs = 0
        self.shape = None
        self.scale = None
        self.threshold = None

    def update(self, returns):
        """Add ``returns`` (array-like) and refit the tail once 100 losses are available.

        Non-finite values are ignored. When the fit fails, the previous
        parameters are kept.
        """
        returns = np.asarray(returns, dtype=float).ravel()
        returns = returns[np.isfinite(returns)]
        self.n_obs += int(returns.size)
        self.losses.extend((-returns[returns < 0]).tolist())
        if len(self.losses) < 100:
            return
        losses_array = np.asarray(self.losses)
        self.threshold = np.percentile(losses_array, self.threshold_percentile)
        exceedances = losses_array[losses_array > self.threshold] - self.threshold
        if len(exceedances) >= 10:
            try:
                # genpareto.fit returns (shape, loc, scale); loc is pinned to 0.
                self.shape, _, self.scale = stats.genpareto.fit(exceedances, floc=0)
            except (ValueError, RuntimeError):
                # Degenerate data or optimiser failure: keep the last good fit.
                pass

    def var(self, confidence=0.99):
        """Return the Value-at-Risk (a positive loss) at ``confidence``, or 0.0 if unfitted."""
        if self.shape is None or self.threshold is None:
            return 0.0
        n_exc = int(np.count_nonzero(np.asarray(self.losses) > self.threshold))
        # Fall back to the loss count if n_obs was never maintained on this instance.
        n_total = max(getattr(self, "n_obs", 0), len(self.losses))
        if n_exc == 0 or n_total == 0:
            return 0.0
        p = n_exc / n_total   # empirical probability of exceeding the threshold
        q = 1 - confidence    # target tail probability
        if self.shape == 0:
            # Exponential-tail limit of the GPD quantile formula.
            return float(self.threshold + self.scale * np.log(p / q))
        return float(self.threshold + (self.scale / self.shape) * ((p / q) ** self.shape - 1))

    def cvar(self, confidence=0.99):
        """Return the expected shortfall at ``confidence``.

        Falls back to ``var(confidence)`` when the model is unfitted or the
        fitted shape is >= 1 (the tail mean is infinite in that case).
        """
        var = self.var(confidence)
        if self.shape is None or self.shape >= 1:
            return var
        return float(var / (1 - self.shape) + (self.scale - self.shape * self.threshold) / (1 - self.shape))

In [None]:
# D4PG Networks
class D4PGActor(nn.Module):
    """Deterministic policy: two LayerNorm+ReLU hidden layers, tanh output in [-1, 1]."""

    def __init__(self, state_dim, action_dim=1, hidden_dim=256):
        super().__init__()
        # Layers are created in forward order so parameter names (net.0, net.1, ...)
        # and initialisation order match existing checkpoints.
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Tanh(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, state):
        """Map a batch of states to a batch of actions."""
        action = self.net(state)
        return action


class D4PGCritic(nn.Module):
    """Categorical critic: returns a probability vector over ``n_atoms`` for each (state, action)."""

    def __init__(self, state_dim, action_dim=1, hidden_dim=256, n_atoms=51):
        super().__init__()
        self.n_atoms = n_atoms
        # Only the first hidden layer is layer-normalised.
        layers = [
            nn.Linear(state_dim + action_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_atoms),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, state, action):
        """Concatenate state and action on the last axis and return softmax probabilities."""
        joint = torch.cat((state, action), dim=-1)
        logits = self.net(joint)
        return logits.softmax(dim=-1)

In [None]:
# Trading Environment
class TradingEnvRL:
    """RL Environment using TRAIN data only.

    The agent controls a position expressed as a fraction of equity in
    [-1, 1]. ``data`` is a 2-D array whose column 3 is the close price and
    ``features`` is the row-aligned feature matrix. One episode walks the rows
    once from the start.

    Args:
        data: array of shape (rows, >=4); column index 3 is read as close.
        features: array of shape (rows, n_features).
        initial_balance: starting equity.
        transaction_cost: proportional fee charged on traded notional.
    """
    def __init__(self, data, features, initial_balance=100000, transaction_cost=0.0005):
        self.data = data
        self.features = features
        self.initial_balance = initial_balance
        self.transaction_cost = transaction_cost
        self.reset()

    def reset(self):
        """Restore the starting balance, flatten the position and return the first state."""
        self.balance = self.initial_balance
        self.position = 0.0
        self.step_idx = 0
        self.returns = []
        return self._get_state()

    def _get_state(self):
        """Current row's features followed by four portfolio values.

        Portfolio part: position, balance relative to start minus 1, and the
        mean / std of the last 20 step returns (0 until enough history exists).
        """
        market = self.features[self.step_idx]
        portfolio = np.array([
            self.position,
            self.balance / self.initial_balance - 1,
            np.mean(self.returns[-20:]) if self.returns else 0,
            np.std(self.returns[-20:]) if len(self.returns) > 1 else 0
        ])
        return np.concatenate([market, portfolio])

    def step(self, action):
        """Advance one row.

        Args:
            action: sequence whose first element is the target position; it is
                clipped to [-1, 1].

        Returns:
            ``(next_state, reward, done)``. On the terminal step the state is
            all zeros and the reward is 0.
        """
        target_pos = float(np.clip(action[0], -1, 1))
        pos_change = target_pos - self.position
        current_price = self.data[self.step_idx, 3]  # close price
        # Position is a fraction of equity, so the traded notional is
        # |change| * balance and the fee is proportional to that. (Multiplying
        # by the asset price instead would make the fee depend on the quote
        # price of the symbol rather than on the amount traded.)
        cost = abs(pos_change) * self.balance * self.transaction_cost

        self.step_idx += 1
        done = self.step_idx >= len(self.data) - 1

        if not done:
            next_price = self.data[self.step_idx, 3]
            ret = (next_price - current_price) / current_price
            # PnL accrues to the position held coming into this bar; the new
            # target only earns from the following bar on, while its fee is
            # charged now.
            pnl = self.position * ret * self.balance - cost
            self.balance += pnl
            step_ret = pnl / self.initial_balance
            self.returns.append(step_ret)
            self.position = target_pos

            # Reward: Sharpe-like ratio of the last 20 step returns.
            if len(self.returns) > 1:
                reward = np.mean(self.returns[-20:]) / (np.std(self.returns[-20:]) + 1e-8)
            else:
                # A std needs two samples; use the scaled raw return on the first step.
                reward = step_ret * 100
        else:
            reward = 0

        return self._get_state() if not done else np.zeros_like(self._get_state()), reward, done

In [None]:
# Training
print("="*60)
print("TRAINING D4PG+EVT (CUDA)")
print("="*60)
print("WARNING: This will take 3-4 hours")
print("="*60)

state_dim = rl_features.shape[1] + 4  # features + portfolio state

# One environment per symbol. The environment computes returns between
# consecutive rows, so every env must see a single asset in time order;
# selecting by symbol label gives that regardless of how train_df is ordered.
symbol_labels = train_df["symbol"].to_numpy()
envs = []
for sym in pd.unique(symbol_labels):
    rows = symbol_labels == sym
    envs.append(TradingEnvRL(rl_ohlcv[rows], rl_features[rows]))
# Later cells read `env`; after training it refers to the last symbol's environment.
env = envs[-1]

actor = D4PGActor(state_dim).to(DEVICE)
actor_target = D4PGActor(state_dim).to(DEVICE)
actor_target.load_state_dict(actor.state_dict())

critic = D4PGCritic(state_dim).to(DEVICE)
critic_target = D4PGCritic(state_dim).to(DEVICE)
critic_target.load_state_dict(critic.state_dict())

actor_opt = torch.optim.Adam(actor.parameters(), lr=1e-4)
critic_opt = torch.optim.Adam(critic.parameters(), lr=3e-4)

evt_model = EVTRiskModel()
buffer = deque(maxlen=100000)
batch_size = 256
gamma = 0.99
tau = 0.005
episodes = 200

# Fixed support of the categorical value distribution. The critic outputs
# probabilities over these atoms; a Q-value is sum(prob * atom). Averaging the
# probabilities themselves is always 1/n_atoms and carries no gradient.
# NOTE(review): the [-30, 30] range is a tunable guess for the discounted sum
# of Sharpe-like rewards; out-of-range targets are clamped - tune if needed.
V_MIN, V_MAX = -30.0, 30.0
N_ATOMS = critic.n_atoms
support = torch.linspace(V_MIN, V_MAX, N_ATOMS, device=DEVICE)
delta_z = (V_MAX - V_MIN) / (N_ATOMS - 1)


def project_target_distribution(next_probs, rewards, dones):
    """Project the Bellman-shifted target distribution back onto ``support``.

    next_probs: (batch, N_ATOMS) target-critic probabilities for the next state.
    rewards, dones: (batch, 1) tensors. Returns a (batch, N_ATOMS) distribution.
    """
    shifted = (rewards + gamma * (1.0 - dones) * support.unsqueeze(0)).clamp(V_MIN, V_MAX)
    # Fractional atom index of every shifted atom; clamp guards float round-off.
    pos = ((shifted - V_MIN) / delta_z).clamp(0, N_ATOMS - 1)
    lower = pos.floor().long()
    upper = pos.ceil().long()
    lower_w = upper.float() - pos
    upper_w = pos - lower.float()
    # Exactly on an atom: both weights would be 0, so give it the full mass.
    lower_w = torch.where(lower == upper, torch.ones_like(lower_w), lower_w)
    projected = torch.zeros_like(next_probs)
    projected.scatter_add_(1, lower, next_probs * lower_w)
    projected.scatter_add_(1, upper, next_probs * upper_w)
    return projected


start_time = time.time()
for ep in range(episodes):
    ep_reward = 0
    episode_returns = []

    # An episode is one pass over every symbol's training period.
    for env in envs:
        state = env.reset()

        while True:
            state_t = torch.FloatTensor(state).unsqueeze(0).to(DEVICE)
            with torch.no_grad():
                action = actor(state_t).cpu().numpy()[0]

            # Exploration noise
            action = action + np.random.normal(0, 0.1, size=action.shape)
            action = np.clip(action, -1, 1)

            next_state, reward, done = env.step(action)
            buffer.append((state, action, reward, next_state, done))

            state = next_state
            ep_reward += reward

            # Training step
            if len(buffer) >= batch_size:
                batch = random.sample(buffer, batch_size)
                states, actions, rewards, next_states, dones = zip(*batch)

                states = torch.as_tensor(np.asarray(states), dtype=torch.float32, device=DEVICE)
                actions = torch.as_tensor(np.asarray(actions), dtype=torch.float32, device=DEVICE)
                rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=DEVICE).unsqueeze(1)
                next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32, device=DEVICE)
                dones = torch.as_tensor(np.asarray(dones, dtype=np.float32), device=DEVICE).unsqueeze(1)

                # Critic update: cross-entropy between the projected target
                # distribution and the predicted one.
                with torch.no_grad():
                    next_actions = actor_target(next_states)
                    next_probs = critic_target(next_states, next_actions)
                    target_probs = project_target_distribution(next_probs, rewards, dones)

                current_probs = critic(states, actions)
                critic_loss = -(target_probs * torch.log(current_probs.clamp_min(1e-8))).sum(dim=1).mean()

                critic_opt.zero_grad()
                critic_loss.backward()
                critic_opt.step()

                # Actor update: maximise the expected value of the critic's distribution.
                actor_q = (critic(states, actor(states)) * support).sum(dim=1)
                actor_loss = -actor_q.mean()

                actor_opt.zero_grad()
                actor_loss.backward()
                actor_opt.step()

                # Soft update
                for p, tp in zip(actor.parameters(), actor_target.parameters()):
                    tp.data.copy_(tau * p.data + (1-tau) * tp.data)
                for p, tp in zip(critic.parameters(), critic_target.parameters()):
                    tp.data.copy_(tau * p.data + (1-tau) * tp.data)

            if done:
                break

        episode_returns.extend(env.returns)

    # Fit the tail model on this episode's step returns (not on the shaped
    # reward), once per episode: refitting after every step is prohibitively
    # slow, and a fresh model keeps the estimate tied to the current policy.
    evt_model = EVTRiskModel()
    evt_model.update(np.asarray(episode_returns, dtype=float))

    if (ep + 1) % 20 == 0:
        total_ret = float(np.mean([(e.balance - e.initial_balance) / e.initial_balance for e in envs]))
        print(f"Episode {ep+1}/{episodes} - Return: {total_ret:.2%}, VaR: {evt_model.var():.4f}")

train_time = time.time() - start_time
print(f"\n‚úì Training time: {train_time/3600:.2f} hours")

In [None]:
# Comprehensive RL Evaluation with Overfitting Detection
from scipy.stats import spearmanr

def comprehensive_rl_metrics(env, model_name="D4PG"):
    """Compute performance and risk statistics from an environment's last episode.

    Args:
        env: object exposing ``returns`` (per-step returns), ``balance`` and
            ``initial_balance``.
        model_name: label used only in the "no returns" message.

    Returns:
        Dict of builtin ``float``/``int`` values (safe for ``json.dump``), or
        an empty dict when ``env.returns`` is empty. ``n_trades`` is the number
        of recorded step returns.
    """
    returns = np.asarray(env.returns, dtype=float)

    if returns.size == 0:
        print(f"‚ö†Ô∏è No returns to evaluate for {model_name}")
        return {}

    ann_factor = np.sqrt(252 * 24 * 60)  # minute-level annualization

    # Basic metrics
    total_return = (env.balance - env.initial_balance) / env.initial_balance
    mean_return = np.mean(returns)
    std_return = np.std(returns)

    # Risk-adjusted metrics
    sharpe = (mean_return / (std_return + 1e-10)) * ann_factor

    # Sortino ratio: same numerator, std of the negative returns only
    downside_returns = returns[returns < 0]
    downside_std = np.std(downside_returns) if len(downside_returns) > 0 else 1e-10
    sortino = (mean_return / (downside_std + 1e-10)) * ann_factor

    # Maximum drawdown on the additive equity curve. The curve starts at 0
    # (initial equity) so a loss on the very first step counts as drawdown.
    equity = np.concatenate(([0.0], np.cumsum(returns)))
    running_max = np.maximum.accumulate(equity)
    max_dd = np.max(running_max - equity)

    # Calmar ratio
    calmar = total_return / (max_dd + 1e-10) if max_dd > 0 else 0

    # Win rate & profit factor
    wins = returns[returns > 0]
    losses = returns[returns < 0]
    win_rate = len(wins) / len(returns)
    profit_factor = abs(np.sum(wins) / (np.sum(losses) + 1e-10)) if len(losses) > 0 else 0

    # Tail risk metrics (empirical percentiles of the return distribution)
    var_95 = np.percentile(returns, 5)
    var_99 = np.percentile(returns, 1)
    tail = returns[returns <= var_95]
    cvar_95 = np.mean(tail) if len(tail) > 0 else 0

    # Stability: compare the Sharpe ratio of the first and second half
    half = len(returns) // 2
    first, second = returns[:half], returns[half:]
    sharpe_first = (np.mean(first) / (np.std(first) + 1e-10)) * ann_factor if len(first) > 0 else 0
    sharpe_second = (np.mean(second) / (np.std(second) + 1e-10)) * ann_factor if len(second) > 0 else 0
    sharpe_stability = 1 - abs(sharpe_first - sharpe_second) / (abs(sharpe_first) + abs(sharpe_second) + 1e-10)

    metrics = {
        "total_return": float(total_return),
        "mean_return": float(mean_return),
        "std_return": float(std_return),
        "sharpe": float(sharpe),
        "sortino": float(sortino),
        "max_drawdown": float(max_dd),
        "calmar": float(calmar),
        "win_rate": float(win_rate),
        "profit_factor": float(profit_factor),
        "var_95": float(var_95),
        "var_99": float(var_99),
        "cvar_95": float(cvar_95),
        "sharpe_stability": float(sharpe_stability),
        "n_trades": int(len(returns))
    }

    return metrics

def print_rl_metrics(metrics, model_name="D4PG"):
    """Print the metrics dict produced by ``comprehensive_rl_metrics``.

    An empty dict (which that function returns when there are no returns to
    evaluate) is reported with a single line instead of raising ``KeyError``.
    """
    print(f"\n{'='*60}")
    print(f"{model_name} COMPREHENSIVE EVALUATION")
    print(f"{'='*60}")

    if not metrics:
        print("No metrics available - nothing to report")
        return

    print(f"\nüìä RETURN METRICS:")
    print(f"  Total Return:     {metrics['total_return']:.4%}")
    print(f"  Mean Return:      {metrics['mean_return']:.6f}")
    print(f"  Std Return:       {metrics['std_return']:.6f}")

    print(f"\nüìà RISK-ADJUSTED METRICS:")
    print(f"  Sharpe Ratio:     {metrics['sharpe']:.4f}")
    print(f"  Sortino Ratio:    {metrics['sortino']:.4f}")
    print(f"  Calmar Ratio:     {metrics['calmar']:.4f}")

    print(f"\nüìâ RISK METRICS:")
    print(f"  Max Drawdown:     {metrics['max_drawdown']:.4%}")
    print(f"  VaR 95%:          {metrics['var_95']:.6f}")
    print(f"  VaR 99%:          {metrics['var_99']:.6f}")
    print(f"  CVaR 95%:         {metrics['cvar_95']:.6f}")

    print(f"\nüéØ TRADING METRICS:")
    print(f"  Win Rate:         {metrics['win_rate']:.2%}")
    print(f"  Profit Factor:    {metrics['profit_factor']:.4f}")
    print(f"  Total Trades:     {metrics['n_trades']:,}")

    print(f"\nüî¨ STABILITY METRICS:")
    print(f"  Sharpe Stability: {metrics['sharpe_stability']:.4f}")

def overfitting_analysis_rl(metrics, evt_model):
    """Flag metric values that look too good to be true and print a verdict.

    Args:
        metrics: dict produced by ``comprehensive_rl_metrics``. An empty dict
            skips the analysis and returns an empty list.
        evt_model: object with a ``var()`` method returning the EVT VaR as a
            positive loss (0 means "not fitted").

    Returns:
        List of warning strings, one per triggered check.
    """
    print(f"\n{'='*60}")
    print("OVERFITTING ANALYSIS")
    print(f"{'='*60}")

    if not metrics:
        print("No metrics available - skipping overfitting analysis")
        return []

    # Named `flags` so the imported `warnings` module is not shadowed.
    flags = []

    # Check for unrealistic metrics
    if metrics['sharpe'] > 3.0:
        flags.append(f"‚ö†Ô∏è HIGH SHARPE ({metrics['sharpe']:.2f}) - Possible overfitting or look-ahead bias")

    if metrics['win_rate'] > 0.60:
        flags.append(f"‚ö†Ô∏è HIGH WIN RATE ({metrics['win_rate']:.2%}) - Check for data leakage")

    if metrics['sharpe_stability'] < 0.5:
        flags.append(f"‚ö†Ô∏è LOW STABILITY ({metrics['sharpe_stability']:.2f}) - Performance not consistent across time")

    if metrics['profit_factor'] > 3.0:
        flags.append(f"‚ö†Ô∏è HIGH PROFIT FACTOR ({metrics['profit_factor']:.2f}) - Suspiciously good")

    if metrics['max_drawdown'] < 0.01:
        flags.append(f"‚ö†Ô∏è VERY LOW DRAWDOWN ({metrics['max_drawdown']:.4%}) - Unrealistic for volatile assets")

    # EVT consistency check: the model's VaR should be within 50% of the
    # empirical 1% return percentile (compared as positive losses).
    evt_var = evt_model.var()
    if evt_var > 0:
        empirical_var = abs(metrics['var_99'])
        if abs(evt_var - empirical_var) / (empirical_var + 1e-10) > 0.5:
            flags.append(f"‚ö†Ô∏è EVT-EMPIRICAL VAR MISMATCH - EVT: {evt_var:.6f} vs Empirical: {empirical_var:.6f}")

    if len(flags) == 0:
        print("‚úÖ No obvious overfitting signals detected")
        print("   - Sharpe ratio in realistic range")
        print("   - Win rate not suspiciously high")
        print("   - Performance stable across time")
    else:
        for w in flags:
            print(w)

    # Final verdict
    print(f"\nüìã VERDICT:")
    if len(flags) <= 1:
        print("‚úÖ Model appears well-calibrated for production")
    elif len(flags) <= 3:
        print("‚ö†Ô∏è Some concerns - recommend additional validation")
    else:
        print("‚ùå Multiple overfitting signals - DO NOT deploy without investigation")

    return flags

# Evaluate the environment's most recent episode and report the results
metrics = comprehensive_rl_metrics(env, "D4PG+EVT")
print_rl_metrics(metrics, "D4PG+EVT")
overfitting_warnings = overfitting_analysis_rl(metrics, evt_model)

# Persist the metrics together with the EVT tail estimates
import json
eval_metrics = dict(metrics)
eval_metrics.update(
    evt_var_99=float(evt_model.var()),
    evt_cvar_99=float(evt_model.cvar()),
    overfitting_warnings=len(overfitting_warnings),
)
evaluation_path = TRAINED_DIR / "d4pg_evaluation.json"
with open(evaluation_path, "w") as f:
    json.dump(eval_metrics, f, indent=2)

In [None]:
# Export ONNX (opset_version=15 for Colab compatibility, saved to trained/)
actor.eval()
dummy_state = torch.randn(1, state_dim).to(DEVICE)

# Save directly to trained/ directory
onnx_path = TRAINED_DIR / "d4pg_actor.onnx"
torch.onnx.export(
    actor, dummy_state, str(onnx_path),
    input_names=["state"], output_names=["action"],
    dynamic_axes={"state": {0: "batch"}, "action": {0: "batch"}},
    opset_version=15  # 17 was not accepted by the Colab ONNX toolchain
)

import onnx
onnx.checker.check_model(onnx.load(str(onnx_path)))
print(f"‚úì ONNX saved: {onnx_path}")

# Metadata. Besides the training summary it records what an inference client
# needs to rebuild the actor's input: the feature order and the fitted scaler
# statistics. Without them the exported model cannot be fed correctly.
metadata = {
    "model_type": "d4pg_actor",
    "state_dim": state_dim,
    "action_dim": 1,
    "evt_metrics": {"var_99": float(evt_model.var()), "cvar_99": float(evt_model.cvar())},
    "train_time_hours": train_time / 3600,
    "evaluation": eval_metrics,
    # State layout: scaled market features first, then the portfolio values
    # appended by the environment.
    "feature_columns": list(feature_cols),
    "n_portfolio_features": int(state_dim - len(feature_cols)),
    "scaler": {
        "type": "RobustScaler",
        "center": scaler.center_.tolist(),
        "scale": scaler.scale_.tolist(),
    },
}
with open(TRAINED_DIR / "d4pg_metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)

# Save PyTorch
torch.save(actor.state_dict(), TRAINED_DIR / "d4pg_actor.pt")
print("\n‚úì D4PG+EVT TRAINING COMPLETE!")