In [1]:
# ================================
# üöÄ VAST.AI ONE-CELL SOLUTION (Everything in One Cell)
# ================================
print("\n" + "=" * 80)
print("üöÄ VAST.AI ONE-CELL SOLUTION")
print("=" * 80)

# ----- PART 1: INSTALL DEPENDENCIES -----
print("\nüì¶ PART 1: Installing dependencies...")
import subprocess
import sys

# Install all required packages
packages = [
    "numpy", "pandas", "matplotlib", "scipy", "scikit-learn",
    "yfinance==0.2.36", "xgboost", "shap", "pyarrow", "fastparquet"
]

print("Installing Python packages...")
for pkg in packages:
    !pip -q install {pkg}

# Install PyTorch with CUDA support
print("Installing PyTorch with CUDA...")
!pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

print("‚úÖ Dependencies installed!")

# ----- PART 2: FIND AND EXTRACT DATA -----
print("\nüìÇ PART 2: Finding and extracting data...")
import os
import zipfile
import glob
from pathlib import Path

# Look for the zip file
zip_files = []
search_paths = [
    "/home/user/",  # Default upload location
    "/workspace/",  # Persistent storage
    "/content/",    # Colab-like
    os.path.expanduser("~"),  # Home directory
    "."  # Current directory
]

for path in search_paths:
    if os.path.exists(path):
        found = glob.glob(os.path.join(path, "quantgenius_complete_*.zip"))
        zip_files.extend(found)

if not zip_files:
    # Try any zip file
    for path in search_paths:
        if os.path.exists(path):
            found = glob.glob(os.path.join(path, "*.zip"))
            zip_files.extend(found)

if zip_files:
    zip_file = sorted(zip_files)[-1]  # Get most recent
    print(f"üì¶ Found: {zip_file}")
    
    # Extract to workspace (persistent storage)
    extract_to = "/workspace/quantgenius_project"
    print(f"üìÇ Extracting to: {extract_to}")
    
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall("/workspace/")
    
    print("‚úÖ Extraction complete!")
    BASE_DIR = Path(extract_to)
else:
    print("‚ö†Ô∏è No zip file found - checking for existing project...")
    # Check if project already exists
    possible_locations = [
        "/workspace/quantgenius_project",
        "/content/quantgenius_project",
        os.path.expanduser("~/quantgenius_project"),
        "quantgenius_project"
    ]
    
    for loc in possible_locations:
        if os.path.exists(loc):
            BASE_DIR = Path(loc)
            print(f"‚úÖ Found existing project at: {loc}")
            break
    else:
        raise FileNotFoundError("‚ùå No project found! Please upload the zip file first.")

# ----- PART 3: LOAD ALL DATA -----
print("\nüìä PART 3: Loading all data...")
import numpy as np
import pandas as pd
import json
import torch

# Set up project directories
PROJECT_DIRS = {
    "data": BASE_DIR / "data",
    "figures": BASE_DIR / "figures",
    "tables": BASE_DIR / "tables",
    "models": BASE_DIR / "models",
    "reports": BASE_DIR / "reports",
    "cache": BASE_DIR / "cache"
}

# Create directories if they don't exist
for name, path in PROJECT_DIRS.items():
    path.mkdir(parents=True, exist_ok=True)

# Load metadata
metadata_files = list((BASE_DIR / "tables").glob("targets_metadata_*.json"))
if metadata_files:
    with open(sorted(metadata_files)[-1], 'r') as f:
        metadata = json.load(f)
    
    FORECAST_HORIZONS = metadata.get("forecast_horizons", {
        "short_1d": 1, "short_3d": 3, "short_5d": 5,
        "intermediate_10d": 10, "intermediate_15d": 15, "intermediate_20d": 20
    })
    TEMPORAL_SPLITS = metadata.get("temporal_splits", {})
    print(f"‚úÖ Loaded metadata: {len(FORECAST_HORIZONS)} horizons")
else:
    print("‚ö†Ô∏è Using default horizons")
    FORECAST_HORIZONS = {
        "short_1d": 1, "short_3d": 3, "short_5d": 5,
        "intermediate_10d": 10, "intermediate_15d": 15, "intermediate_20d": 20
    }
    TEMPORAL_SPLITS = {}
    metadata = {}

MAX_HORIZON = max(FORECAST_HORIZONS.values())

# Load DATASETS (numpy arrays)
print("\nüß© Loading DATASETS...")
DATASETS = {}

# Find the latest datasets folder
datasets_folders = list((BASE_DIR / "data").glob("datasets_*"))
if datasets_folders:
    latest_dataset = sorted(datasets_folders)[-1]
    print(f"üìÇ Loading from: {latest_dataset.name}")
    
    for horizon in FORECAST_HORIZONS.keys():
        horizon_dir = latest_dataset / horizon
        if horizon_dir.exists():
            DATASETS[horizon] = {}
            npy_files = list(horizon_dir.glob("*.npy"))
            
            for npy_file in npy_files:
                try:
                    array = np.load(npy_file)
                    DATASETS[horizon][npy_file.stem] = array
                except Exception as e:
                    print(f"  ‚ö†Ô∏è Failed to load {npy_file.name}: {e}")
            
            # Print key shapes for verification
            if npy_files:
                print(f"  ‚úì {horizon}: {len(npy_files)} arrays")
                if "X_train" in DATASETS[horizon]:
                    print(f"    X_train: {DATASETS[horizon]['X_train'].shape}")
                if "y_train_raw" in DATASETS[horizon]:
                    print(f"    y_train_raw: {DATASETS[horizon]['y_train_raw'].shape}")
else:
    print("‚ùå No DATASETS found! Checking for parquet files...")

# Load main DataFrames from parquet files
print("\nüíæ Loading DataFrames...")
dataframes_to_load = {}

# Check for parquet files in datasets folder
if datasets_folders:
    latest_dataset = sorted(datasets_folders)[-1]
    parquet_files = list(latest_dataset.glob("*.parquet"))
    
    for pq_file in parquet_files:
        try:
            df_name = pq_file.stem
            dataframes_to_load[df_name] = pd.read_parquet(pq_file)
            print(f"  ‚úì {df_name}: {dataframes_to_load[df_name].shape}")
        except Exception as e:
            print(f"  ‚ö†Ô∏è Failed to load {pq_file.name}: {e}")

# Assign to variables
if "features" in dataframes_to_load:
    features_df = dataframes_to_load["features"]
if "targets" in dataframes_to_load:
    targets_df = dataframes_to_load["targets"]
if "returns" in dataframes_to_load:
    returns_df = dataframes_to_load["returns"]
if "targets_all" in dataframes_to_load:
    targets_all = dataframes_to_load["targets_all"]
if "cleaned_prices" in dataframes_to_load:
    cleaned_prices = dataframes_to_load["cleaned_prices"]

# ----- PART 4: SET GLOBALS FOR CELLS 7-10 -----
print("\nüîß PART 4: Setting up globals for Cells 7-10...")

# Device setup
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üñ•Ô∏è Device: {DEVICE}")
if DEVICE.type == "cuda":
    print(f"‚úÖ GPU: {torch.cuda.get_device_name(0)}")

RANDOM_SEED = 42

# Asset universe (from your Cell 1)
ASSET_UNIVERSE = ['AAPL', 'MSFT', 'NVDA', 'AMZN', 'GOOGL', 'TSLA', 'META', 'JPM', 'KO', 'DIS',
                  'EURUSD=X', 'GBPUSD=X', 'USDJPY=X', 'EURGBP=X', 'USDCHF=X']
STOCKS = ['AAPL', 'MSFT', 'NVDA', 'AMZN', 'GOOGL', 'TSLA', 'META', 'JPM', 'KO', 'DIS']
FX_PAIRS = ['EURUSD=X', 'GBPUSD=X', 'USDJPY=X', 'EURGBP=X', 'USDCHF=X']

# Update globals dictionary
globals().update({
    "PROJECT_DIRS": PROJECT_DIRS,
    "BASE_DIR": BASE_DIR,
    "DATASETS": DATASETS,
    "FORECAST_HORIZONS": FORECAST_HORIZONS,
    "MAX_HORIZON": MAX_HORIZON,
    "TEMPORAL_SPLITS": TEMPORAL_SPLITS,
    "ASSET_UNIVERSE": ASSET_UNIVERSE,
    "STOCKS": STOCKS,
    "FX_PAIRS": FX_PAIRS,
    "DEVICE": DEVICE,
    "RANDOM_SEED": RANDOM_SEED,
    "metadata": metadata,
})

# Also add DataFrames to globals if they exist
for var_name in ["features_df", "targets_df", "returns_df", "targets_all", "cleaned_prices"]:
    if var_name in locals():
        globals()[var_name] = locals()[var_name]
        print(f"  ‚úÖ {var_name} available")

# ----- PART 5: VERIFICATION -----
print("\nüîç PART 5: Verification...")
print(f"‚úÖ DATASETS loaded: {len(DATASETS)} horizons")
print(f"‚úÖ FORECAST_HORIZONS: {FORECAST_HORIZONS}")
print(f"‚úÖ PROJECT_DIRS set up: {len(PROJECT_DIRS)} directories")
print(f"‚úÖ DEVICE: {DEVICE}")

# Test DATASETS structure
if DATASETS:
    sample_horizon = list(DATASETS.keys())[0]
    sample_data = DATASETS[sample_horizon]
    print(f"\nüìä Sample horizon '{sample_horizon}':")
    
    # Count different types of arrays
    x_keys = [k for k in sample_data.keys() if k.startswith('X_')]
    y_keys = [k for k in sample_data.keys() if k.startswith('y_')]
    
    print(f"  ‚Ä¢ X arrays: {len(x_keys)} (train/val/test)")
    print(f"  ‚Ä¢ y arrays: {len(y_keys)} (raw/volnorm/dir/cls/etc.)")
    
    # Show some shapes
    if "X_train" in sample_data:
        print(f"  ‚Ä¢ X_train shape: {sample_data['X_train'].shape}")
    if "y_train_raw" in sample_data:
        print(f"  ‚Ä¢ y_train_raw shape: {sample_data['y_train_raw'].shape}")

print("\n" + "=" * 80)
print("üéØ READY FOR CELLS 7-10!")
print("=" * 80)
print("\nüìã You can now copy-paste your Cells 7, 8, 9, 10 from Colab.")
print("üîß Required globals are available:")
print("   ‚Ä¢ DATASETS")
print("   ‚Ä¢ PROJECT_DIRS") 
print("   ‚Ä¢ FORECAST_HORIZONS")
print("   ‚Ä¢ ASSET_UNIVERSE, STOCKS, FX_PAIRS")
print("   ‚Ä¢ DEVICE (GPU ready)")
print("   ‚Ä¢ RANDOM_SEED")
print("=" * 80)



üöÄ VAST.AI ONE-CELL SOLUTION

üì¶ PART 1: Installing dependencies...
Installing Python packages...
[0mInstalling PyTorch with CUDA...
[0m‚úÖ Dependencies installed!

üìÇ PART 2: Finding and extracting data...
üì¶ Found: ./quantgenius_complete_20260116_235212.zip
üìÇ Extracting to: /workspace/quantgenius_project
‚úÖ Extraction complete!

üìä PART 3: Loading all data...
‚ö†Ô∏è Using default horizons

üß© Loading DATASETS...
‚ùå No DATASETS found! Checking for parquet files...

üíæ Loading DataFrames...

üîß PART 4: Setting up globals for Cells 7-10...
üñ•Ô∏è Device: cuda
‚úÖ GPU: NVIDIA GeForce RTX 3080 Ti

üîç PART 5: Verification...
‚úÖ DATASETS loaded: 0 horizons
‚úÖ FORECAST_HORIZONS: {'short_1d': 1, 'short_3d': 3, 'short_5d': 5, 'intermediate_10d': 10, 'intermediate_15d': 15, 'intermediate_20d': 20}
‚úÖ PROJECT_DIRS set up: 6 directories
‚úÖ DEVICE: cuda

üéØ READY FOR CELLS 7-10!

üìã You can now copy-paste your Cells 7, 8, 9, 10 from Colab.
üîß Required globals a

In [9]:
# ================================
# üöÄ QUICK FIX: Manual Load
# ================================
print("üöÄ MANUAL LOAD")

import zipfile
import os

# Re-extract manually
zip_path = "./quantgenius_complete_20260116_235212.zip"
extract_to = "/workspace/"

print(f"Extracting {zip_path} to {extract_to}")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

# Check what we got
import glob
all_folders = glob.glob("/workspace/**", recursive=True)
datasets_folders = [f for f in all_folders if "datasets_" in f and os.path.isdir(f)]

print(f"\nFound {len(datasets_folders)} datasets folders:")
for f in datasets_folders:
    print(f"  ‚Ä¢ {f}")



üöÄ MANUAL LOAD
Extracting ./quantgenius_complete_20260116_235212.zip to /workspace/

Found 7 datasets folders:
  ‚Ä¢ /workspace/data/datasets_20260116_234738
  ‚Ä¢ /workspace/data/datasets_20260116_234738/intermediate_10d
  ‚Ä¢ /workspace/data/datasets_20260116_234738/intermediate_15d
  ‚Ä¢ /workspace/data/datasets_20260116_234738/intermediate_20d
  ‚Ä¢ /workspace/data/datasets_20260116_234738/short_1d
  ‚Ä¢ /workspace/data/datasets_20260116_234738/short_3d
  ‚Ä¢ /workspace/data/datasets_20260116_234738/short_5d


In [10]:
# ================================
# ‚úÖ FINAL: LOAD DATASETS PROPERLY
# ================================
print("\n" + "=" * 80)
print("‚úÖ FINAL: LOADING DATASETS")
print("=" * 80)

import numpy as np
import pandas as pd
from pathlib import Path

# The correct path
DATASETS_PATH = Path("/workspace/data/datasets_20260116_234738")
print(f"üìÇ Loading from: {DATASETS_PATH}")

# Load DATASETS
DATASETS = {}
horizons = ["short_1d", "short_3d", "short_5d", "intermediate_10d", "intermediate_15d", "intermediate_20d"]

print("\nüß© Loading numpy arrays...")
for horizon in horizons:
    horizon_dir = DATASETS_PATH / horizon
    if horizon_dir.exists():
        DATASETS[horizon] = {}
        npy_files = list(horizon_dir.glob("*.npy"))
        
        for npy_file in npy_files:
            try:
                DATASETS[horizon][npy_file.stem] = np.load(npy_file)
            except Exception as e:
                print(f"  ‚ö†Ô∏è Failed to load {npy_file.name}: {e}")
        
        print(f"  ‚úÖ {horizon}: {len(npy_files)} arrays")
        
        # Show key shapes
        if "X_train" in DATASETS[horizon]:
            print(f"    X_train: {DATASETS[horizon]['X_train'].shape}")
        if "y_train_raw" in DATASETS[horizon]:
            print(f"    y_train_raw: {DATASETS[horizon]['y_train_raw'].shape}")
    else:
        print(f"  ‚ùå {horizon}: not found")

print(f"\n‚úÖ DATASETS loaded: {len(DATASETS)} horizons")

# Load DataFrames (parquet files)
print("\nüíæ Loading DataFrames...")
dataframes = {}
for parquet_file in DATASETS_PATH.glob("*.parquet"):
    try:
        df_name = parquet_file.stem
        dataframes[df_name] = pd.read_parquet(parquet_file)
        print(f"  ‚úÖ {df_name}: {dataframes[df_name].shape}")
    except Exception as e:
        print(f"  ‚ö†Ô∏è Failed to load {parquet_file.name}: {e}")

# Assign to variables
if "features" in dataframes:
    features_df = dataframes["features"]
if "targets" in dataframes:
    targets_df = dataframes["targets"]
if "returns" in dataframes:
    returns_df = dataframes["returns"]
if "targets_all" in dataframes:
    targets_all = dataframes["targets_all"]
if "cleaned_prices" in dataframes:
    cleaned_prices = dataframes["cleaned_prices"]
if "log_prices" in dataframes:
    log_prices = dataframes["log_prices"]

# Set up PROJECT_DIRS
PROJECT_DIRS = {
    "data": Path("/workspace/quantgenius_project/data"),
    "figures": Path("/workspace/quantgenius_project/figures"),
    "tables": Path("/workspace/quantgenius_project/tables"),
    "models": Path("/workspace/quantgenius_project/models"),
    "reports": Path("/workspace/quantgenius_project/reports"),
    "cache": Path("/workspace/quantgenius_project/cache")
}

# Create directories if they don't exist
for name, path in PROJECT_DIRS.items():
    path.mkdir(parents=True, exist_ok=True)

# Set other required globals
FORECAST_HORIZONS = {
    "short_1d": 1, "short_3d": 3, "short_5d": 5,
    "intermediate_10d": 10, "intermediate_15d": 15, "intermediate_20d": 20
}
MAX_HORIZON = max(FORECAST_HORIZONS.values())

ASSET_UNIVERSE = ['AAPL', 'MSFT', 'NVDA', 'AMZN', 'GOOGL', 'TSLA', 'META', 'JPM', 'KO', 'DIS',
                  'EURUSD=X', 'GBPUSD=X', 'USDJPY=X', 'EURGBP=X', 'USDCHF=X']
STOCKS = ['AAPL', 'MSFT', 'NVDA', 'AMZN', 'GOOGL', 'TSLA', 'META', 'JPM', 'KO', 'DIS']
FX_PAIRS = ['EURUSD=X', 'GBPUSD=X', 'USDJPY=X', 'EURGBP=X', 'USDCHF=X']

# TEMPORAL_SPLITS (you might need to load from metadata)
# Try to find metadata
import json, glob
metadata_files = glob.glob("/workspace/quantgenius_project/tables/targets_metadata_*.json")
if metadata_files:
    with open(sorted(metadata_files)[-1], 'r') as f:
        metadata = json.load(f)
    TEMPORAL_SPLITS = metadata.get("temporal_splits", {})
    print(f"‚úÖ Loaded TEMPORAL_SPLITS from metadata")
else:
    TEMPORAL_SPLITS = {}
    print("‚ö†Ô∏è Using empty TEMPORAL_SPLITS")

# Device
import torch
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
RANDOM_SEED = 42

print(f"\nüñ•Ô∏è Device: {DEVICE}")
if DEVICE.type == "cuda":
    print(f"‚úÖ GPU: {torch.cuda.get_device_name(0)}")

# Update globals
globals().update({
    "DATASETS": DATASETS,
    "PROJECT_DIRS": PROJECT_DIRS,
    "FORECAST_HORIZONS": FORECAST_HORIZONS,
    "MAX_HORIZON": MAX_HORIZON,
    "ASSET_UNIVERSE": ASSET_UNIVERSE,
    "STOCKS": STOCKS,
    "FX_PAIRS": FX_PAIRS,
    "TEMPORAL_SPLITS": TEMPORAL_SPLITS,
    "DEVICE": DEVICE,
    "RANDOM_SEED": RANDOM_SEED,
})

# Add DataFrames to globals
for var_name in ["features_df", "targets_df", "returns_df", "targets_all", "cleaned_prices", "log_prices"]:
    if var_name in locals():
        globals()[var_name] = locals()[var_name]
        print(f"  ‚úÖ {var_name} added to globals")

print("\n" + "=" * 80)
print("üéØ READY FOR CELLS 7-10!")
print("=" * 80)
print("\n‚úÖ DATASETS loaded with arrays")
print("‚úÖ PROJECT_DIRS set up")
print("‚úÖ All required globals available")
print("\nüìã You can now copy-paste your Cells 7, 8, 9, 10")
print("=" * 80)



‚úÖ FINAL: LOADING DATASETS
üìÇ Loading from: /workspace/data/datasets_20260116_234738

üß© Loading numpy arrays...
  ‚úÖ short_1d: 21 arrays
    X_train: (20130, 52)
    y_train_raw: (20130,)
  ‚úÖ short_3d: 21 arrays
    X_train: (20130, 52)
    y_train_raw: (20130,)
  ‚úÖ short_5d: 21 arrays
    X_train: (20130, 52)
    y_train_raw: (20130,)
  ‚úÖ intermediate_10d: 21 arrays
    X_train: (20130, 52)
    y_train_raw: (20130,)
  ‚úÖ intermediate_15d: 21 arrays
    X_train: (20130, 52)
    y_train_raw: (20130,)
  ‚úÖ intermediate_20d: 21 arrays
    X_train: (20130, 52)
    y_train_raw: (20130,)

‚úÖ DATASETS loaded: 6 horizons

üíæ Loading DataFrames...
  ‚úÖ targets_all: (2756, 360)
  ‚úÖ targets: (2756, 60)
  ‚úÖ log_prices: (2777, 10)
  ‚úÖ features: (2756, 420)
  ‚úÖ returns: (2756, 10)
  ‚úÖ cleaned_prices: (2777, 10)
‚ö†Ô∏è Using empty TEMPORAL_SPLITS

üñ•Ô∏è Device: cuda
‚úÖ GPU: NVIDIA GeForce RTX 3080 Ti
  ‚úÖ features_df added to globals
  ‚úÖ targets_df added to globals

In [11]:
# ================================
# CELL 7 (THESIS-GRADE): XGBoost Training + Finance-Grade Evaluation (GPU/CPU) 
# ================================
# Works with your existing structure from Cell 5:
# DATASETS[horizon] contains:
#   X_train, X_val, X_test (float32)
#   y_* arrays (float32/int32): y_train_raw, y_train_volnorm, y_train_crossrank, y_train_cls, y_train_dir, ...
#
# This cell:
# ‚úÖ Trains XGB regression for: y_volnorm, y_raw, y_crossrank (optional switch)
# ‚úÖ Trains XGB classification for: y_cls (-1/0/1) with class-weighted sampling
# ‚úÖ Reports finance-grade metrics:
#    - Regression: R2, RMSE, MAE, DirAcc, IC (Spearman), Sharpe proxy (non-overlap), Turnover
#    - Classification: Acc, BalancedAcc, MacroF1, ConfusionMatrix
# ‚úÖ Saves: results CSV + meta JSON + models
# ================================

print("\n" + "=" * 110)
print("üéØ CELL 7 (THESIS-GRADE): XGBoost Training + Finance-Grade Evaluation (GPU/CPU)")
print("=" * 110)

import numpy as np
import pandas as pd
import json
import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
import xgboost as xgb
from scipy import stats
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, f1_score, balanced_accuracy_score, confusion_matrix
)

# ----------------
# 0) Checks
# ----------------
required = ["DATASETS", "FORECAST_HORIZONS", "PROJECT_DIRS"]
missing = [k for k in required if k not in globals()]
if missing:
    raise NameError(f"Missing required globals: {missing}. Run Cells 1‚Äì6 first.")

tbl_dir = PROJECT_DIRS["tables"]
mdl_dir = PROJECT_DIRS.get("models", PROJECT_DIRS["tables"].parent / "models")
tbl_dir.mkdir(parents=True, exist_ok=True)
mdl_dir.mkdir(parents=True, exist_ok=True)

EPS = 1e-12

# ----------------
# 1) Device detection (robust)
# ----------------
def detect_xgb_device():
    """
    Returns (gpu_available, device_string) where device_string is 'cuda' or 'cpu'.
    Compatible with XGBoost >= 2.x device API.
    """
    try:
        # tiny fit
        m = xgb.XGBRegressor(
            tree_method="hist",
            device="cuda",
            n_estimators=5,
            random_state=42
        )
        X_tmp = np.random.randn(64, 8).astype(np.float32)
        y_tmp = np.random.randn(64).astype(np.float32)
        m.fit(X_tmp, y_tmp, verbose=False)
        return True, "cuda"
    except Exception:
        return False, "cpu"

GPU_AVAILABLE, XGB_DEVICE = detect_xgb_device()
print(f"‚úÖ XGBoost device: {XGB_DEVICE} | GPU available: {GPU_AVAILABLE}")

# ----------------
# 2) Finance-grade metric helpers
# ----------------
def directional_accuracy(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    if mask.sum() < 10:
        return np.nan
    return float((np.sign(y_true[mask]) == np.sign(y_pred[mask])).mean())

def spearman_ic(y_true, y_pred):
    """Spearman IC (information coefficient)."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    if mask.sum() < 50:
        return 0.0
    if np.std(y_pred[mask]) < EPS:
        return 0.0
    ic = stats.spearmanr(y_true[mask], y_pred[mask]).correlation
    return float(ic) if ic == ic else 0.0

def sharpe_non_overlap(y_true_ret_raw, y_pred, h_days, cost_bps=5.0):
    """
    Strategy proxy:
    - position = sign(pred)
    - pnl = position * raw_return
    - evaluate on non-overlapping blocks (every h_days)
    - apply simple turnover cost (bps)
    Returns (sharpe, turnover)
    """
    y_true_ret_raw = np.asarray(y_true_ret_raw, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mask = np.isfinite(y_true_ret_raw) & np.isfinite(y_pred)
    if mask.sum() < 50:
        return np.nan, np.nan

    y = y_true_ret_raw[mask]
    p = y_pred[mask]

    step = max(int(h_days), 1)
    idx = np.arange(0, len(y), step)
    y_s = y[idx]
    pos = np.sign(p[idx])  # -1/0/1

    pnl = pos * y_s

    if len(pos) > 1:
        turnover = np.abs(pos[1:] - pos[:-1])
        pnl[1:] -= turnover * (cost_bps / 10000.0)
        avg_turnover = float(np.mean(turnover))
    else:
        avg_turnover = 0.0

    mu = float(np.mean(pnl))
    sd = float(np.std(pnl))
    ann = np.sqrt(252 / step)
    sharpe = np.nan if sd < EPS else (mu / sd) * ann
    return float(sharpe), float(avg_turnover)

def reg_metrics(y_true, y_pred, h_days, y_true_raw_for_sharpe=None):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    if mask.sum() < 50:
        return {
            "R2": np.nan, "RMSE": np.nan, "MAE": np.nan,
            "DirAcc": np.nan, "IC": np.nan,
            "Sharpe": np.nan, "Turnover": np.nan,
            "pred_std": np.nan, "n_eval": int(mask.sum())
        }

    yt = y_true[mask]
    yp = y_pred[mask]

    rmse = float(np.sqrt(mean_squared_error(yt, yp)))
    mae  = float(mean_absolute_error(yt, yp))
    r2   = float(r2_score(yt, yp))
    da   = directional_accuracy(yt, yp)
    ic   = spearman_ic(yt, yp)

    # Sharpe proxy should use RAW return if available
    if y_true_raw_for_sharpe is None:
        y_true_raw_for_sharpe = yt

    sharpe, to = sharpe_non_overlap(y_true_raw_for_sharpe, yp, h_days)

    return {
        "R2": r2, "RMSE": rmse, "MAE": mae,
        "DirAcc": da, "IC": ic,
        "Sharpe": sharpe, "Turnover": to,
        "pred_std": float(np.std(yp)),
        "n_eval": int(len(yt))
    }

def cls_metrics(y_true_cls, y_pred_cls):
    """
    y_true_cls and y_pred_cls are in {-1,0,1}.
    """
    y_true_cls = np.asarray(y_true_cls)
    y_pred_cls = np.asarray(y_pred_cls)
    mask = np.isfinite(y_true_cls) & np.isfinite(y_pred_cls)
    if mask.sum() < 50:
        return {"acc": np.nan, "balanced_acc": np.nan, "macro_f1": np.nan, "cm": []}

    yt = y_true_cls[mask].astype(int)
    yp = y_pred_cls[mask].astype(int)

    return {
        "acc": float(accuracy_score(yt, yp)),
        "balanced_acc": float(balanced_accuracy_score(yt, yp)),
        "macro_f1": float(f1_score(yt, yp, average="macro", zero_division=0)),
        "cm": confusion_matrix(yt, yp, labels=[-1, 0, 1]).tolist()
    }

# ----------------
# 3) Training settings (thesis-grade defaults)
# ----------------
# You can switch these without breaking anything.
TRAIN_REG_TARGETS = ["y_volnorm", "y_raw", "y_crossrank"]  # keep all for a strong thesis table
TRAIN_CLASSIFICATION = True

# Regression hyperparams: robust to fat tails
REG_PARAMS = dict(
    objective="reg:pseudohubererror",
    eval_metric="rmse",
    tree_method="hist",
    device=XGB_DEVICE,
    random_state=42,
    n_estimators=5000,
    learning_rate=0.03,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_weight=8,
    reg_alpha=0.2,
    reg_lambda=2.0,
    verbosity=0
)

# Classification hyperparams (ternary)
CLS_PARAMS = dict(
    objective="multi:softprob",
    num_class=3,
    eval_metric="mlogloss",
    tree_method="hist",
    device=XGB_DEVICE,
    random_state=42,
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.85,
    colsample_bytree=0.85,
    min_child_weight=10,
    gamma=0.1,
    reg_alpha=0.3,
    reg_lambda=2.0,
    verbosity=0
)

EARLY_STOPPING_ROUNDS = 150  # used when val exists
COST_BPS_FOR_SHARPE = 5.0

# ----------------
# 4) Quick dataset sanity
# ----------------
print("\nüîç DATASETS sanity check:")
sample_hz = list(DATASETS.keys())[0]
print(f"Sample horizon: {sample_hz}")
print("Keys:", sorted([k for k in DATASETS[sample_hz].keys() if k.startswith(("X_", "y_"))])[:25], "...")

# ----------------
# 5) Training loops
# ----------------
results = []
meta_runs = []

def get_reg_keys(tgt):
    # maps to your dataset keys from Cell 5
    return f"y_train_{tgt.split('_', 1)[1]}", f"y_val_{tgt.split('_', 1)[1]}", f"y_test_{tgt.split('_', 1)[1]}"

def resolve_reg_arrays(ds, tgt_name):
    """
    tgt_name in {"y_volnorm","y_raw","y_crossrank"}
    returns (ytr, yva, yte) or (None,..) if missing.
    """
    if tgt_name == "y_volnorm":
        return ds.get("y_train_volnorm"), ds.get("y_val_volnorm"), ds.get("y_test_volnorm")
    if tgt_name == "y_raw":
        return ds.get("y_train_raw"), ds.get("y_val_raw"), ds.get("y_test_raw")
    if tgt_name == "y_crossrank":
        return ds.get("y_train_crossrank"), ds.get("y_val_crossrank"), ds.get("y_test_crossrank")
    return None, None, None

print("\n" + "=" * 110)
print("üìâ REGRESSION TRAINING (XGBoost) ‚Äî targets:", TRAIN_REG_TARGETS)
print("=" * 110)

for hz, h_days in FORECAST_HORIZONS.items():
    if hz not in DATASETS:
        print(f"‚ö†Ô∏è {hz}: not found in DATASETS, skipping.")
        continue

    ds = DATASETS[hz]
    Xtr = ds.get("X_train")
    Xva = ds.get("X_val")
    Xte = ds.get("X_test")

    # raw returns used for Sharpe proxy
    yte_raw_for_sharpe = ds.get("y_test_raw", None)

    if Xtr is None or Xte is None:
        print(f"‚ùå {hz}: Missing X_train or X_test. Skipping.")
        continue

    for tgt in TRAIN_REG_TARGETS:
        ytr, yva, yte = resolve_reg_arrays(ds, tgt)
        if ytr is None or yte is None:
            print(f"  ‚ö†Ô∏è {hz}/{tgt}: missing target arrays, skipping.")
            continue

        # Drop NaNs for training/validation
        tr_mask = np.isfinite(ytr)
        Xtr_fit = Xtr[tr_mask]
        ytr_fit = ytr[tr_mask]

        if len(ytr_fit) < 1000:
            print(f"  ‚ö†Ô∏è {hz}/{tgt}: too few finite train rows ({len(ytr_fit)}). Skipping.")
            continue

        eval_set = None
        if Xva is not None and yva is not None:
            va_mask = np.isfinite(yva)
            Xva_fit = Xva[va_mask]
            yva_fit = yva[va_mask]
            if len(yva_fit) >= 200:
                eval_set = [(Xva_fit, yva_fit)]
            else:
                eval_set = None

        print(f"\nüéØ {hz} ({h_days}d) | Regression target: {tgt}")
        print(f"   Train: {Xtr_fit.shape} | Val: {eval_set[0][0].shape if eval_set else None} | Test: {Xte.shape}")

        model = xgb.XGBRegressor(**REG_PARAMS)
        if eval_set:
            model.set_params(early_stopping_rounds=EARLY_STOPPING_ROUNDS)
            model.fit(Xtr_fit, ytr_fit, eval_set=eval_set, verbose=False)
        else:
            model.fit(Xtr_fit, ytr_fit, verbose=False)

        best_iter = int(getattr(model, "best_iteration", model.n_estimators - 1))

        # Predict on test
        try:
            pred = model.predict(Xte, iteration_range=(0, best_iter + 1))
        except Exception:
            pred = model.predict(Xte)

        mets = reg_metrics(
            y_true=yte,
            y_pred=pred,
            h_days=h_days,
            y_true_raw_for_sharpe=yte_raw_for_sharpe if yte_raw_for_sharpe is not None else yte
        )

        print(f"   ‚úÖ R2={mets['R2']:.4f} | IC={mets['IC']:.4f} | DirAcc={mets['DirAcc']:.3f} | Sharpe={mets['Sharpe']:.3f}")

        # Save model
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_path = mdl_dir / f"xgb_reg_{tgt}_{hz}_{XGB_DEVICE}_{stamp}.json"
        model.save_model(str(model_path))

        results.append({
            "task": "regression",
            "horizon": hz,
            "horizon_days": int(h_days),
            "target": tgt,
            "device": XGB_DEVICE,
            "best_iter": best_iter,
            "n_train": int(len(ytr_fit)),
            "n_test": int(len(Xte)),
            "model_path": str(model_path),
            **mets
        })

        meta_runs.append({
            "task": "regression",
            "horizon": hz,
            "target": tgt,
            "params": REG_PARAMS,
            "early_stopping_rounds": EARLY_STOPPING_ROUNDS if eval_set else None,
            "cost_bps_for_sharpe": COST_BPS_FOR_SHARPE,
            "best_iteration": best_iter,
            "model_path": str(model_path),
        })

# ----------------
# 6) Classification training (ternary y_cls)
# ----------------
if TRAIN_CLASSIFICATION:
    print("\n" + "=" * 110)
    print("üìå CLASSIFICATION TRAINING (XGBoost) ‚Äî target: y_cls (-1/0/1)")
    print("=" * 110)

    for hz, h_days in FORECAST_HORIZONS.items():
        if hz not in DATASETS:
            continue

        ds = DATASETS[hz]
        Xtr = ds.get("X_train")
        Xva = ds.get("X_val")
        Xte = ds.get("X_test")

        ytr = ds.get("y_train_cls")
        yva = ds.get("y_val_cls")
        yte = ds.get("y_test_cls")

        if Xtr is None or Xte is None or ytr is None or yte is None:
            print(f"  ‚ö†Ô∏è {hz}: missing classification arrays, skipping.")
            continue

        # Combine train+val for fitting (common in thesis baselines)
        if Xva is not None and yva is not None:
            X_fit = np.vstack([Xtr, Xva])
            y_fit = np.concatenate([ytr, yva])
        else:
            X_fit = Xtr
            y_fit = ytr

        # Map {-1,0,1} -> {0,1,2}
        y_fit = np.asarray(y_fit).astype(int)
        y_fit_m = (y_fit + 1).astype(int)
        yte = np.asarray(yte).astype(int)
        yte_m = (yte + 1).astype(int)

        # Class weights (inverse frequency)
        counts = np.bincount(y_fit_m, minlength=3).astype(float)
        inv = 1.0 / np.maximum(counts, 1.0)
        class_w = inv / inv.mean()
        sample_w = class_w[y_fit_m]

        print(f"\nüéØ {hz} ({h_days}d) | Classification target: y_cls")
        print(f"   Train: {X_fit.shape} | Test: {Xte.shape} | Class counts: {counts.astype(int).tolist()}")

        model = xgb.XGBClassifier(**CLS_PARAMS)
        model.fit(X_fit, y_fit_m, sample_weight=sample_w, verbose=False)

        # Predict -> map back to {-1,0,1}
        prob = model.predict_proba(Xte)
        pred_m = np.argmax(prob, axis=1).astype(int)
        pred = (pred_m - 1).astype(int)

        mets = cls_metrics(y_true_cls=yte, y_pred_cls=pred)

        print(f"   ‚úÖ acc={mets['acc']:.3f} | bal_acc={mets['balanced_acc']:.3f} | macro_f1={mets['macro_f1']:.3f}")

        # Save model
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_path = mdl_dir / f"xgb_cls_y_cls_{hz}_{XGB_DEVICE}_{stamp}.json"
        model.save_model(str(model_path))

        results.append({
            "task": "classification",
            "horizon": hz,
            "horizon_days": int(h_days),
            "target": "y_cls",
            "device": XGB_DEVICE,
            "best_iter": np.nan,
            "n_train": int(len(X_fit)),
            "n_test": int(len(Xte)),
            "model_path": str(model_path),
            **mets
        })

        meta_runs.append({
            "task": "classification",
            "horizon": hz,
            "target": "y_cls",
            "params": CLS_PARAMS,
            "class_counts": counts.astype(int).tolist(),
            "class_weights": class_w.tolist(),
            "model_path": str(model_path),
        })

# ----------------
# 7) Save results + meta
# ----------------
if not results:
    raise RuntimeError("‚ùå No models trained. Check DATASETS keys and contents.")

results_df = pd.DataFrame(results)

stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
out_csv  = tbl_dir / f"cell7_xgb_thesis_results_{XGB_DEVICE}_{stamp}.csv"
out_json = tbl_dir / f"cell7_xgb_thesis_meta_{XGB_DEVICE}_{stamp}.json"

results_df.to_csv(out_csv, index=False)

meta = {
    "created_at": stamp,
    "device": XGB_DEVICE,
    "gpu_available": bool(GPU_AVAILABLE),
    "horizons": {k: int(v) for k, v in FORECAST_HORIZONS.items()},
    "trained_regression_targets": TRAIN_REG_TARGETS,
    "trained_classification": bool(TRAIN_CLASSIFICATION),
    "reg_params": REG_PARAMS,
    "cls_params": CLS_PARAMS,
    "early_stopping_rounds": EARLY_STOPPING_ROUNDS,
    "cost_bps_for_sharpe": COST_BPS_FOR_SHARPE,
    "runs": meta_runs,
    "files": {
        "results_csv": str(out_csv)
    }
}
with open(out_json, "w") as f:
    json.dump(meta, f, indent=2)

# Export globals for Cells 8‚Äì10
globals().update({
    "CELL7_RESULTS_DF": results_df,
    "CELL7_RESULTS_PATH": str(out_csv),
    "CELL7_META_PATH": str(out_json),
    "XGB_MODELS_DIR": str(mdl_dir)
})

print("\n" + "=" * 110)
print("‚úÖ CELL 7 COMPLETE (THESIS-GRADE)")
print(f"üìÅ Results CSV : {out_csv}")
print(f"üìÅ Meta JSON   : {out_json}")
print(f"üìÅ Models dir  : {mdl_dir}")
print("=" * 110)

print("\nüìã QUICK SUMMARY (sorted):")
cols_show = [c for c in ["task","target","horizon","horizon_days","R2","IC","DirAcc","Sharpe","acc","balanced_acc","macro_f1","n_train","n_test"] if c in results_df.columns]
print(results_df[cols_show].sort_values(["task","target","horizon_days"]).to_string(index=False))



üéØ CELL 7 (THESIS-GRADE): XGBoost Training + Finance-Grade Evaluation (GPU/CPU)
‚úÖ XGBoost device: cuda | GPU available: True

üîç DATASETS sanity check:
Sample horizon: short_1d
Keys: ['X_test', 'X_train', 'X_val', 'y_test_cls', 'y_test_crossrank', 'y_test_dir', 'y_test_extreme', 'y_test_raw', 'y_test_volnorm', 'y_train_cls', 'y_train_crossrank', 'y_train_dir', 'y_train_extreme', 'y_train_raw', 'y_train_volnorm', 'y_val_cls', 'y_val_crossrank', 'y_val_dir', 'y_val_extreme', 'y_val_raw', 'y_val_volnorm'] ...

üìâ REGRESSION TRAINING (XGBoost) ‚Äî targets: ['y_volnorm', 'y_raw', 'y_crossrank']

üéØ short_1d (1d) | Regression target: y_volnorm
   Train: (19940, 52) | Val: (2500, 52) | Test: (4930, 52)
   ‚úÖ R2=-0.0002 | IC=0.0033 | DirAcc=0.535 | Sharpe=0.811

üéØ short_1d (1d) | Regression target: y_raw
   Train: (20130, 52) | Val: (2500, 52) | Test: (4930, 52)
   ‚úÖ R2=0.0003 | IC=0.0063 | DirAcc=0.535 | Sharpe=0.749

üéØ short_1d (1d) | Regression target: y_crossrank
   Tra

In [16]:
# ================================
# ‚úÖ FIX FOR CELL 8: build masks from TEMPORAL_SPLITS dates (no train_mask needed)
# ================================
import pandas as pd
import numpy as np

assert "features_df" in globals() and features_df is not None, "features_df missing"
assert "TEMPORAL_SPLITS" in globals() and isinstance(TEMPORAL_SPLITS, dict), "TEMPORAL_SPLITS missing"

idx = pd.to_datetime(features_df.index)

train_end = pd.to_datetime(TEMPORAL_SPLITS.get("train_end", "2022-12-31"))
val_end   = pd.to_datetime(TEMPORAL_SPLITS.get("val_end",   "2023-12-31"))

train_mask = (idx <= train_end)
val_mask   = (idx > train_end) & (idx <= val_end)
test_mask  = (idx > val_end)

# sanity
print("‚úÖ Rebuilt masks for Cell 8 (from dates):")
print(f"   Train: {train_mask.sum()} | Val: {val_mask.sum()} | Test: {test_mask.sum()}")
print(f"   idx range: {idx.min().date()} ‚Üí {idx.max().date()}")
print(f"   train_end={train_end.date()} | val_end={val_end.date()}")

# (optional) expose for later cells
TEMPORAL_SPLITS["train_mask"] = train_mask
TEMPORAL_SPLITS["val_mask"]   = val_mask
TEMPORAL_SPLITS["test_mask"]  = test_mask


‚úÖ Rebuilt masks for Cell 8 (from dates):
   Train: 2013 | Val: 250 | Test: 493
   idx range: 2015-01-05 ‚Üí 2025-12-17
   train_end=2022-12-31 | val_end=2023-12-31


In [17]:
# ================================
# CELL 8: Multi-Asset Sequence Dataset Builder (FAST + NO LEAKAGE) ‚Äî FIXED (FOREX SAFE)
# ================================
# Fixes applied:
# ‚úÖ Filters ASSET_UNIVERSE to assets that exist in BOTH features_df and targets_df (prevents KeyError like 'EURUSD=X')
# ‚úÖ Per-asset guard inside loop (extra safety)
# ‚úÖ Robust split-by-dates using pandas Index.intersection (faster + safer than set membership)
# ‚úÖ Saves scaler params (mean/std) in meta so Vast.ai can reproduce inference without pickling sklearn objects
# ================================

print("\n" + "=" * 85)
print("üß¨ CELL 8: FAST MULTI-ASSET SEQUENCE DATASET BUILDER (NO LEAKAGE) [FIXED]")
print("=" * 85)

import numpy as np
import pandas as pd
import json
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

from numpy.lib.stride_tricks import sliding_window_view

# ---------
# 0) Sanity checks
# ---------
required = ["features_df", "targets_df", "FORECAST_HORIZONS", "TEMPORAL_SPLITS",
            "PROJECT_DIRS", "ASSET_UNIVERSE", "STOCKS", "FX_PAIRS"]
missing = [x for x in required if x not in globals()]
if missing:
    raise NameError(f"Missing globals: {missing}. Run Cells 1‚Äì7 first.")

if not isinstance(features_df.columns, pd.MultiIndex):
    raise ValueError("features_df.columns must be a MultiIndex of (asset, feature).")

if not isinstance(targets_df.columns, pd.MultiIndex):
    raise ValueError("targets_df.columns must be a MultiIndex of (asset, target_name).")

# ---------
# 0.5) FIX: Filter ASSET_UNIVERSE to assets available in BOTH features_df and targets_df
# ---------
feat_assets = set(features_df.columns.get_level_values(0))
tgt_assets  = set(targets_df.columns.get_level_values(0))
available_assets = sorted(list(feat_assets.intersection(tgt_assets)))

missing_assets = [a for a in ASSET_UNIVERSE if a not in available_assets]
print("\nüîé ASSET AVAILABILITY CHECK:")
print(f"   ‚Ä¢ In features_df: {len(feat_assets)} assets")
print(f"   ‚Ä¢ In targets_df : {len(tgt_assets)} assets")
print(f"   ‚Ä¢ In BOTH       : {len(available_assets)} assets")

if missing_assets:
    print("\n‚ö†Ô∏è These assets are in ASSET_UNIVERSE but missing in features/targets and will be SKIPPED:")
    print("   ", missing_assets)

ASSET_UNIVERSE = [a for a in ASSET_UNIVERSE if a in available_assets]
print(f"\n‚úÖ Using ASSET_UNIVERSE (filtered): {len(ASSET_UNIVERSE)} assets")
print("   ", ASSET_UNIVERSE)

# ---------
# 1) Config (tune these for speed)
# ---------
LOOKBACK = 60          # 60 trading days
STRIDE = 1             # 1 = daily sliding
MIN_SEQ_PER_ASSET = 50 # skip assets with too little sequence data
SCALE = True           # fit scaler on TRAIN ONLY per horizon (no leakage)

stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

print("\n‚öôÔ∏è CONFIG:")
print(f"   ‚Ä¢ LOOKBACK: {LOOKBACK}")
print(f"   ‚Ä¢ STRIDE  : {STRIDE}")
print(f"   ‚Ä¢ SCALE   : {SCALE}")
print(f"   ‚Ä¢ Assets  : {len(ASSET_UNIVERSE)}")

# ---------
# 2) Prepare split masks / dates (strict temporal)
# ---------
idx = features_df.index
train_mask = TEMPORAL_SPLITS["train_mask"]
val_mask   = TEMPORAL_SPLITS["val_mask"]
test_mask  = TEMPORAL_SPLITS["test_mask"]

train_idx = idx[train_mask]
val_idx   = idx[val_mask]
test_idx  = idx[test_mask]

print("\nüìÖ SPLITS:")
print(f"   ‚Ä¢ Train days: {len(train_idx)}")
print(f"   ‚Ä¢ Val days  : {len(val_idx)}")
print(f"   ‚Ä¢ Test days : {len(test_idx)}")

# ---------
# 3) Helper: asset feature extraction + asset-ID features
# ---------
def get_asset_features(asset: str) -> pd.DataFrame:
    """Extract one asset's feature block: features_df columns MultiIndex (asset, feature) -> DataFrame[feature]."""
    X = features_df.loc[:, asset].copy()  # drops first level
    # Add asset-type flags (constant across rows)
    X["asset_is_stock"] = 1 if asset in STOCKS else 0
    X["asset_is_forex"] = 1 if asset in FX_PAIRS else 0
    return X

def get_asset_target(asset: str, horizon_key: str) -> pd.Series:
    """Target column format: (asset, f'y_{horizon_key}')"""
    col = (asset, f"y_{horizon_key}")
    return targets_df[col]

# ---------
# 4) Fast sequence creation (numpy)
# ---------
def make_sequences_fast(X: np.ndarray, y: np.ndarray, dates: np.ndarray, lookback: int, stride: int):
    """
    Create sequences using sliding windows:
    - X_seq shape: (n_seq, lookback, n_features)
    - y_seq shape: (n_seq, 1) where target is aligned with window end index
    - d_seq: end-date for each sequence
    """
    n = X.shape[0]
    if n <= lookback:
        return None

    # (n-lookback+1, lookback, n_features)
    Xw = sliding_window_view(X, window_shape=(lookback, X.shape[1]))[:, 0, :, :]
    yw = y[lookback - 1:]
    dw = dates[lookback - 1:]

    # stride
    Xw = Xw[::stride]
    yw = yw[::stride]
    dw = dw[::stride]

    # drop windows with NaNs (fast)
    flat = Xw.reshape(Xw.shape[0], -1)
    good = (~np.isnan(flat).any(axis=1)) & (~np.isnan(yw))
    if good.sum() == 0:
        return None

    Xw = Xw[good].astype(np.float32)
    yw = yw[good].astype(np.float32).reshape(-1, 1)
    dw = dw[good]
    return Xw, yw, dw

def split_by_dates_vectorized(X_seq, y_seq, d_seq, train_idx, val_idx, test_idx):
    d_seq = pd.to_datetime(d_seq)
    d_seq = pd.DatetimeIndex(d_seq)

    tr = d_seq.isin(train_idx)
    va = d_seq.isin(val_idx)
    te = d_seq.isin(test_idx)

    return {
        "X_train": X_seq[tr], "y_train": y_seq[tr],
        "X_val":   X_seq[va], "y_val":   y_seq[va],
        "X_test":  X_seq[te], "y_test":  y_seq[te],
        "n_train": int(tr.sum()), "n_val": int(va.sum()), "n_test": int(te.sum()),
        "d_train": d_seq[tr].astype(str).tolist(),
        "d_val":   d_seq[va].astype(str).tolist(),
        "d_test":  d_seq[te].astype(str).tolist(),
    }

# ---------
# 5) Build SEQ_DATASETS for ALL horizons
# ---------
SEQ_DATASETS = {}
stats_rows = []

print("\nüîß BUILDING SEQUENCES FOR ALL HORIZONS...")

for horizon_key, h_days in FORECAST_HORIZONS.items():
    print("\n" + "-" * 75)
    print(f"üèÅ Horizon: {horizon_key} ({h_days} days)")
    print("-" * 75)

    X_all_list, y_all_list, d_all_list, asset_id_list = [], [], [], []
    feature_dim_ref = None

    for asset_id, asset in enumerate(ASSET_UNIVERSE):

        # Extra guard (prevents KeyError even if universe changes later)
        if asset not in feat_assets:
            print(f"   ‚ö†Ô∏è {asset:10s}: not in features_df, skipping")
            continue
        tgt_col = (asset, f"y_{horizon_key}")
        if tgt_col not in targets_df.columns:
            print(f"   ‚ö†Ô∏è {asset:10s}: missing target {tgt_col}, skipping")
            continue

        X_df = get_asset_features(asset)
        y_s  = get_asset_target(asset, horizon_key)

        # Align (inner join) and ensure we keep the global index order
        aligned = X_df.join(y_s.rename("y"), how="inner")
        aligned = aligned.reindex(idx)  # enforce global timeline index
        dates = aligned.index.values

        X = aligned.drop(columns=["y"]).to_numpy(dtype=np.float32)
        y = aligned["y"].to_numpy(dtype=np.float32)

        out = make_sequences_fast(X, y, dates, LOOKBACK, STRIDE)
        if out is None:
            print(f"   ‚ö†Ô∏è {asset:10s}: no valid sequences (NaNs / too short), skipping")
            continue

        X_seq, y_seq, d_seq = out
        if len(X_seq) < MIN_SEQ_PER_ASSET:
            print(f"   ‚ö†Ô∏è {asset:10s}: only {len(X_seq)} sequences (<{MIN_SEQ_PER_ASSET}), skipping")
            continue

        if feature_dim_ref is None:
            feature_dim_ref = X_seq.shape[2]

        X_all_list.append(X_seq)
        y_all_list.append(y_seq)
        d_all_list.append(d_seq)
        asset_id_list.append(np.full((len(X_seq),), asset_id, dtype=np.int64))

        print(f"   ‚Ä¢ {asset:10s}: {len(X_seq):,} sequences")

    if not X_all_list:
        print("   ‚ùå No sequences created for this horizon.")
        continue

    X_all = np.concatenate(X_all_list, axis=0)
    y_all = np.concatenate(y_all_list, axis=0)
    d_all = np.concatenate(d_all_list, axis=0)
    asset_ids = np.concatenate(asset_id_list, axis=0)

    # Split by end-date (strict temporal)
    split = split_by_dates_vectorized(X_all, y_all, d_all, train_idx, val_idx, test_idx)
    if split["n_train"] == 0:
        print("   ‚ùå No training sequences after split ‚Äî skipping.")
        continue

    # Scale features (fit on TRAIN only) ‚Äî no leakage
    scaler = None
    scaler_params = None
    if SCALE:
        X_tr = split["X_train"]
        n_tr, L, F = X_tr.shape

        scaler = StandardScaler()
        scaler.fit(X_tr.reshape(-1, F))

        # Save params so you can recreate scaler on Vast.ai without pickling objects
        scaler_params = {
            "mean_": scaler.mean_.tolist(),
            "scale_": scaler.scale_.tolist(),
            "var_": scaler.var_.tolist() if hasattr(scaler, "var_") else None,
            "n_features": int(F),
        }

        def _scale(X):
            if X is None or len(X) == 0:
                return X
            n, L, F = X.shape
            return scaler.transform(X.reshape(-1, F)).reshape(n, L, F).astype(np.float32)

        split["X_train"] = _scale(split["X_train"])
        split["X_val"]   = _scale(split["X_val"])
        split["X_test"]  = _scale(split["X_test"])

    # Also split asset_ids by date masks (use the same d_all as used in split)
    d_all_idx = pd.DatetimeIndex(pd.to_datetime(d_all))
    tr_mask = d_all_idx.isin(train_idx)
    va_mask = d_all_idx.isin(val_idx)
    te_mask = d_all_idx.isin(test_idx)

    SEQ_DATASETS[horizon_key] = {
        "X_train": split["X_train"], "y_train": split["y_train"], "asset_ids_train": asset_ids[tr_mask],
        "X_val":   split["X_val"],   "y_val":   split["y_val"],   "asset_ids_val":   asset_ids[va_mask],
        "X_test":  split["X_test"],  "y_test":  split["y_test"],  "asset_ids_test":  asset_ids[te_mask],
        "d_train": split["d_train"], "d_val": split["d_val"], "d_test": split["d_test"],
        "lookback": int(LOOKBACK),
        "n_features": int(feature_dim_ref),
        "n_assets": int(len(ASSET_UNIVERSE)),
        "horizon_days": int(h_days),
        "scaler_params": scaler_params,
    }

    stats_rows.append({
        "horizon": horizon_key,
        "horizon_days": int(h_days),
        "n_total": int(len(X_all)),
        "n_train": split["n_train"],
        "n_val": split["n_val"],
        "n_test": split["n_test"],
        "lookback": int(LOOKBACK),
        "n_features": int(feature_dim_ref),
        "n_assets": int(len(ASSET_UNIVERSE)),
    })

    print("\n‚úÖ FINAL SHAPES:")
    print(f"   X_train: {SEQ_DATASETS[horizon_key]['X_train'].shape} | y_train: {SEQ_DATASETS[horizon_key]['y_train'].shape}")
    print(f"   X_val  : {SEQ_DATASETS[horizon_key]['X_val'].shape} | y_val  : {SEQ_DATASETS[horizon_key]['y_val'].shape}")
    print(f"   X_test : {SEQ_DATASETS[horizon_key]['X_test'].shape} | y_test : {SEQ_DATASETS[horizon_key]['y_test'].shape}")

# ---------
# 6) Save datasets (NPZ per horizon) + stats
# ---------
print("\nüíæ SAVING NPZ DATASETS...")

stats_df = pd.DataFrame(stats_rows)
stats_path = PROJECT_DIRS["tables"] / f"cell8_seq_stats_{stamp}.csv"
stats_df.to_csv(stats_path, index=False)

saved_files = {}
for horizon_key, ds in SEQ_DATASETS.items():
    out_path = PROJECT_DIRS["data"] / f"seq_{horizon_key}_{stamp}.npz"
    np.savez_compressed(
        out_path,
        X_train=ds["X_train"], y_train=ds["y_train"], asset_ids_train=ds["asset_ids_train"],
        X_val=ds["X_val"], y_val=ds["y_val"], asset_ids_val=ds["asset_ids_val"],
        X_test=ds["X_test"], y_test=ds["y_test"], asset_ids_test=ds["asset_ids_test"],
        lookback=np.array([ds["lookback"]], dtype=np.int32),
        horizon_days=np.array([ds["horizon_days"]], dtype=np.int32),
    )
    saved_files[horizon_key] = str(out_path)
    print(f"   ‚úì {horizon_key}: {out_path}")

meta_path = PROJECT_DIRS["tables"] / f"cell8_seq_meta_{stamp}.json"
meta = {
    "created_at": stamp,
    "lookback": int(LOOKBACK),
    "stride": int(STRIDE),
    "scale_train_only": bool(SCALE),
    "assets_used": ASSET_UNIVERSE,
    "assets_skipped": missing_assets,
    "horizons": {k: int(v) for k, v in FORECAST_HORIZONS.items()},
    "files_npz": saved_files,
    "stats_csv": str(stats_path),
    "scaler_params_per_horizon": {k: SEQ_DATASETS[k].get("scaler_params") for k in SEQ_DATASETS.keys()},
}
with open(meta_path, "w") as f:
    json.dump(meta, f, indent=2)

print(f"\nüìÅ Saved stats : {stats_path}")
print(f"üìÅ Saved meta  : {meta_path}")

globals().update({
    "SEQ_DATASETS": SEQ_DATASETS,
    "SEQ_STATS_DF": stats_df,
    "CELL8_META_PATH": str(meta_path),
    "CELL8_SAVED_FILES": saved_files,
    "LOOKBACK_WINDOW": int(LOOKBACK),
})

print("\n" + "=" * 85)
print("‚úÖ CELL 8 COMPLETE: Sequence datasets built + saved (NPZ)")
print("‚û°Ô∏è NEXT: CELL 9 (LSTM Training on GPU)")
print("=" * 85)



üß¨ CELL 8: FAST MULTI-ASSET SEQUENCE DATASET BUILDER (NO LEAKAGE) [FIXED]

üîé ASSET AVAILABILITY CHECK:
   ‚Ä¢ In features_df: 10 assets
   ‚Ä¢ In targets_df : 10 assets
   ‚Ä¢ In BOTH       : 10 assets

‚úÖ Using ASSET_UNIVERSE (filtered): 10 assets
    ['AAPL', 'MSFT', 'NVDA', 'AMZN', 'GOOGL', 'TSLA', 'META', 'JPM', 'KO', 'DIS']

‚öôÔ∏è CONFIG:
   ‚Ä¢ LOOKBACK: 60
   ‚Ä¢ STRIDE  : 1
   ‚Ä¢ SCALE   : True
   ‚Ä¢ Assets  : 10

üìÖ SPLITS:
   ‚Ä¢ Train days: 2013
   ‚Ä¢ Val days  : 250
   ‚Ä¢ Test days : 493

üîß BUILDING SEQUENCES FOR ALL HORIZONS...

---------------------------------------------------------------------------
üèÅ Horizon: short_1d (1 days)
---------------------------------------------------------------------------
   ‚Ä¢ AAPL      : 2,656 sequences
   ‚Ä¢ MSFT      : 2,656 sequences
   ‚Ä¢ NVDA      : 2,656 sequences
   ‚Ä¢ AMZN      : 2,656 sequences
   ‚Ä¢ GOOGL     : 2,656 sequences
   ‚Ä¢ TSLA      : 2,656 sequences
   ‚Ä¢ META      : 2,656 sequences
   ‚Ä¢

In [18]:
# ================================
# CELL 9 (UPDATED v3): LSTM TRAINING (TARGET-NORM + HUBER + CLIP + SCHEDULER)
# ================================

print("\n" + "=" * 90)
print("üß† CELL 9 (UPDATED v3): LSTM TRAINING (TARGET-NORM + HUBER + CLIP + SCHEDULER)")
print("=" * 90)

import numpy as np, pandas as pd, json, math, warnings
warnings.filterwarnings("ignore")
from datetime import datetime

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

required = ["SEQ_DATASETS", "FORECAST_HORIZONS", "PROJECT_DIRS", "ASSET_UNIVERSE"]
missing = [x for x in required if x not in globals()]
if missing:
    raise NameError(f"Missing globals: {missing}. Run Cells 1‚Äì8 first.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üñ•Ô∏è Device: {device}")

# CONFIG
BATCH_SIZE = 256 if device.type == "cuda" else 128
EPOCHS = 50
LR = 1e-3
WEIGHT_DECAY = 1e-4
PATIENCE = 8

HIDDEN = 128
LAYERS = 2
DROPOUT = 0.2
ASSET_EMB_DIM = 8

USE_AMP = (device.type == "cuda")
CLIP_NORM = 1.0
HUBER_DELTA = 1.0
TARGET_NORM = True  # train-only normalization per horizon

stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
MODEL_DIR = PROJECT_DIRS.get("models", PROJECT_DIRS["tables"].parent / "models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

print("\n‚öôÔ∏è TRAINING CONFIG:")
print(f"   ‚Ä¢ Batch: {BATCH_SIZE}")
print(f"   ‚Ä¢ Epochs: {EPOCHS}")
print(f"   ‚Ä¢ LR: {LR}")
print(f"   ‚Ä¢ Hidden: {HIDDEN} | Layers: {LAYERS} | Dropout: {DROPOUT}")
print(f"   ‚Ä¢ Asset Emb Dim: {ASSET_EMB_DIM}")
print(f"   ‚Ä¢ Target Norm (train-only): {TARGET_NORM}")
print(f"   ‚Ä¢ Loss: Huber | delta={HUBER_DELTA}")
print(f"   ‚Ä¢ Grad clip: {CLIP_NORM}")
print(f"   ‚Ä¢ Scheduler: ReduceLROnPlateau")
print(f"   ‚Ä¢ AMP: {USE_AMP}")

class SeqDataset(Dataset):
    def __init__(self, X, y, asset_ids):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.asset_ids = torch.tensor(asset_ids, dtype=torch.long)

    def __len__(self): return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.asset_ids[idx]

class LSTMRegressor(nn.Module):
    def __init__(self, n_features, n_assets, asset_emb_dim=8, hidden=128, layers=2, dropout=0.2):
        super().__init__()
        self.asset_emb = nn.Embedding(n_assets, asset_emb_dim)
        in_dim = n_features + asset_emb_dim

        self.lstm = nn.LSTM(
            input_size=in_dim,
            hidden_size=hidden,
            num_layers=layers,
            batch_first=True,
            dropout=dropout if layers > 1 else 0.0
        )
        self.head = nn.Sequential(
            nn.LayerNorm(hidden),
            nn.Linear(hidden, hidden // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden // 2, 1)
        )

    def forward(self, x, asset_id):
        emb = self.asset_emb(asset_id)                  # (B,E)
        emb = emb.unsqueeze(1).repeat(1, x.size(1), 1)  # (B,L,E)
        x = torch.cat([x, emb], dim=-1)                 # (B,L,F+E)
        out, _ = self.lstm(x)
        last = out[:, -1, :]
        return self.head(last)

def rmse_np(y_true, y_pred):
    y_true = y_true.reshape(-1)
    y_pred = y_pred.reshape(-1)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

def r2_np(y_true, y_pred):
    y_true = y_true.reshape(-1)
    y_pred = y_pred.reshape(-1)
    ss_res = float(np.sum((y_true - y_pred) ** 2))
    ss_tot = float(np.sum((y_true - np.mean(y_true)) ** 2))
    return 1.0 - ss_res / (ss_tot + 1e-12)

def dir_acc(y_true, y_pred):
    return float((np.sign(y_true) == np.sign(y_pred)).mean())

def run_epoch(model, loader, loss_fn, optimizer=None, amp_scaler=None):
    train_mode = optimizer is not None
    model.train(train_mode)

    losses = []
    y_true_all, y_pred_all = [], []

    for x, y, asset_id in loader:
        x = x.to(device)
        y = y.to(device)
        asset_id = asset_id.to(device)

        if train_mode:
            optimizer.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast(enabled=USE_AMP):
            pred = model(x, asset_id)
            loss = loss_fn(pred, y)

        if train_mode:
            if USE_AMP:
                amp_scaler.scale(loss).backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
                amp_scaler.step(optimizer)
                amp_scaler.update()
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
                optimizer.step()

        losses.append(float(loss.item()))
        y_true_all.append(y.detach().cpu().numpy())
        y_pred_all.append(pred.detach().cpu().numpy())

    return float(np.mean(losses)), np.vstack(y_true_all), np.vstack(y_pred_all)

print("\nüöÄ TRAINING LSTM PER HORIZON...\n")

results_rows = []
saved_models = {}

for horizon_key, h_days in FORECAST_HORIZONS.items():
    if horizon_key not in SEQ_DATASETS:
        print(f"‚ö†Ô∏è Skipping {horizon_key} (no sequences built).")
        continue

    ds = SEQ_DATASETS[horizon_key]
    X_train, y_train = ds["X_train"], ds["y_train"]
    X_val, y_val     = ds["X_val"], ds["y_val"]
    X_test, y_test   = ds["X_test"], ds["y_test"]

    a_train = ds["asset_ids_train"]
    a_val   = ds["asset_ids_val"]
    a_test  = ds["asset_ids_test"]

    # Target norm (train only)
    if TARGET_NORM:
        mu = float(np.mean(y_train))
        sd = float(np.std(y_train) + 1e-12)
        y_train_n = (y_train - mu) / sd
        y_val_n   = (y_val   - mu) / sd
        y_test_n  = (y_test  - mu) / sd
    else:
        mu, sd = 0.0, 1.0
        y_train_n, y_val_n, y_test_n = y_train, y_val, y_test

    n_features = X_train.shape[2]
    n_assets = int(ds["n_assets"])

    print("-" * 75)
    print(f"üìä Horizon: {horizon_key} ({h_days}d)")
    print(f"   Train: {X_train.shape} | Val: {X_val.shape} | Test: {X_test.shape}")

    train_loader = DataLoader(SeqDataset(X_train, y_train_n, a_train), batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
    val_loader   = DataLoader(SeqDataset(X_val,   y_val_n,   a_val),   batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
    test_loader  = DataLoader(SeqDataset(X_test,  y_test_n,  a_test),  batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

    model = LSTMRegressor(n_features=n_features, n_assets=n_assets, asset_emb_dim=ASSET_EMB_DIM,
                         hidden=HIDDEN, layers=LAYERS, dropout=DROPOUT).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

    # ‚úÖ FIX: ReduceLROnPlateau no verbose arg (some torch builds error on it)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=2, threshold=1e-4, min_lr=1e-6
    )

    amp_scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)
    loss_fn = nn.SmoothL1Loss(beta=HUBER_DELTA)

    best_val = float("inf")
    bad = 0
    best_path = MODEL_DIR / f"lstm_UPDATEDv3_{horizon_key}_{stamp}.pt"

    for epoch in range(1, EPOCHS + 1):
        tr_loss, _, _ = run_epoch(model, train_loader, loss_fn, optimizer=optimizer, amp_scaler=amp_scaler)
        va_loss, _, _ = run_epoch(model, val_loader, loss_fn)

        scheduler.step(va_loss)
        lr_now = optimizer.param_groups[0]["lr"]

        if va_loss < best_val - 1e-6:
            best_val = va_loss
            bad = 0
            torch.save(model.state_dict(), best_path)
        else:
            bad += 1

        print(f"   Epoch {epoch:02d} | TrainLoss {tr_loss:.6f} | ValLoss {va_loss:.6f} | LR {lr_now:.2e} | Pat {bad}/{PATIENCE}")

        if bad >= PATIENCE:
            print("   ‚èπÔ∏è Early stopping triggered.")
            break

    model.load_state_dict(torch.load(best_path, map_location=device))
    te_loss, te_y_n, te_p_n = run_epoch(model, test_loader, loss_fn)

    # De-normalize predictions for metrics on RAW scale
    te_y = (te_y_n * sd) + mu
    te_p = (te_p_n * sd) + mu

    rmse = rmse_np(te_y, te_p)
    mae = float(np.mean(np.abs(te_y - te_p)))
    r2 = r2_np(te_y, te_p)
    da = dir_acc(te_y, te_p)

    print(f"‚úÖ TEST | loss(norm) {te_loss:.6f} | RMSE {rmse:.6f} | MAE {mae:.6f} | R¬≤ {r2:.4f} | DirAcc {da:.3f}\n")

    saved_models[horizon_key] = str(best_path)
    results_rows.append({
        "horizon": horizon_key,
        "horizon_days": int(h_days),
        "test_loss_norm": float(te_loss),
        "rmse_raw": float(rmse),
        "mae_raw": float(mae),
        "r2_raw": float(r2),
        "diracc_raw": float(da),
        "target_norm_mu": float(mu),
        "target_norm_sd": float(sd),
        "model_path": str(best_path),
        "device": str(device),
    })

results_df = pd.DataFrame(results_rows).sort_values("horizon_days")
results_path = PROJECT_DIRS["tables"] / f"cell9_lstm_results_UPDATEDv3_{stamp}.csv"
results_df.to_csv(results_path, index=False)

meta_path = PROJECT_DIRS["tables"] / f"cell9_lstm_meta_UPDATEDv3_{stamp}.json"
with open(meta_path, "w") as f:
    json.dump({
        "created_at": stamp,
        "device": str(device),
        "amp": bool(USE_AMP),
        "saved_models": saved_models,
        "results_csv": str(results_path),
    }, f, indent=2)

globals().update({
    "LSTM_RESULTS_DF": results_df,
    "LSTM_MODELS": saved_models,
    "CELL9_RESULTS_PATH": str(results_path),
    "CELL9_META_PATH": str(meta_path),
})

print("\n" + "=" * 90)
print("‚úÖ CELL 9 COMPLETE (UPDATED v3)")
print("üìÅ Results:", results_path)
print("üìÅ Meta   :", meta_path)
print("=" * 90)



üß† CELL 9 (UPDATED v3): LSTM TRAINING (TARGET-NORM + HUBER + CLIP + SCHEDULER)
üñ•Ô∏è Device: cuda

‚öôÔ∏è TRAINING CONFIG:
   ‚Ä¢ Batch: 256
   ‚Ä¢ Epochs: 50
   ‚Ä¢ LR: 0.001
   ‚Ä¢ Hidden: 128 | Layers: 2 | Dropout: 0.2
   ‚Ä¢ Asset Emb Dim: 8
   ‚Ä¢ Target Norm (train-only): True
   ‚Ä¢ Loss: Huber | delta=1.0
   ‚Ä¢ Grad clip: 1.0
   ‚Ä¢ Scheduler: ReduceLROnPlateau
   ‚Ä¢ AMP: True

üöÄ TRAINING LSTM PER HORIZON...

---------------------------------------------------------------------------
üìä Horizon: short_1d (1d)
   Train: (19130, 60, 44) | Val: (2500, 60, 44) | Test: (4930, 60, 44)
   Epoch 01 | TrainLoss 0.339044 | ValLoss 0.303927 | LR 1.00e-03 | Pat 0/8
   Epoch 02 | TrainLoss 0.335934 | ValLoss 0.304687 | LR 1.00e-03 | Pat 1/8
   Epoch 03 | TrainLoss 0.335402 | ValLoss 0.304436 | LR 1.00e-03 | Pat 2/8
   Epoch 04 | TrainLoss 0.335501 | ValLoss 0.304112 | LR 5.00e-04 | Pat 3/8
   Epoch 05 | TrainLoss 0.334576 | ValLoss 0.304430 | LR 5.00e-04 | Pat 4/8
   Epoch 06 | 

In [8]:
# ================================
# CELL 10: FINAL EVALUATION + ECONOMIC BACKTEST + XAI (THESIS-GRADE)
# Compatible with: Cell 5 (DATASETS), Cell 7 (XGB models), Cell 8/9 (LSTM)
# ================================

print("\n" + "="*95)
print("üìå CELL 10: FINAL EVALUATION + ECONOMIC BACKTEST + XAI (THESIS-GRADE)")
print("="*95)

import os, glob, json, math, warnings
warnings.filterwarnings("ignore")
from datetime import datetime
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# --- Required globals ---
required = ["PROJECT_DIRS", "FORECAST_HORIZONS"]
missing = [x for x in required if x not in globals()]
if missing:
    raise NameError(f"Missing required globals: {missing}. Run Cells 1‚Äì3 first (and bootstrap on Vast).")

# Optional but expected if you trained models:
HAS_DATASETS = "DATASETS" in globals()
HAS_CELL7    = "CELL7_RESULTS_DF" in globals() or "CELL7_RESULTS_PATH" in globals()
HAS_LSTM     = "LSTM_RESULTS_DF" in globals() or "CELL9_RESULTS_PATH" in globals()
HAS_SEQ      = "SEQ_DATASETS" in globals()

print(f"‚úÖ HAS_DATASETS={HAS_DATASETS} | HAS_CELL7={HAS_CELL7} | HAS_LSTM={HAS_LSTM} | HAS_SEQ={HAS_SEQ}")

TABLES_DIR  = PROJECT_DIRS["tables"]
FIG_DIR     = PROJECT_DIRS.get("figures", TABLES_DIR.parent / "figures")
MODELS_DIR  = PROJECT_DIRS.get("models",  TABLES_DIR.parent / "models")
for p in [TABLES_DIR, FIG_DIR, MODELS_DIR]:
    p.mkdir(parents=True, exist_ok=True)

stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# --------------------------
# Helpers: metrics
# --------------------------
EPS = 1e-12

def reg_metrics(y_true, y_pred):
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)
    mae  = float(np.mean(np.abs(y_true - y_pred)))
    rmse = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))
    ss_res = float(np.sum((y_true - y_pred) ** 2))
    ss_tot = float(np.sum((y_true - np.mean(y_true)) ** 2))
    r2   = float(1.0 - ss_res / (ss_tot + EPS))
    da   = float((np.sign(y_true) == np.sign(y_pred)).mean())
    ic   = float(np.corrcoef(y_true, y_pred)[0,1]) if (np.std(y_true) > 0 and np.std(y_pred) > 0) else 0.0
    return {"R2": r2, "MAE": mae, "RMSE": rmse, "DirAcc": da, "IC": ic, "pred_std": float(np.std(y_pred))}

def cls_metrics(y_true, y_pred, labels=(-1,0,1)):
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)
    acc = float((y_true == y_pred).mean())
    # balanced acc
    bal_parts = []
    for lb in labels:
        m = (y_true == lb)
        bal_parts.append(float((y_pred[m] == lb).mean()) if m.sum() > 0 else np.nan)
    bal = float(np.nanmean(bal_parts)) if np.isfinite(np.nanmean(bal_parts)) else 0.0
    # macro F1
    f1s = []
    for lb in labels:
        tp = np.sum((y_true==lb) & (y_pred==lb))
        fp = np.sum((y_true!=lb) & (y_pred==lb))
        fn = np.sum((y_true==lb) & (y_pred!=lb))
        prec = tp / (tp + fp + EPS)
        rec  = tp / (tp + fn + EPS)
        f1 = 2*prec*rec/(prec+rec+EPS)
        f1s.append(f1)
    macro_f1 = float(np.mean(f1s))
    # confusion matrix
    lab2i = {lb:i for i,lb in enumerate(labels)}
    cm = np.zeros((len(labels), len(labels)), dtype=int)
    for yt, yp in zip(y_true, y_pred):
        if yt in lab2i and yp in lab2i:
            cm[lab2i[yt], lab2i[yp]] += 1
    return {"acc": acc, "balanced_acc": bal, "macro_f1": macro_f1, "cm": cm.tolist()}

# --------------------------
# Helper: economic backtest (rank L/S)
# --------------------------
def rank_ls_backtest(panel: pd.DataFrame, score_col: str, ret_col: str, date_col="date", q=0.2):
    """
    Long top-q assets, Short bottom-q assets each date (equal-weight).
    Requires panel with columns: date, asset, score, realized_return
    Returns: daily portfolio returns series + summary stats
    """
    df = panel[[date_col, "asset", score_col, ret_col]].dropna().copy()
    df[date_col] = pd.to_datetime(df[date_col]).dt.normalize()
    df = df.sort_values(date_col)

    rets = []
    for d, g in df.groupby(date_col):
        g = g.dropna()
        if len(g) < 5:
            continue
        g = g.sort_values(score_col)
        k = max(1, int(len(g)*q))
        short = g.head(k)[ret_col].mean()
        long  = g.tail(k)[ret_col].mean()
        rets.append((d, float(long - short)))

    if not rets:
        return None, None

    s = pd.Series(dict(rets)).sort_index()
    mu = float(s.mean())
    sd = float(s.std() + EPS)
    sharpe = float((mu / sd) * math.sqrt(252))
    cum = float((1 + s).prod() - 1)
    hit = float((s > 0).mean())
    return s, {"mean_daily": mu, "std_daily": sd, "sharpe": sharpe, "cum_return": cum, "hit_rate": hit, "n_days": int(len(s))}

# --------------------------
# Load Cell7 results if needed
# --------------------------
cell7_df = None
if "CELL7_RESULTS_DF" in globals():
    cell7_df = CELL7_RESULTS_DF.copy()
elif "CELL7_RESULTS_PATH" in globals() and os.path.exists(CELL7_RESULTS_PATH):
    cell7_df = pd.read_csv(CELL7_RESULTS_PATH)

# Load LSTM results if needed
lstm_df = None
if "LSTM_RESULTS_DF" in globals():
    lstm_df = LSTM_RESULTS_DF.copy()
elif "CELL9_RESULTS_PATH" in globals() and os.path.exists(CELL9_RESULTS_PATH):
    lstm_df = pd.read_csv(CELL9_RESULTS_PATH)

print("\nüì¶ Loaded result tables:")
print(" - cell7_df:", None if cell7_df is None else cell7_df.shape)
print(" - lstm_df :", None if lstm_df is None else lstm_df.shape)

# --------------------------
# Select "best" models per horizon
# --------------------------
best = {"xgb_reg": {}, "xgb_cls": {}, "lstm": {}}

if cell7_df is not None and len(cell7_df):
    # Regression best by highest R2 (volnorm reg)
    reg_df = cell7_df[cell7_df["task"].str.contains("regression", na=False)].copy()
    if len(reg_df):
        for hz, g in reg_df.groupby("horizon"):
            gg = g.sort_values(["R2", "DirAcc"], ascending=False).head(1)
            best["xgb_reg"][hz] = gg.iloc[0].to_dict()

    # Classification best by balanced_acc then macro_f1
    cls_df = cell7_df[cell7_df["task"].str.contains("classification", na=False)].copy()
    if len(cls_df):
        for hz, g in cls_df.groupby("horizon"):
            gg = g.sort_values(["balanced_acc", "macro_f1"], ascending=False).head(1)
            best["xgb_cls"][hz] = gg.iloc[0].to_dict()

if lstm_df is not None and len(lstm_df):
    for hz, g in lstm_df.groupby("horizon"):
        gg = g.sort_values(["r2_raw", "diracc_raw"], ascending=False).head(1)
        best["lstm"][hz] = gg.iloc[0].to_dict()

print("\nüèÜ Best-model picks (per horizon):")
print(" - XGB REG:", list(best["xgb_reg"].keys()))
print(" - XGB CLS:", list(best["xgb_cls"].keys()))
print(" - LSTM   :", list(best["lstm"].keys()))

# --------------------------
# Re-evaluate models on TEST (from DATASETS / SEQ_DATASETS)
# --------------------------
rows = []

# XGB REG re-eval (volnorm clean)
if HAS_DATASETS and len(best["xgb_reg"]) > 0:
    import xgboost as xgb
    for hz in best["xgb_reg"]:
        ds = DATASETS.get(hz, None)
        if ds is None:
            continue
        needed = ["X_test_volnorm_clean", "y_test_volnorm_clean"]
        if not all(k in ds for k in needed):
            print(f"‚ö†Ô∏è {hz}: missing volnorm_clean arrays in DATASETS. Skipping XGB reg re-eval.")
            continue

        model_path = best["xgb_reg"][hz].get("model_path", "")
        if not model_path or (not os.path.exists(model_path)):
            print(f"‚ö†Ô∏è {hz}: model file not found: {model_path}. Skipping.")
            continue

        Xte = ds["X_test_volnorm_clean"]
        yte = ds["y_test_volnorm_clean"]

        model = xgb.XGBRegressor()
        model.load_model(model_path)
        pred = model.predict(Xte)

        mets = reg_metrics(yte, pred)
        rows.append({"model_family":"XGB_REG_volnorm_clean", "horizon":hz, **mets, "model_path":model_path, "n": int(len(yte))})

# XGB CLS re-eval (ternary)
if HAS_DATASETS and len(best["xgb_cls"]) > 0:
    import xgboost as xgb
    for hz in best["xgb_cls"]:
        ds = DATASETS.get(hz, None)
        if ds is None:
            continue
        needed = ["X_test", "y_test_cls"]
        if not all(k in ds for k in needed):
            print(f"‚ö†Ô∏è {hz}: missing classification arrays in DATASETS. Skipping XGB cls re-eval.")
            continue

        model_path = best["xgb_cls"][hz].get("model_path", "")
        if not model_path or (not os.path.exists(model_path)):
            print(f"‚ö†Ô∏è {hz}: model file not found: {model_path}. Skipping.")
            continue

        Xte = ds["X_test"]
        yte = ds["y_test_cls"]

        model = xgb.XGBClassifier()
        model.load_model(model_path)
        pred_m = model.predict(Xte)
        pred = (pred_m - 1).astype(int)  # back to -1/0/1

        mets = cls_metrics(yte, pred)
        rows.append({"model_family":"XGB_CLS_ternary", "horizon":hz, **mets, "model_path":model_path, "n": int(len(yte))})

# LSTM re-eval (optional quick check from saved results only)
if len(best["lstm"]) > 0:
    for hz in best["lstm"]:
        r = best["lstm"][hz]
        rows.append({
            "model_family":"LSTM_from_cell9_results",
            "horizon": hz,
            "R2": float(r.get("r2_raw", np.nan)),
            "MAE": float(r.get("mae_raw", np.nan)),
            "RMSE": float(r.get("rmse_raw", np.nan)),
            "DirAcc": float(r.get("diracc_raw", np.nan)),
            "IC": np.nan,
            "pred_std": np.nan,
            "model_path": str(r.get("model_path", "")),
            "n": np.nan
        })

eval_df = pd.DataFrame(rows)
eval_path = TABLES_DIR / f"cell10_model_eval_{stamp}.csv"
eval_df.to_csv(eval_path, index=False)

print("\n‚úÖ Saved model evaluation table:", eval_path)
display(eval_df.head(30))

# --------------------------
# ECONOMIC BACKTEST (if panel test exists)
# --------------------------
bt_rows = []
bt_series_paths = {}

def find_panel_test(ds: dict):
    # Try common keys your Cell5 might have saved
    for k in ["panel_test", "test_panel", "df_test_panel", "panel_df_test", "test_df"]:
        if k in ds and isinstance(ds[k], pd.DataFrame):
            return ds[k]
    return None

if HAS_DATASETS:
    for hz in FORECAST_HORIZONS.keys():
        ds = DATASETS.get(hz, None)
        if ds is None:
            continue

        panel = find_panel_test(ds)
        if panel is None:
            continue

        # Must have date+asset and a realized return/target column.
        # We will try y_raw (continuous return target) as realized return proxy if present.
        cand_ret = None
        for c in ["y_raw", "y_test_raw", "y", "target", "y_future", "y_true"]:
            if c in panel.columns:
                cand_ret = c
                break
        if cand_ret is None:
            # fallback: if panel has the actual target column name
            ycols = [c for c in panel.columns if str(c).startswith("y_")]
            if ycols:
                cand_ret = ycols[0]
        if cand_ret is None:
            continue

        # Use XGB regression score if we can attach predictions
        xgb_reg_path = best["xgb_reg"].get(hz, {}).get("model_path", None)
        if xgb_reg_path and os.path.exists(xgb_reg_path) and ("X_test_volnorm_clean" in ds):
            try:
                import xgboost as xgb
                model = xgb.XGBRegressor()
                model.load_model(xgb_reg_path)

                # If you stored matching panel rows for the clean test set, prefer that.
                # Otherwise, we skip to avoid misalignment.
                if "panel_test_volnorm_clean" in ds and isinstance(ds["panel_test_volnorm_clean"], pd.DataFrame):
                    panel_bt = ds["panel_test_volnorm_clean"].copy()
                    Xte = ds["X_test_volnorm_clean"]
                    pred = model.predict(Xte)
                    panel_bt["score_xgb"] = pred
                else:
                    # no safe alignment key -> skip
                    continue

                s, summ = rank_ls_backtest(panel_bt, "score_xgb", cand_ret, date_col="date", q=0.2)
                if s is None:
                    continue
                out_ser = TABLES_DIR / f"cell10_bt_{hz}_{stamp}.csv"
                s.to_csv(out_ser, header=["ls_return"])
                bt_series_paths[hz] = str(out_ser)

                bt_rows.append({
                    "horizon": hz,
                    "model": "XGB_REG_volnorm_clean",
                    **summ,
                    "series_path": str(out_ser)
                })

                # plot
                cum = (1 + s).cumprod()
                plt.figure()
                plt.plot(cum.index, cum.values)
                plt.title(f"Cumulative L/S (top20%-bottom20%) | {hz}")
                plt.xlabel("Date")
                plt.ylabel("Cumulative wealth")
                fig_path = FIG_DIR / f"cell10_cum_ls_{hz}_{stamp}.png"
                plt.savefig(fig_path)
                plt.close()

            except Exception as e:
                print(f"‚ö†Ô∏è Backtest failed for {hz}: {repr(e)}")

bt_df = pd.DataFrame(bt_rows)
bt_path = TABLES_DIR / f"cell10_backtests_{stamp}.csv"
bt_df.to_csv(bt_path, index=False)

print("\nüìà Backtest summary (only if panel alignment existed):")
print(" - saved:", bt_path)
display(bt_df)

# --------------------------
# XAI: SHAP for XGBoost Regression (optional)
# --------------------------
shap_path = None
try:
    import shap
    HAS_SHAP = True
except Exception:
    HAS_SHAP = False

if HAS_SHAP and HAS_DATASETS and len(best["xgb_reg"]) > 0:
    import xgboost as xgb

    # pick one horizon (best by R2 in eval_df)
    cand = eval_df[eval_df["model_family"].str.contains("XGB_REG", na=False)].copy()
    if len(cand):
        pick = cand.sort_values("R2", ascending=False).iloc[0]["horizon"]
        ds = DATASETS[pick]
        model_path = best["xgb_reg"][pick]["model_path"]

        if os.path.exists(model_path) and ("X_train_volnorm_clean" in ds):
            model = xgb.XGBRegressor()
            model.load_model(model_path)

            # SHAP on a subsample for speed
            Xtr = ds["X_train_volnorm_clean"]
            n = min(5000, len(Xtr))
            idx = np.random.choice(len(Xtr), size=n, replace=False)
            Xs = Xtr[idx]

            explainer = shap.TreeExplainer(model)
            sv = explainer.shap_values(Xs)

            # feature names if provided
            feat_names = ds.get("feature_names", None)
            if feat_names is None:
                feat_names = [f"f{i}" for i in range(Xs.shape[1])]

            # mean abs shap importance
            imp = np.mean(np.abs(sv), axis=0)
            shap_df = pd.DataFrame({"feature": feat_names, "mean_abs_shap": imp}).sort_values("mean_abs_shap", ascending=False)

            shap_path = TABLES_DIR / f"cell10_shap_importance_{pick}_{stamp}.csv"
            shap_df.to_csv(shap_path, index=False)

            # plot top 30
            top = shap_df.head(30).iloc[::-1]
            plt.figure(figsize=(10, 8))
            plt.barh(top["feature"], top["mean_abs_shap"])
            plt.title(f"Top SHAP Features | XGB REG volnorm | {pick}")
            plt.xlabel("Mean |SHAP|")
            fig_path = FIG_DIR / f"cell10_shap_top30_{pick}_{stamp}.png"
            plt.savefig(fig_path)
            plt.close()

            print(f"\n‚úÖ SHAP saved: {shap_path}")
        else:
            print("\n‚ö†Ô∏è SHAP skipped: missing model or X_train_volnorm_clean.")
else:
    print("\n‚ÑπÔ∏è SHAP not installed (or no XGB reg). Skipping XAI.")

# --------------------------
# Save a compact "Cell10 meta"
# --------------------------
meta = {
    "created_at": stamp,
    "has_datasets": HAS_DATASETS,
    "has_cell7": HAS_CELL7,
    "has_lstm": HAS_LSTM,
    "best_models": best,
    "eval_csv": str(eval_path),
    "backtests_csv": str(bt_path),
    "shap_csv": (str(shap_path) if shap_path else None),
}

meta_path = TABLES_DIR / f"cell10_meta_{stamp}.json"
with open(meta_path, "w") as f:
    json.dump(meta, f, indent=2)

globals().update({
    "CELL10_EVAL_DF": eval_df,
    "CELL10_EVAL_PATH": str(eval_path),
    "CELL10_BT_DF": bt_df,
    "CELL10_BT_PATH": str(bt_path),
    "CELL10_META_PATH": str(meta_path),
})

print("\n" + "="*95)
print("‚úÖ CELL 10 COMPLETE")
print("üìÅ Eval CSV     :", eval_path)
print("üìÅ Backtest CSV :", bt_path)
print("üìÅ Meta JSON    :", meta_path)
if shap_path:
    print("üìÅ SHAP CSV     :", shap_path)
print("="*95)



üìå CELL 10: FINAL EVALUATION + ECONOMIC BACKTEST + XAI (THESIS-GRADE)
‚úÖ HAS_DATASETS=True | HAS_CELL7=True | HAS_LSTM=True | HAS_SEQ=True

üì¶ Loaded result tables:
 - cell7_df: (24, 22)
 - lstm_df : (6, 11)

üèÜ Best-model picks (per horizon):
 - XGB REG: ['intermediate_10d', 'intermediate_15d', 'intermediate_20d', 'short_1d', 'short_3d', 'short_5d']
 - XGB CLS: ['intermediate_10d', 'intermediate_15d', 'intermediate_20d', 'short_1d', 'short_3d', 'short_5d']
 - LSTM   : ['intermediate_10d', 'intermediate_15d', 'intermediate_20d', 'short_1d', 'short_3d', 'short_5d']
‚ö†Ô∏è intermediate_10d: missing volnorm_clean arrays in DATASETS. Skipping XGB reg re-eval.
‚ö†Ô∏è intermediate_15d: missing volnorm_clean arrays in DATASETS. Skipping XGB reg re-eval.
‚ö†Ô∏è intermediate_20d: missing volnorm_clean arrays in DATASETS. Skipping XGB reg re-eval.
‚ö†Ô∏è short_1d: missing volnorm_clean arrays in DATASETS. Skipping XGB reg re-eval.
‚ö†Ô∏è short_3d: missing volnorm_clean arrays in DATASETS

Unnamed: 0,model_family,horizon,acc,balanced_acc,macro_f1,cm,model_path,n,R2,MAE,RMSE,DirAcc,IC,pred_std
0,XGB_CLS_ternary,intermediate_10d,0.45071,0.329103,0.32717,"[[776, 52, 1065], [150, 15, 208], [1146, 87, 1...",/workspace/quantgenius_project/models/xgb_cls_...,4930.0,,,,,,
1,XGB_CLS_ternary,intermediate_15d,0.476065,0.345754,0.345727,"[[819, 40, 990], [122, 15, 171], [1203, 57, 15...",/workspace/quantgenius_project/models/xgb_cls_...,4930.0,,,,,,
2,XGB_CLS_ternary,intermediate_20d,0.477688,0.328943,0.32437,"[[746, 50, 1037], [117, 3, 154], [1177, 40, 16...",/workspace/quantgenius_project/models/xgb_cls_...,4930.0,,,,,,
3,XGB_CLS_ternary,short_1d,0.379108,0.375108,0.375441,"[[570, 416, 626], [443, 492, 482], [646, 448, ...",/workspace/quantgenius_project/models/xgb_cls_...,4930.0,,,,,,
4,XGB_CLS_ternary,short_3d,0.408722,0.343316,0.341923,"[[767, 174, 885], [297, 87, 378], [982, 199, 1...",/workspace/quantgenius_project/models/xgb_cls_...,4930.0,,,,,,
5,XGB_CLS_ternary,short_5d,0.446653,0.34822,0.34618,"[[825, 103, 958], [220, 39, 289], [1026, 132, ...",/workspace/quantgenius_project/models/xgb_cls_...,4930.0,,,,,,
6,LSTM_from_cell9_results,intermediate_10d,,,,,/workspace/quantgenius_project/models/lstm_UPD...,,0.005209,0.048373,0.067411,0.567343,,
7,LSTM_from_cell9_results,intermediate_15d,,,,,/workspace/quantgenius_project/models/lstm_UPD...,,-0.029577,0.060543,0.083679,0.539757,,
8,LSTM_from_cell9_results,intermediate_20d,,,,,/workspace/quantgenius_project/models/lstm_UPD...,,-0.052364,0.071317,0.096501,0.529006,,
9,LSTM_from_cell9_results,short_1d,,,,,/workspace/quantgenius_project/models/lstm_UPD...,,0.008432,0.014579,0.022432,0.530629,,



üìà Backtest summary (only if panel alignment existed):
 - saved: /workspace/quantgenius_project/tables/cell10_backtests_20260117_141023.csv



‚úÖ CELL 10 COMPLETE
üìÅ Eval CSV     : /workspace/quantgenius_project/tables/cell10_model_eval_20260117_141023.csv
üìÅ Backtest CSV : /workspace/quantgenius_project/tables/cell10_backtests_20260117_141023.csv
üìÅ Meta JSON    : /workspace/quantgenius_project/tables/cell10_meta_20260117_141023.json


In [20]:
# ==========================================================================================
# üß† CELL 11 (THESIS-GRADE): TRANSFORMER ENCODER TRAINING (MULTI-HORIZON, NO LEAKAGE, VAST SAFE)
# FIX: ReduceLROnPlateau on your torch version does NOT accept `verbose=...`
# ==========================================================================================

print("\n" + "=" * 98)
print("üß† CELL 11 (THESIS-GRADE): TRANSFORMER ENCODER TRAINING (MULTI-HORIZON, VAST SAFE)")
print("=" * 98)

import os, json, math, warnings
warnings.filterwarnings("ignore")
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ----------------------------
# 0) Guards
# ----------------------------
required = ["FORECAST_HORIZONS", "PROJECT_DIRS"]
missing = [x for x in required if x not in globals()]
if missing:
    raise NameError(f"Missing required globals: {missing}. Run Cells 1‚Äì8 first.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üñ•Ô∏è Device: {device}")

data_dir   = Path(PROJECT_DIRS["data"])
tables_dir = Path(PROJECT_DIRS["tables"])
models_dir = Path(PROJECT_DIRS.get("models", tables_dir.parent / "models"))
tables_dir.mkdir(parents=True, exist_ok=True)
models_dir.mkdir(parents=True, exist_ok=True)

# ----------------------------
# 1) Config (safe defaults for Vast GPU)
# ----------------------------
CFG = {
    "batch_size": 256,
    "epochs": 40,
    "lr": 1e-3,
    "weight_decay": 1e-4,
    "lookback": 60,
    "d_model": 128,
    "n_heads": 8,
    "n_layers": 3,
    "dropout": 0.15,
    "ff_mult": 4,
    "grad_clip": 1.0,
    "huber_delta": 1.0,
    "patience": 8,
    "use_amp": True,
    "target_norm_train_only": True,
    "num_workers": 2,
}

print("\n‚öôÔ∏è TRAINING CONFIG:")
for k, v in CFG.items():
    print(f"  ‚Ä¢ {k}: {v}")

stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# ----------------------------
# 2) Dataset
# ----------------------------
class SeqNPZDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X).float()
        y = y.reshape(-1)
        self.y = torch.from_numpy(y).float()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# ----------------------------
# 3) Model: Transformer Encoder Regressor
# ----------------------------
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512, dropout=0.0):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1).float()
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        T = x.size(1)
        x = x + self.pe[:, :T, :]
        return self.dropout(x)

class TransformerRegressor(nn.Module):
    def __init__(self, n_features, lookback, d_model=128, n_heads=8, n_layers=3, dropout=0.1, ff_mult=4):
        super().__init__()
        self.in_proj = nn.Linear(n_features, d_model)
        self.pos = PositionalEncoding(d_model, max_len=lookback + 5, dropout=dropout)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_model * ff_mult,
            dropout=dropout,
            batch_first=True,
            activation="gelu",
            norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)
        self.norm = nn.LayerNorm(d_model)

        self.head = nn.Sequential(
            nn.Linear(d_model * 2, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, 1)
        )

    def forward(self, x):
        h = self.in_proj(x)      # (B, T, D)
        h = self.pos(h)
        h = self.encoder(h)      # (B, T, D)
        h = self.norm(h)
        last = h[:, -1, :]
        mean = h.mean(dim=1)
        z = torch.cat([last, mean], dim=1)
        return self.head(z).squeeze(-1)

# ----------------------------
# 4) Metrics
# ----------------------------
def reg_metrics_np(y_true, y_pred):
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)
    m = np.isfinite(y_true) & np.isfinite(y_pred)
    if m.sum() < 10:
        return {"R2": np.nan, "RMSE": np.nan, "MAE": np.nan, "DirAcc": np.nan}
    yt, yp = y_true[m], y_pred[m]
    ss_res = np.sum((yt - yp) ** 2)
    ss_tot = np.sum((yt - yt.mean()) ** 2) + 1e-12
    r2 = 1.0 - ss_res / ss_tot
    rmse = float(np.sqrt(np.mean((yt - yp) ** 2)))
    mae = float(np.mean(np.abs(yt - yp)))
    diracc = float(np.mean(np.sign(yt) == np.sign(yp)))
    return {"R2": float(r2), "RMSE": rmse, "MAE": mae, "DirAcc": diracc}

def get_lr(opt):
    return float(opt.param_groups[0]["lr"])

# ----------------------------
# 5) Train loop
# ----------------------------
def train_one_horizon(hz):
    # Load latest seq_{hz}_*.npz
    cands = sorted(list(data_dir.glob(f"seq_{hz}_*.npz")))
    if not cands:
        print(f"‚ùå Missing NPZ for {hz}. Expected seq_{hz}_*.npz in {data_dir}")
        return None
    npz_path = cands[-1]

    blob = np.load(npz_path, allow_pickle=True)
    Xtr = blob["X_train"].astype(np.float32)
    ytr = blob["y_train"].astype(np.float32).reshape(-1)
    Xva = blob["X_val"].astype(np.float32)
    yva = blob["y_val"].astype(np.float32).reshape(-1)
    Xte = blob["X_test"].astype(np.float32)
    yte = blob["y_test"].astype(np.float32).reshape(-1)

    print("\n" + "-" * 90)
    print(f"üìä Horizon: {hz} | NPZ: {npz_path.name}")
    print(f"   Train: {Xtr.shape} | Val: {Xva.shape} | Test: {Xte.shape}")

    # Train-only target normalization
    if CFG["target_norm_train_only"]:
        mu = float(np.mean(ytr))
        sd = float(np.std(ytr))
        sd = sd if sd > 1e-8 else 1.0
        ytr_n = (ytr - mu) / sd
        yva_n = (yva - mu) / sd
        yte_n = (yte - mu) / sd
    else:
        mu, sd = 0.0, 1.0
        ytr_n, yva_n, yte_n = ytr, yva, yte

    train_ds = SeqNPZDataset(Xtr, ytr_n)
    val_ds = SeqNPZDataset(Xva, yva_n)
    test_ds = SeqNPZDataset(Xte, yte_n)

    train_loader = DataLoader(train_ds, batch_size=CFG["batch_size"], shuffle=True,
                              num_workers=CFG["num_workers"], pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=CFG["batch_size"], shuffle=False,
                            num_workers=CFG["num_workers"], pin_memory=True)
    test_loader = DataLoader(test_ds, batch_size=CFG["batch_size"], shuffle=False,
                             num_workers=CFG["num_workers"], pin_memory=True)

    n_features = Xtr.shape[-1]
    model = TransformerRegressor(
        n_features=n_features,
        lookback=CFG["lookback"],
        d_model=CFG["d_model"],
        n_heads=CFG["n_heads"],
        n_layers=CFG["n_layers"],
        dropout=CFG["dropout"],
        ff_mult=CFG["ff_mult"],
    ).to(device)

    opt = torch.optim.AdamW(model.parameters(), lr=CFG["lr"], weight_decay=CFG["weight_decay"])
    loss_fn = nn.HuberLoss(delta=CFG["huber_delta"])

    # ‚úÖ FIX: remove verbose arg (older torch versions)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2)

    scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda" and CFG["use_amp"]))

    best_val = float("inf")
    best_state = None
    patience = 0

    for ep in range(1, CFG["epochs"] + 1):
        model.train()
        tr_losses = []

        for xb, yb in train_loader:
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)

            opt.zero_grad(set_to_none=True)
            with torch.cuda.amp.autocast(enabled=(device.type == "cuda" and CFG["use_amp"])):
                pred = model(xb)
                loss = loss_fn(pred, yb)

            scaler.scale(loss).backward()
            scaler.unscale_(opt)
            nn.utils.clip_grad_norm_(model.parameters(), CFG["grad_clip"])
            scaler.step(opt)
            scaler.update()
            tr_losses.append(loss.item())

        model.eval()
        va_losses = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device, non_blocking=True)
                yb = yb.to(device, non_blocking=True)
                with torch.cuda.amp.autocast(enabled=(device.type == "cuda" and CFG["use_amp"])):
                    pred = model(xb)
                    loss = loss_fn(pred, yb)
                va_losses.append(loss.item())

        tr_loss = float(np.mean(tr_losses)) if tr_losses else np.nan
        va_loss = float(np.mean(va_losses)) if va_losses else np.nan

        scheduler.step(va_loss)
        lr_now = get_lr(opt)

        print(f"   Epoch {ep:02d} | TrainLoss {tr_loss:.6f} | ValLoss {va_loss:.6f} | LR {lr_now:.2e} | Pat {patience}/{CFG['patience']}")

        if va_loss + 1e-6 < best_val:
            best_val = va_loss
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            patience = 0
        else:
            patience += 1
            if patience >= CFG["patience"]:
                print("   ‚èπÔ∏è Early stopping triggered.")
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    # Test
    model.eval()
    preds, ys = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device, non_blocking=True)
            pred = model(xb).detach().cpu().numpy()
            preds.append(pred)
            ys.append(yb.detach().cpu().numpy())

    pred_n = np.concatenate(preds).reshape(-1)
    y_n = np.concatenate(ys).reshape(-1)

    # de-normalize
    pred = pred_n * sd + mu
    y_true = y_n * sd + mu

    mets = reg_metrics_np(y_true, pred)
    print(f"‚úÖ TEST | RMSE {mets['RMSE']:.6f} | MAE {mets['MAE']:.6f} | R¬≤ {mets['R2']:.4f} | DirAcc {mets['DirAcc']:.3f}")

    model_path = models_dir / f"transformer_reg_{hz}_{stamp}.pt"
    torch.save({
        "model_state": model.state_dict(),
        "config": CFG,
        "horizon": hz,
        "target": "y_raw",
        "target_mu": mu,
        "target_sd": sd,
        "npz_path": str(npz_path),
    }, model_path)

    return {
        "horizon": hz,
        "horizon_days": int(FORECAST_HORIZONS[hz]),
        "target": "y_raw",
        "model": "TransformerEncoder",
        "R2": mets["R2"],
        "RMSE": mets["RMSE"],
        "MAE": mets["MAE"],
        "DirAcc": mets["DirAcc"],
        "model_path": str(model_path),
        "npz_path": str(npz_path),
        "n_train": int(Xtr.shape[0]),
        "n_val": int(Xva.shape[0]),
        "n_test": int(Xte.shape[0]),
    }

# ----------------------------
# 6) Train across horizons
# ----------------------------
results = []
for hz in FORECAST_HORIZONS.keys():
    out = train_one_horizon(hz)
    if out is not None:
        results.append(out)

res_df = pd.DataFrame(results).sort_values(["horizon_days"]).reset_index(drop=True)

out_csv = tables_dir / f"cell11_transformer_results_{stamp}.csv"
out_json = tables_dir / f"cell11_transformer_meta_{stamp}.json"
res_df.to_csv(out_csv, index=False)

meta = {
    "stamp": stamp,
    "device": str(device),
    "cfg": CFG,
    "results_csv": str(out_csv),
    "models_dir": str(models_dir),
}
with open(out_json, "w") as f:
    json.dump(meta, f, indent=2)

print("\n" + "=" * 98)
print("‚úÖ CELL 11 COMPLETE (Transformer Training)")
print(f"üìÅ Results: {out_csv}")
print(f"üìÅ Meta   : {out_json}")
print(f"üìÅ Models : {models_dir}")
print("=" * 98)

print("\nüìã RESULTS SUMMARY:")
print(res_df[["horizon", "R2", "DirAcc", "RMSE", "MAE"]].to_string(index=False))

globals()["CELL11_RESULTS_DF"] = res_df
globals()["CELL11_RESULTS_PATH"] = str(out_csv)



üß† CELL 11 (THESIS-GRADE): TRANSFORMER ENCODER TRAINING (MULTI-HORIZON, VAST SAFE)
üñ•Ô∏è Device: cuda

‚öôÔ∏è TRAINING CONFIG:
  ‚Ä¢ batch_size: 256
  ‚Ä¢ epochs: 40
  ‚Ä¢ lr: 0.001
  ‚Ä¢ weight_decay: 0.0001
  ‚Ä¢ lookback: 60
  ‚Ä¢ d_model: 128
  ‚Ä¢ n_heads: 8
  ‚Ä¢ n_layers: 3
  ‚Ä¢ dropout: 0.15
  ‚Ä¢ ff_mult: 4
  ‚Ä¢ grad_clip: 1.0
  ‚Ä¢ huber_delta: 1.0
  ‚Ä¢ patience: 8
  ‚Ä¢ use_amp: True
  ‚Ä¢ target_norm_train_only: True
  ‚Ä¢ num_workers: 2

------------------------------------------------------------------------------------------
üìä Horizon: short_1d | NPZ: seq_short_1d_20260117_145029.npz
   Train: (19130, 60, 44) | Val: (2500, 60, 44) | Test: (4930, 60, 44)
   Epoch 01 | TrainLoss 0.339818 | ValLoss 0.306886 | LR 1.00e-03 | Pat 0/8
   Epoch 02 | TrainLoss 0.336100 | ValLoss 0.305590 | LR 1.00e-03 | Pat 0/8
   Epoch 03 | TrainLoss 0.334369 | ValLoss 0.306608 | LR 1.00e-03 | Pat 0/8
   Epoch 04 | TrainLoss 0.333894 | ValLoss 0.305563 | LR 1.00e-03 | Pat 1/8
   Epoch

In [22]:
# ==========================================================================================
# üìä CELL 12 (FIXED, VAST-SAFE): UNIFIED MODEL LEADERBOARD (XGB vs LSTM vs Transformer)
#   - Robust to different column names in LSTM/TRF CSVs
# ==========================================================================================

print("\n" + "=" * 98)
print("üìä CELL 12 (FIXED): UNIFIED MODEL LEADERBOARD")
print("=" * 98)

import pandas as pd, numpy as np, glob
from pathlib import Path
from datetime import datetime

required = ["PROJECT_DIRS", "FORECAST_HORIZONS"]
missing = [x for x in required if x not in globals()]
if missing:
    raise NameError(f"Missing required globals: {missing}. Run earlier cells first.")

tables_dir = Path(PROJECT_DIRS["tables"])
fig_dir = Path(PROJECT_DIRS.get("figures", tables_dir.parent / "figures"))
fig_dir.mkdir(parents=True, exist_ok=True)

def latest(pattern):
    files = sorted(glob.glob(str(tables_dir / pattern)))
    return files[-1] if files else None

# --- Locate result files
xgb_path  = latest("cell7_xgb_thesis_results_*.csv") or latest("cell7_xgb_results_*.csv")
lstm_path = latest("cell9_lstm_results_*.csv")
trf_path  = latest("cell11_transformer_results_*.csv")

print("üì¶ Found:")
print(" - XGB :", xgb_path)
print(" - LSTM:", lstm_path)
print(" - TRF :", trf_path)

# ---------------------------
# Helpers: normalize columns
# ---------------------------
def norm_col(s: str) -> str:
    """Normalize column names for matching."""
    s = str(s).strip()
    s = s.replace("¬≤", "2")
    s = s.replace(" ", "_").replace("-", "_").replace("/", "_")
    s = s.lower()
    return s

def pick_col(df: pd.DataFrame, candidates):
    """Return first matching column name from candidates (normalized match)."""
    nmap = {norm_col(c): c for c in df.columns}
    for cand in candidates:
        key = norm_col(cand)
        if key in nmap:
            return nmap[key]
    # also allow partial contains match (last resort)
    for cand in candidates:
        key = norm_col(cand)
        for k, orig in nmap.items():
            if key in k:
                return orig
    return None

def ensure_metrics(df: pd.DataFrame, source_name=""):
    """
    Ensure df has standard columns:
    horizon, horizon_days, R2, RMSE, MAE, DirAcc, model_path
    using robust mapping from whatever is present.
    """
    out = df.copy()

    # Map horizon
    h_col = pick_col(out, ["horizon", "hz"])
    if h_col is None:
        raise KeyError(f"[{source_name}] Can't find horizon column. Available: {list(out.columns)[:30]}")
    out["horizon"] = out[h_col].astype(str)

    # Map horizon_days
    hd_col = pick_col(out, ["horizon_days", "h_days", "days", "horizon_day"])
    if hd_col is not None:
        out["horizon_days"] = pd.to_numeric(out[hd_col], errors="coerce")
    else:
        # derive from FORECAST_HORIZONS
        out["horizon_days"] = out["horizon"].map(lambda x: int(FORECAST_HORIZONS.get(x, np.nan)))

    # Map regression metrics
    r2_col  = pick_col(out, ["R2", "r2", "r_2", "r2_score", "rsq", "r_sq"])
    rmse_col= pick_col(out, ["RMSE", "rmse"])
    mae_col = pick_col(out, ["MAE", "mae"])
    da_col  = pick_col(out, ["DirAcc", "diracc", "dir_acc", "directional_accuracy", "direction_acc", "sign_acc"])

    out["R2"]     = pd.to_numeric(out[r2_col],  errors="coerce") if r2_col  else np.nan
    out["RMSE"]   = pd.to_numeric(out[rmse_col],errors="coerce") if rmse_col else np.nan
    out["MAE"]    = pd.to_numeric(out[mae_col], errors="coerce") if mae_col  else np.nan
    out["DirAcc"] = pd.to_numeric(out[da_col],  errors="coerce") if da_col   else np.nan

    # Optional
    mp_col = pick_col(out, ["model_path", "path", "checkpoint", "ckpt", "model_file"])
    out["model_path"] = out[mp_col].astype(str) if mp_col else ""

    # target name if exists
    t_col = pick_col(out, ["target", "y_target", "label"])
    out["target"] = out[t_col].astype(str) if t_col else "y_raw"

    return out

# ---------------------------
# Build unified table
# ---------------------------
dfs = []

# XGB
if xgb_path:
    xgb = pd.read_csv(xgb_path)
    # Already thesis-grade from your run, but keep it robust
    if "task" not in xgb.columns:
        xgb["task"] = np.where(xgb.get("acc").notna(), "classification", "regression")
    xgb["model_family"] = np.where(xgb["task"].str.contains("class", case=False, na=False), "XGB_CLS", "XGB_REG")

    # Ensure standard cols exist (fill if missing)
    for c in ["target","horizon","horizon_days","R2","RMSE","MAE","DirAcc","IC","Sharpe","acc","balanced_acc","macro_f1","model_path"]:
        if c not in xgb.columns:
            xgb[c] = np.nan
    keep = ["model_family","task","target","horizon","horizon_days","R2","RMSE","MAE","DirAcc","IC","Sharpe",
            "acc","balanced_acc","macro_f1","model_path"]
    dfs.append(xgb[keep])

# LSTM
if lstm_path:
    lstm_raw = pd.read_csv(lstm_path)
    lstm = ensure_metrics(lstm_raw, source_name="LSTM")
    lstm["model_family"] = "LSTM"
    lstm["task"] = "regression"
    lstm["IC"] = np.nan
    lstm["Sharpe"] = np.nan
    lstm["acc"] = np.nan
    lstm["balanced_acc"] = np.nan
    lstm["macro_f1"] = np.nan
    keep = ["model_family","task","target","horizon","horizon_days","R2","RMSE","MAE","DirAcc","IC","Sharpe",
            "acc","balanced_acc","macro_f1","model_path"]
    dfs.append(lstm[keep])

# Transformer
if trf_path:
    trf_raw = pd.read_csv(trf_path)
    trf = ensure_metrics(trf_raw, source_name="Transformer")
    trf["model_family"] = "Transformer"
    trf["task"] = "regression"
    trf["IC"] = np.nan
    trf["Sharpe"] = np.nan
    trf["acc"] = np.nan
    trf["balanced_acc"] = np.nan
    trf["macro_f1"] = np.nan
    keep = ["model_family","task","target","horizon","horizon_days","R2","RMSE","MAE","DirAcc","IC","Sharpe",
            "acc","balanced_acc","macro_f1","model_path"]
    dfs.append(trf[keep])

if not dfs:
    raise RuntimeError("No result tables found. Ensure Cells 7/9/11 saved CSVs to PROJECT_DIRS['tables'].")

leader = pd.concat(dfs, ignore_index=True)

# ---------------------------
# Leaderboards
# ---------------------------
reg = leader[leader["task"].str.contains("reg", case=False, na=False)].copy()

reg["R2"] = pd.to_numeric(reg["R2"], errors="coerce")
reg["DirAcc"] = pd.to_numeric(reg["DirAcc"], errors="coerce")
reg["RMSE"] = pd.to_numeric(reg["RMSE"], errors="coerce")
reg["MAE"] = pd.to_numeric(reg["MAE"], errors="coerce")

# Best overall per horizon: prefer higher R2, then higher DirAcc
best_overall = (reg.sort_values(["horizon_days","R2","DirAcc"], ascending=[True,False,False])
                  .groupby("horizon", as_index=False)
                  .head(1))

stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
out_leader = tables_dir / f"cell12_leaderboard_FIXED_{stamp}.csv"
out_best   = tables_dir / f"cell12_best_overall_FIXED_{stamp}.csv"

leader.to_csv(out_leader, index=False)
best_overall.to_csv(out_best, index=False)

print("\n‚úÖ Saved:")
print(" - Full combined leaderboard:", out_leader)
print(" - Best model per horizon   :", out_best)

print("\nüèÜ BEST OVERALL PER HORIZON:")
print(best_overall[["horizon","model_family","target","R2","DirAcc","RMSE","MAE","model_path"]].to_string(index=False))

pivot = best_overall.pivot_table(index="horizon", values=["R2","DirAcc","RMSE","MAE"], aggfunc="first")
print("\nüìå QUICK PIVOT (best per horizon):")
print(pivot.to_string())



üìä CELL 12 (FIXED): UNIFIED MODEL LEADERBOARD
üì¶ Found:
 - XGB : /workspace/quantgenius_project/tables/cell7_xgb_thesis_results_cuda_20260117_142617.csv
 - LSTM: /workspace/quantgenius_project/tables/cell9_lstm_results_UPDATEDv3_20260117_145043.csv
 - TRF : /workspace/quantgenius_project/tables/cell11_transformer_results_20260117_153517.csv

‚úÖ Saved:
 - Full combined leaderboard: /workspace/quantgenius_project/tables/cell12_leaderboard_FIXED_20260117_160432.csv
 - Best model per horizon   : /workspace/quantgenius_project/tables/cell12_best_overall_FIXED_20260117_160432.csv

üèÜ BEST OVERALL PER HORIZON:
         horizon model_family             target       R2   DirAcc     RMSE      MAE                                                                                         model_path
        short_1d         LSTM  0.000670997891575 0.006188 0.507302 0.022458 0.014613                   /workspace/quantgenius_project/models/lstm_UPDATEDv3_short_1d_20260117_145043.pt
        short

In [24]:
# ==========================================================================================
# ‚úÖ CELL 13 (FIXED): ECONOMIC BACKTEST OF BEST MODELS (LOAD CHECKPOINTS SAFELY)
#   - Fixes LSTM/Transformer state_dict mismatch
#   - Supports checkpoints saved as:
#       (A) raw state_dict
#       (B) dict with keys: model_state, config, target_mu, target_sd, ...
# ==========================================================================================

print("\n" + "=" * 98)
print("üìà CELL 13 (FIXED): ECONOMIC BACKTEST (BEST-PER-HORIZON, CHECKPOINT-SAFE)")
print("=" * 98)

import numpy as np, pandas as pd, glob, json, warnings
warnings.filterwarnings("ignore")
from pathlib import Path
from datetime import datetime

required = ["PROJECT_DIRS", "DATASETS"]
missing = [x for x in required if x not in globals()]
if missing:
    raise NameError(f"Missing globals: {missing}. Run Cells 5‚Äì12 first.")

tables_dir = Path(PROJECT_DIRS["tables"])
data_dir   = Path(PROJECT_DIRS["data"])
models_dir = Path(PROJECT_DIRS.get("models", tables_dir.parent / "models"))
fig_dir    = Path(PROJECT_DIRS.get("figures", tables_dir.parent / "figures"))
for d in [tables_dir, data_dir, models_dir, fig_dir]:
    d.mkdir(parents=True, exist_ok=True)

def latest(pattern, base=tables_dir):
    files = sorted(glob.glob(str(base / pattern)))
    return files[-1] if files else None

best_path = latest("cell12_best_overall_FIXED_*.csv")
if not best_path:
    raise FileNotFoundError("Missing cell12_best_overall_FIXED_*.csv in tables/. Re-run Cell 12.")
best = pd.read_csv(best_path)

print("‚úÖ Loaded best-per-horizon table:", best_path)
print(best[["horizon","model_family","model_path","R2","DirAcc"]].to_string(index=False))

EPS = 1e-12
TOPK = 3
USE_TOPK_LONGSHORT = True

def annualized_sharpe(daily_rets):
    x = np.asarray(daily_rets, dtype=float)
    mu = np.nanmean(x); sd = np.nanstd(x)
    return float(np.sqrt(252) * mu / (sd + EPS))

def max_drawdown(equity_curve):
    ec = np.asarray(equity_curve, dtype=float)
    peak = np.maximum.accumulate(ec)
    dd = (ec - peak) / (peak + EPS)
    return float(np.nanmin(dd))

def cagr(equity_curve, n_days):
    ec = np.asarray(equity_curve, dtype=float)
    if len(ec) < 2: return np.nan
    total = ec[-1] / (ec[0] + EPS)
    years = n_days / 252.0
    return float(total ** (1.0 / (years + EPS)) - 1.0)

# ------------------------------------------------------------------------------------------
# ‚úÖ Robust checkpoint loaders (Torch)
# ------------------------------------------------------------------------------------------
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("üñ•Ô∏è Torch device:", device)

def load_torch_checkpoint(path):
    ckpt = torch.load(path, map_location=device)
    if isinstance(ckpt, dict) and "model_state" in ckpt:
        return ckpt["model_state"], ckpt.get("config", {}), ckpt
    if isinstance(ckpt, dict) and all(isinstance(k, str) for k in ckpt.keys()):
        # could be raw state_dict
        return ckpt, {}, {"raw_state_dict": True}
    raise ValueError(f"Unrecognized checkpoint format: {path}")

class LSTMRegressorWithAssetEmb(nn.Module):
    def __init__(self, n_features, n_assets=10, asset_emb_dim=8, hidden=128, layers=2, dropout=0.2):
        super().__init__()
        self.use_asset_emb = asset_emb_dim is not None and asset_emb_dim > 0
        if self.use_asset_emb:
            self.asset_emb = nn.Embedding(n_assets, asset_emb_dim)
            lstm_in = n_features + asset_emb_dim
        else:
            self.asset_emb = None
            lstm_in = n_features

        self.lstm = nn.LSTM(
            input_size=lstm_in,
            hidden_size=hidden,
            num_layers=layers,
            dropout=dropout if layers > 1 else 0.0,
            batch_first=True,
        )
        # match "head.*" keys used in your saved model (head.0, head.1, head.4 etc)
        self.head = nn.Sequential(
            nn.LayerNorm(hidden),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, 1),
        )

    def forward(self, x, asset_id=None):
        # x: [B,T,F]
        if self.use_asset_emb:
            if asset_id is None:
                # fallback: asset_id=0
                asset_id = torch.zeros((x.size(0),), dtype=torch.long, device=x.device)
            emb = self.asset_emb(asset_id)                    # [B, E]
            emb_rep = emb.unsqueeze(1).repeat(1, x.size(1), 1) # [B,T,E]
            x = torch.cat([x, emb_rep], dim=-1)

        out, _ = self.lstm(x)
        last = out[:, -1, :]
        return self.head(last).squeeze(-1)

class TransformerRegressor(nn.Module):
    def __init__(self, n_features, d_model=128, n_heads=8, n_layers=3, dropout=0.15, ff_mult=4):
        super().__init__()
        self.in_proj = nn.Linear(n_features, d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_model * ff_mult,
            dropout=dropout,
            batch_first=True,
            activation="gelu",
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)
        self.head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, d_model//2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model//2, 1),
        )

    def forward(self, x):
        z = self.in_proj(x)
        z = self.encoder(z)
        z = z.mean(dim=1)
        return self.head(z).squeeze(-1)

def infer_seq_npz(hz):
    files = sorted(glob.glob(str(data_dir / f"seq_{hz}_*.npz")))
    if not files:
        raise FileNotFoundError(f"Missing seq_{hz}_*.npz in {data_dir}. Run Cell 8.")
    return files[-1]

def build_panel_from_npz(npz):
    X = npz["X_test"]
    y = npz["y_test"].reshape(-1)
    # Optional arrays
    asset = npz["asset_test"] if "asset_test" in npz.files else None
    date  = npz["date_test"]  if "date_test"  in npz.files else None
    return X, y, asset, date

# ------------------------------------------------------------------------------------------
# ‚úÖ Backtest core (TopK long/short)
# ------------------------------------------------------------------------------------------
def backtest_from_preds(preds, rets, asset=None, date=None, n_assets_fallback=10):
    preds = np.asarray(preds, float).reshape(-1)
    rets  = np.asarray(rets, float).reshape(-1)
    n = len(preds)

    df = pd.DataFrame({"pred": preds, "ret": rets})

    if asset is None or date is None:
        df["asset"] = np.tile(np.arange(n_assets_fallback), int(np.ceil(n / n_assets_fallback)))[:n]
        df["date"]  = np.repeat(np.arange(int(np.ceil(n / n_assets_fallback))), n_assets_fallback)[:n]
    else:
        df["asset"] = asset
        df["date"]  = date
        try:
            df["date"] = pd.to_datetime(df["date"])
        except Exception:
            pass

    daily = []
    for d, g in df.groupby("date"):
        g = g.dropna(subset=["pred","ret"])
        if len(g) < 2 * TOPK:
            continue
        g = g.sort_values("pred")
        short = g.head(TOPK)
        long  = g.tail(TOPK)
        # equal-weight long/short
        port_ret = float(long["ret"].mean() - short["ret"].mean())
        hit = float(long["ret"].mean() > short["ret"].mean())
        daily.append((d, port_ret, hit))

    if not daily:
        return None, None

    daily_df = pd.DataFrame(daily, columns=["date","port_ret","hit"]).sort_values("date").reset_index(drop=True)
    equity = (1.0 + daily_df["port_ret"].fillna(0.0)).cumprod().values

    out = {
        "n_days": int(len(daily_df)),
        "avg_daily_ret": float(daily_df["port_ret"].mean()),
        "vol_daily": float(daily_df["port_ret"].std()),
        "sharpe": annualized_sharpe(daily_df["port_ret"].values),
        "max_dd": max_drawdown(equity),
        "cagr": cagr(equity, len(daily_df)),
        "hit_rate": float(daily_df["hit"].mean()),
    }
    return out, daily_df

# ------------------------------------------------------------------------------------------
# ‚úÖ Model prediction helpers
# ------------------------------------------------------------------------------------------
import xgboost as xgb

def predict_xgb(model_path, hz):
    ds = DATASETS[hz]
    Xte = ds["X_test"]
    # For realized returns we prefer y_test_raw; fallback y_test_volnorm
    y_real = ds.get("y_test_raw", None)
    if y_real is None:
        y_real = ds.get("y_test_volnorm", None)
    m = xgb.XGBRegressor()
    m.load_model(model_path)
    preds = m.predict(Xte)
    return preds, y_real, ds.get("asset_test", None), ds.get("date_test", None)

def predict_lstm(model_path, hz):
    npz_path = infer_seq_npz(hz)
    npz = np.load(npz_path)
    X, y_real, asset, date = build_panel_from_npz(npz)
    n_features = X.shape[-1]

    state, cfg, full = load_torch_checkpoint(model_path)

    # Detect asset embedding from keys
    has_asset_emb = any(k.startswith("asset_emb.") for k in state.keys())
    # Detect hidden/layers from state shapes (robust)
    hidden = state["lstm.weight_hh_l0"].shape[1]
    layers = len([k for k in state.keys() if k.startswith("lstm.weight_ih_l")])

    # If asset_emb exists, infer emb dim + n_assets
    asset_emb_dim = None
    n_assets = 10
    if has_asset_emb:
        w = state["asset_emb.weight"]
        n_assets = w.shape[0]
        asset_emb_dim = w.shape[1]

    model = LSTMRegressorWithAssetEmb(
        n_features=n_features,
        n_assets=n_assets,
        asset_emb_dim=asset_emb_dim if has_asset_emb else 0,
        hidden=hidden,
        layers=layers,
        dropout=float(cfg.get("dropout", 0.2)) if isinstance(cfg, dict) else 0.2
    ).to(device)

    model.load_state_dict(state, strict=True)
    model.eval()

    with torch.no_grad():
        xb = torch.tensor(X, dtype=torch.float32, device=device)
        # asset id optional
        if has_asset_emb:
            if asset is None:
                # fallback all zeros
                aid = torch.zeros((xb.size(0),), dtype=torch.long, device=device)
            else:
                # ensure integer ids (0..n_assets-1). if strings, factorize.
                if isinstance(asset[0], (str, bytes)):
                    codes, _ = pd.factorize(asset)
                    asset_ids = codes
                else:
                    asset_ids = asset.astype(int)
                aid = torch.tensor(asset_ids, dtype=torch.long, device=device)
            preds = model(xb, aid).detach().cpu().numpy()
        else:
            preds = model(xb, None).detach().cpu().numpy()

    return preds, y_real, asset, date

def predict_transformer(model_path, hz):
    npz_path = infer_seq_npz(hz)
    npz = np.load(npz_path)
    X, y_real, asset, date = build_panel_from_npz(npz)
    n_features = X.shape[-1]

    state, cfg, full = load_torch_checkpoint(model_path)

    # If checkpoint wrapper: cfg likely contains transformer params
    d_model = int(cfg.get("d_model", 128)) if isinstance(cfg, dict) else 128
    n_heads = int(cfg.get("n_heads", 8))  if isinstance(cfg, dict) else 8
    n_layers= int(cfg.get("n_layers", 3)) if isinstance(cfg, dict) else 3
    dropout = float(cfg.get("dropout", 0.15)) if isinstance(cfg, dict) else 0.15
    ff_mult = int(cfg.get("ff_mult", 4)) if isinstance(cfg, dict) else 4

    model = TransformerRegressor(
        n_features=n_features,
        d_model=d_model,
        n_heads=n_heads,
        n_layers=n_layers,
        dropout=dropout,
        ff_mult=ff_mult
    ).to(device)

    model.load_state_dict(state, strict=True)
    model.eval()

    with torch.no_grad():
        xb = torch.tensor(X, dtype=torch.float32, device=device)
        preds = model(xb).detach().cpu().numpy()

    return preds, y_real, asset, date

# ------------------------------------------------------------------------------------------
# ‚úÖ Run backtests
# ------------------------------------------------------------------------------------------
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
bt_rows = []
daily_files = []

for _, row in best.iterrows():
    hz  = str(row["horizon"])
    fam = str(row["model_family"])
    mp  = str(row["model_path"])

    print("\n" + "-" * 90)
    print(f"üèÅ Backtesting {hz} | {fam}")

    try:
        if fam.startswith("XGB"):
            preds, y_real, asset, date = predict_xgb(mp, hz)
        elif fam == "LSTM":
            preds, y_real, asset, date = predict_lstm(mp, hz)
        elif fam == "Transformer":
            preds, y_real, asset, date = predict_transformer(mp, hz)
        else:
            print(f"‚ö†Ô∏è Unknown family: {fam}. Skipping.")
            continue

        out, daily_df = backtest_from_preds(preds, y_real, asset=asset, date=date, n_assets_fallback=10)
        if out is None:
            print(f"‚ö†Ô∏è {hz}: not enough daily groups to backtest.")
            continue

        out_row = {
            "horizon": hz,
            "model_family": fam,
            "model_path": mp,
            **out
        }
        bt_rows.append(out_row)

        out_daily = tables_dir / f"cell13_FIXED_daily_{hz}_{fam}_{stamp}.csv"
        daily_df.to_csv(out_daily, index=False)
        daily_files.append(str(out_daily))

        print(f"‚úÖ {hz}: Sharpe={out['sharpe']:.2f} | CAGR={out['cagr']:.2%} | MaxDD={out['max_dd']:.2%} | Hit={out['hit_rate']:.2f}")
        print(f"   Saved daily: {out_daily}")

    except Exception as e:
        print(f"‚ùå {hz} failed: {e}")

bt_df = pd.DataFrame(bt_rows)
out_bt = tables_dir / f"cell13_FIXED_backtest_summary_{stamp}.csv"
bt_df.to_csv(out_bt, index=False)

meta = {
    "best_overall_csv": str(best_path),
    "daily_files": daily_files,
    "topk": TOPK,
    "timestamp": stamp,
}
out_meta = tables_dir / f"cell13_FIXED_backtest_meta_{stamp}.json"
with open(out_meta, "w") as f:
    json.dump(meta, f, indent=2)

print("\n" + "=" * 98)
print("‚úÖ CELL 13 (FIXED) COMPLETE")
print("üìÅ Backtest summary:", out_bt)
print("üìÅ Meta JSON       :", out_meta)
if len(bt_df):
    print("\nüìã BACKTEST SUMMARY:")
    print(bt_df.sort_values("sharpe", ascending=False).to_string(index=False))
print("=" * 98)

globals().update({
    "CELL13_BACKTEST_DF": bt_df,
    "CELL13_BACKTEST_PATH": str(out_bt),
    "CELL13_BACKTEST_META": str(out_meta),
})



üìà CELL 13 (FIXED): ECONOMIC BACKTEST (BEST-PER-HORIZON, CHECKPOINT-SAFE)
‚úÖ Loaded best-per-horizon table: /workspace/quantgenius_project/tables/cell12_best_overall_FIXED_20260117_160432.csv
         horizon model_family                                                                                         model_path       R2   DirAcc
        short_1d         LSTM                   /workspace/quantgenius_project/models/lstm_UPDATEDv3_short_1d_20260117_145043.pt 0.006188 0.507302
        short_3d         LSTM                   /workspace/quantgenius_project/models/lstm_UPDATEDv3_short_3d_20260117_145043.pt 0.009567 0.554361
        short_5d  Transformer                  /workspace/quantgenius_project/models/transformer_reg_short_5d_20260117_153517.pt 0.002056 0.563895
intermediate_10d      XGB_REG /workspace/quantgenius_project/models/xgb_reg_y_volnorm_intermediate_10d_cuda_20260117_142507.json 0.001505 0.578702
intermediate_15d  Transformer          /workspace/quantgenius_project