In [3]:
import os, json
from pathlib import Path
import numpy as np
import pandas as pd

# =======================
# Config
# =======================
# Raw OHLCV input CSV for each asset symbol processed by this cell.
ASSETS = dict(
    BTC="data/btc_1d_data_2018_to_2025.csv",
    ETH="data/ETH_cleaned.csv",
    TSLA="data/TSLA_cleaned.csv",
    AAPL="data/AAPL_cleaned.csv",
)

# Directory that receives the generated feature files; created if missing.
OUT_DIR = Path("data")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# =======================
# Utils (indicators w/o leakage)
# =======================
def rsi(series: pd.Series, window: int = 14):
    """Relative Strength Index based on simple rolling means of gains/losses.

    Args:
        series: Price series (typically the close).
        window: Number of price changes averaged; the first ``window`` values
            of the result are NaN because a full window is required.

    Returns:
        Series aligned with ``series`` holding RSI values in [0, 100].
        A window containing only gains yields exactly 100; a completely flat
        window (no gains and no losses) stays NaN because RSI is undefined.
    """
    delta = series.diff()
    gain = delta.clip(lower=0).rolling(window, min_periods=window).mean()
    loss = (-delta.clip(upper=0)).rolling(window, min_periods=window).mean()

    # Zero loss is replaced by NaN so the division never produces inf.
    rs = gain / loss.replace(0, np.nan)
    out = 100 - (100 / (1 + rs))

    # Without this, a window with gains but no losses would come out as NaN
    # (and the row would later be dropped) instead of the saturated value 100.
    only_gains = (loss == 0) & (gain > 0)
    return out.mask(only_gains, 100.0)

def ema(series: pd.Series, span: int):
    """Exponential moving average of ``series`` using the recursive form.

    ``adjust=False`` means each value is ``alpha * x + (1 - alpha) * prev``
    with ``alpha = 2 / (span + 1)``, seeded with the first observation.
    """
    smoother = series.ewm(span=span, adjust=False)
    return smoother.mean()

def macd(close: pd.Series, fast=12, slow=26, signal=9):
    """Compute MACD components from a close-price series.

    Returns:
        Tuple ``(macd_line, signal_line, histogram)`` where the MACD line is
        the fast EMA minus the slow EMA, the signal line is an EMA of the
        MACD line, and the histogram is their difference.
    """
    fast_ema = ema(close, fast)
    slow_ema = ema(close, slow)
    line = fast_ema - slow_ema
    trigger = ema(line, signal)
    histogram = line - trigger
    return line, trigger, histogram

def bollinger(close: pd.Series, window=20, n_std=2.0):
    """Bollinger bands over a full rolling window.

    Returns:
        Tuple ``(ma, upper, lower, width, percent_b)``:
        rolling mean, mean +/- ``n_std`` rolling standard deviations, band
        width relative to the mean (NaN where the mean is 0), and the
        position of ``close`` inside the band (0 = lower, 1 = upper).
    """
    roll = close.rolling(window, min_periods=window)
    mid = roll.mean()
    spread = roll.std()

    band_hi = mid + n_std * spread
    band_lo = mid - n_std * spread
    band_range = band_hi - band_lo

    # A zero mean is mapped to NaN so the relative width never divides by 0.
    rel_width = band_range / (mid.replace(0, np.nan))
    percent_b = (close - band_lo) / band_range
    return mid, band_hi, band_lo, rel_width, percent_b

def true_range(high, low, close):
    """Row-wise true range: the largest of high-low, |high-prev close| and
    |low-prev close|.

    The first row has no previous close; the NaN candidates are skipped by
    the row-wise max, so it falls back to ``high - low``.
    """
    last_close = close.shift(1)
    candidates = pd.concat(
        [
            high - low,
            (high - last_close).abs(),
            (low - last_close).abs(),
        ],
        axis=1,
        keys=["range", "gap_up", "gap_down"],
    )
    return candidates.max(axis=1)

def atr(high, low, close, window=14):
    """Average true range: simple rolling mean of the true range.

    Values are NaN until ``window`` true-range observations are available.
    """
    tr = true_range(high, low, close)
    return tr.rolling(window, min_periods=window).mean()

def cyclical_time_features(dt_index: pd.Series):
    """Encode weekday and month of a datetime Series as sine/cosine pairs.

    Args:
        dt_index: Series of datetimes (accessed through ``.dt``).

    Returns:
        DataFrame with columns ``dow_sin``, ``dow_cos``, ``mon_sin``,
        ``mon_cos`` sharing the index of ``dt_index``.
    """
    two_pi = 2 * np.pi
    # Weekday is 0..6 and month is shifted to 0..11 so both start at angle 0.
    dow_angle = two_pi * dt_index.dt.weekday / 7
    mon_angle = two_pi * (dt_index.dt.month - 1) / 12

    encoded = {
        "dow_sin": np.sin(dow_angle),
        "dow_cos": np.cos(dow_angle),
        "mon_sin": np.sin(mon_angle),
        "mon_cos": np.cos(mon_angle),
    }
    return pd.DataFrame(encoded, index=dt_index.index)

# =======================
# IO helpers
# =======================
# Maps source CSV headers to the snake_case column names used by the pipeline.
RENAME_MAP = {
    "Open time": "open_time",
    "Open": "open",
    "High": "high",
    "Low": "low",
    "Close": "close",
    "Volume": "volume",
    "Close time": "close_time",
    "Quote asset volume": "quote_asset_volume",
    "Number of trades": "num_trades",
    "Taker buy base asset volume": "taker_buy_base",
    "Taker buy quote asset volume": "taker_buy_quote",
    "Ignore": "ignore",
    "Symbol": "symbol",
}

def load_and_standardize(path: str) -> pd.DataFrame:
    """Read a raw price CSV and normalise it for the feature pipeline.

    Columns are renamed via ``RENAME_MAP``, timestamp columns are parsed as
    UTC datetimes, and rows are sorted by ``close_time`` with duplicate
    timestamps removed (first occurrence kept).

    Args:
        path: Location of the CSV file.

    Returns:
        DataFrame with at least ``open``, ``high``, ``low``, ``close``,
        ``volume`` and ``close_time``, indexed 0..n-1 in time order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If a required OHLCV column or every time column is missing.
    """
    if not Path(path).exists():
        raise FileNotFoundError(f"Файл не найден: {path} (cwd={Path.cwd()})")
    df = pd.read_csv(path).rename(columns=RENAME_MAP)

    # Require at least OHLCV plus one time column.
    required = ["open", "high", "low", "close", "volume"]
    miss = [c for c in required if c not in df.columns]
    if miss:
        raise ValueError(f"{path}: отсутствуют обязательные столбцы: {miss}")

    # Time: prefer close_time, otherwise fall back to open_time.
    if "close_time" not in df.columns:
        if "open_time" not in df.columns:
            raise ValueError(f"{path}: нужен хотя бы один столбец времени: 'close_time' или 'open_time'")
        df["close_time"] = df["open_time"]

    for c in ("open_time", "close_time"):
        if c in df.columns:
            df[c] = pd.to_datetime(df[c], utc=True, errors="coerce")

    # Unparseable timestamps become NaT. If kept, they sort to the end and
    # would be used as the "next" bar when targets are built via shift(-1),
    # so they are removed here before ordering.
    bad_time = df["close_time"].isna()
    n_bad = int(bad_time.sum())
    if n_bad:
        print(f"[warn] {path}: удалено строк с некорректным close_time: {n_bad}")
        df = df.loc[~bad_time]

    df = df.sort_values("close_time").drop_duplicates(subset=["close_time"]).reset_index(drop=True)
    return df

def safe_log1p(s: pd.Series):
    """Return ``log(1 + x)`` of ``s`` after casting it to float."""
    as_float = s.astype(float)
    return np.log1p(as_float)

# =======================
# Core feature pipeline (per asset)
# =======================
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive T+1 targets and technical features for one asset.

    Args:
        df: Standardised price frame with ``open``, ``high``, ``low``,
            ``close``, ``volume`` and a datetime ``close_time`` column.
            Optional ``num_trades``, ``taker_buy_base`` and
            ``taker_buy_quote`` columns add extra features when present.
            The input frame is not modified.

    Returns:
        A new DataFrame with target and feature columns added, restricted to
        rows that have no NaN in any column (this removes the rolling
        warm-up rows and the final row, which has no next close).
    """
    # Work on a copy so the caller's frame is not polluted with new columns.
    df = df.copy()

    # Targets (T+1 = the next observed row).
    df["next_close"] = df["close"].shift(-1)
    df["next_return"] = (df["next_close"] - df["close"]) / df["close"]
    df["y_class"] = (df["next_return"] > 0).astype(int)

    # Base returns / lags.
    df["ret_1d"] = df["close"].pct_change()
    for k in [2, 3, 5, 10]:
        df[f"ret_{k}d"] = df["close"].pct_change(k)
    for lag in [1, 2, 3, 5, 10]:
        df[f"close_lag_{lag}"] = df["close"].shift(lag)

    # Rolling stats.
    for win in [5, 10, 20]:
        df[f"sma_{win}"] = df["close"].rolling(win, min_periods=win).mean()
        df[f"std_{win}"] = df["close"].rolling(win, min_periods=win).std()
        df[f"ret_std_{win}"] = df["ret_1d"].rolling(win, min_periods=win).std()

    # Trend / momentum.
    df["ema_12"] = ema(df["close"], 12)
    df["ema_26"] = ema(df["close"], 26)
    df["rsi_14"] = rsi(df["close"], 14)
    macd_line, signal_line, macd_hist = macd(df["close"], 12, 26, 9)
    df["macd_line"], df["macd_signal"], df["macd_hist"] = macd_line, signal_line, macd_hist

    # Bands / volatility.
    bb_ma, _, _, bb_w, bb_pb = bollinger(df["close"], 20, 2.0)
    df["bb_ma_20"], df["bb_width_20"], df["bb_percent_b_20"] = bb_ma, bb_w, bb_pb
    df["hl2"] = (df["high"] + df["low"]) / 2.0
    df["hl_spread"] = (df["high"] - df["low"]) / (df["close"] + 1e-12)
    df["atr_14"] = atr(df["high"], df["low"], df["close"], 14)

    # Volume transforms (volume is a required column).
    df["volume_log"] = safe_log1p(df["volume"])
    for win in [5, 20]:
        vol_mean = df["volume"].rolling(win, min_periods=win).mean()
        vol_std = df["volume"].rolling(win, min_periods=win).std()
        df[f"vol_sma_{win}"] = vol_mean
        # Small epsilon keeps the z-score finite when the window is constant.
        df[f"volume_z_{win}"] = (df["volume"] - vol_mean) / (vol_std + 1e-12)

    # Optional microstructure columns (only when the source provides them).
    if "num_trades" in df.columns:
        df["num_trades_log"] = safe_log1p(df["num_trades"])
        for win in [5, 20]:
            tr_mean = df["num_trades"].rolling(win, min_periods=win).mean()
            tr_std = df["num_trades"].rolling(win, min_periods=win).std()
            df[f"trades_sma_{win}"] = tr_mean
            df[f"trades_z_{win}"] = (df["num_trades"] - tr_mean) / (tr_std + 1e-12)
    if "taker_buy_base" in df.columns:
        df["taker_buy_base_log"] = safe_log1p(df["taker_buy_base"])
    if "taker_buy_quote" in df.columns:
        df["taker_buy_quote_log"] = safe_log1p(df["taker_buy_quote"])

    # Time features.
    df = pd.concat([df, cyclical_time_features(df["close_time"])], axis=1)

    # Drop NaNs: rolling warm-up rows and the final row without next_close.
    df_feat = df.dropna().copy()
    return df_feat

def select_feature_columns(df_feat: pd.DataFrame):
    """Split the columns of a feature frame into model inputs and targets.

    Returns:
        ``(feature_cols, target_cols)``. Features are all columns, in frame
        order, that are neither identifiers/timestamps/raw close/targets nor
        of object dtype. Targets are always ``["y_class", "next_return"]``.
    """
    target_cols = ["y_class", "next_return"]
    non_features = {
        "open_time", "close_time", "next_close", "ignore", "close", "symbol",
        *target_cols,
    }

    feature_cols = []
    for col in df_feat.columns:
        # Name check first so excluded columns are never inspected for dtype.
        if col not in non_features and df_feat[col].dtype != "O":
            feature_cols.append(col)
    return feature_cols, target_cols

def process_asset(symbol: str, csv_path: str):
    """Run the full feature pipeline for one asset and persist the result.

    Loads and standardises the CSV, builds features, writes
    ``<symbol>_features.parquet`` (best effort) and ``<symbol>_features.csv``
    into ``OUT_DIR``, prints a JSON summary and a five-row preview.

    Args:
        symbol: Asset ticker used in file names and the report.
        csv_path: Path to the raw price CSV.

    Returns:
        Report dict with row counts, date range, feature count, target
        columns and output file paths. ``files["parquet"]`` is ``None`` when
        the parquet file could not be written.

    Raises:
        ValueError: If no rows survive feature construction, or if NaNs
            remain in feature/target columns.
    """
    df = load_and_standardize(csv_path)
    df_feat = build_features(df)

    # Fail early with a clear message instead of writing empty files and then
    # hitting an IndexError while building the date range below.
    if df_feat.empty:
        raise ValueError(f"{symbol}: после dropna не осталось строк")

    feature_cols, target_cols = select_feature_columns(df_feat)

    # NaN control: explicit raises, because asserts are stripped under -O.
    if df_feat[feature_cols].isna().sum().sum() != 0:
        raise ValueError("NaN в фичах")
    if df_feat[target_cols].isna().sum().sum() != 0:
        raise ValueError("NaN в таргетах")

    out_parquet = OUT_DIR / f"{symbol}_features.parquet"
    out_csv = OUT_DIR / f"{symbol}_features.csv"
    cols_out = ["close_time", "close"] + feature_cols + target_cols
    cols_out = list(dict.fromkeys(cols_out))  # guard against duplicate names
    df_out = df_feat[cols_out].copy()

    # Parquet is best effort (an engine may be missing); CSV is always written.
    parquet_written = True
    try:
        df_out.to_parquet(out_parquet, index=False)
    except Exception as e:
        parquet_written = False
        print(f"[warn] {symbol}: Parquet не записан ({e}). Продолжаем с CSV.")
    df_out.to_csv(out_csv, index=False)

    report = {
        "symbol": symbol,
        "rows_total_raw": int(len(df)),
        "rows_after_dropna": int(len(df_out)),
        "date_range": [str(df_out["close_time"].iloc[0]), str(df_out["close_time"].iloc[-1])],
        "n_features": len(feature_cols),
        "target_columns": target_cols,
        # Only advertise the parquet file when it actually exists.
        "files": {"parquet": str(out_parquet) if parquet_written else None, "csv": str(out_csv)}
    }
    print(f"=== {symbol} T+1 FEATURE PIPELINE — SUMMARY ===")
    print(json.dumps(report, indent=2, ensure_ascii=False))

    # Tail preview: rich display inside IPython, plain text otherwise.
    preview = df_out.tail(5)[["close_time", "close", "y_class", "next_return"] + feature_cols[:8]]
    try:
        from IPython.display import display
        display(preview)
    except Exception:
        print(preview.to_string(index=False))

    return report

# =======================
# Run for all assets
# =======================
# Process every configured asset. A failure in one asset is printed and does
# not prevent the remaining assets from being processed.
all_reports = {}
for sym, path in ASSETS.items():
    try:
        report = process_asset(sym, path)
    except Exception as e:
        print(f"[error] {sym}: {e}")
    else:
        all_reports[sym] = report


# Persist the per-asset reports as a single JSON dashboard file.
dashboard_path = OUT_DIR / "features_dashboard.json"
with open(dashboard_path, "w", encoding="utf-8") as f:
    json.dump(all_reports, f, ensure_ascii=False, indent=2)
print("Saved dashboard:", dashboard_path)


=== BTC T+1 FEATURE PIPELINE — SUMMARY ===
{
  "symbol": "BTC",
  "rows_total_raw": 2846,
  "rows_after_dropna": 2825,
  "date_range": [
    "2018-01-21 23:59:59.999000+00:00",
    "2025-10-15 23:59:59.999000+00:00"
  ],
  "n_features": 55,
  "target_columns": [
    "y_class",
    "next_return"
  ],
  "files": {
    "parquet": "data\\BTC_features.parquet",
    "csv": "data\\BTC_features.csv"
  }
}


Unnamed: 0,close_time,close,y_class,next_return,open,high,low,volume,quote_asset_volume,num_trades,taker_buy_base,taker_buy_quote
2840,2025-10-11 23:59:59.999000+00:00,112143.65,0,-0.01876,112774.49,113178.66,111620.3,4004.34367,450080100.0,605882,1850.16133,207997200.0
2841,2025-10-12 23:59:59.999000+00:00,110039.84,1,0.047654,110644.4,110685.63,109565.06,1449.71411,159565100.0,334156,618.82561,68105450.0
2842,2025-10-13 23:59:59.999000+00:00,115283.65,0,-0.004967,114958.81,115888.0,114766.69,1831.14783,210997600.0,516920,1101.86745,126950200.0
2843,2025-10-14 23:59:59.999000+00:00,114711.0,0,-0.017296,115166.0,115409.96,114542.0,1023.79943,117774000.0,277364,311.71266,35855820.0
2844,2025-10-15 23:59:59.999000+00:00,112726.98,0,-0.015275,113028.13,113112.41,112384.32,1477.97662,166729700.0,316764,480.5479,54204470.0


=== ETH T+1 FEATURE PIPELINE — SUMMARY ===
{
  "symbol": "ETH",
  "rows_total_raw": 2557,
  "rows_after_dropna": 2536,
  "date_range": [
    "2018-11-12 00:00:00+00:00",
    "2025-10-21 00:00:00+00:00"
  ],
  "n_features": 44,
  "target_columns": [
    "y_class",
    "next_return"
  ],
  "files": {
    "parquet": "data\\ETH_features.parquet",
    "csv": "data\\ETH_features.csv"
  }
}


Unnamed: 0,close_time,close,y_class,next_return,open,high,low,volume,ret_1d,ret_2d,ret_3d,ret_5d
2551,2025-10-17 00:00:00+00:00,3832.558838,1,0.015078,3894.377686,3950.566895,3678.620361,57404271888,-0.015969,-0.038847,-0.070988,-0.079691
2552,2025-10-18 00:00:00+00:00,3890.346191,1,0.02424,3833.009521,3927.245605,3822.266357,23815676385,0.015078,-0.001132,-0.024355,-0.083647
2553,2025-10-19 00:00:00+00:00,3984.649658,0,-0.000976,3890.583496,4029.355469,3843.772949,32870655221,0.02424,0.039684,0.023081,-0.034121
2554,2025-10-20 00:00:00+00:00,3980.760254,0,-0.026125,3984.696289,4084.159668,3911.726318,40224612563,-0.000976,0.023241,0.038669,-0.00168
2555,2025-10-21 00:00:00+00:00,3876.76416,0,-0.017706,3980.740479,4109.533203,3843.227783,49960290350,-0.026125,-0.027075,-0.003491,-0.004619


=== TSLA T+1 FEATURE PIPELINE — SUMMARY ===
{
  "symbol": "TSLA",
  "rows_total_raw": 1255,
  "rows_after_dropna": 1234,
  "date_range": [
    "2020-11-20 00:00:00+00:00",
    "2025-10-21 00:00:00+00:00"
  ],
  "n_features": 44,
  "target_columns": [
    "y_class",
    "next_return"
  ],
  "files": {
    "parquet": "data\\TSLA_features.parquet",
    "csv": "data\\TSLA_features.csv"
  }
}


Unnamed: 0,close_time,close,y_class,next_return,open,high,low,volume,ret_1d,ret_2d,ret_3d,ret_5d
1249,2025-10-15 00:00:00+00:00,435.149994,0,-0.014708,434.899994,440.51001,426.329987,71558200,0.013769,-0.001721,0.052383,-0.008069
1250,2025-10-16 00:00:00+00:00,428.75,1,0.02463,434.730011,439.350006,421.309998,77189900,-0.014708,-0.001142,-0.016403,-0.01559
1251,2025-10-17 00:00:00+00:00,439.309998,1,0.018484,425.5,441.459991,423.600006,89331600,0.02463,0.00956,0.02346,0.062444
1252,2025-10-20 00:00:00+00:00,447.429993,0,-0.010795,443.869995,449.799988,440.609985,63719000,0.018484,0.043568,0.02822,0.026451
1253,2025-10-21 00:00:00+00:00,442.600006,0,-0.008202,445.76001,449.299988,442.049988,54412200,-0.010795,0.007489,0.032303,0.031125


=== AAPL T+1 FEATURE PIPELINE — SUMMARY ===
{
  "symbol": "AAPL",
  "rows_total_raw": 1255,
  "rows_after_dropna": 1234,
  "date_range": [
    "2020-11-20 00:00:00+00:00",
    "2025-10-21 00:00:00+00:00"
  ],
  "n_features": 44,
  "target_columns": [
    "y_class",
    "next_return"
  ],
  "files": {
    "parquet": "data\\AAPL_features.parquet",
    "csv": "data\\AAPL_features.csv"
  }
}


Unnamed: 0,close_time,close,y_class,next_return,open,high,low,volume,ret_1d,ret_2d,ret_3d,ret_5d
1249,2025-10-15 00:00:00+00:00,249.339996,0,-0.00758,249.490005,251.820007,247.470001,33893600,0.006336,0.006783,0.016594,-0.033791
1250,2025-10-16 00:00:00+00:00,247.449997,1,0.019559,248.25,249.039993,245.130005,39777000,-0.00758,-0.001292,-0.000848,-0.025941
1251,2025-10-17 00:00:00+00:00,252.289993,1,0.039439,248.020004,253.380005,247.270004,49147000,0.019559,0.011831,0.018243,0.028621
1252,2025-10-20 00:00:00+00:00,262.23999,1,0.002021,255.889999,264.380005,255.630005,90483000,0.039439,0.05977,0.051737,0.058871
1253,2025-10-21 00:00:00+00:00,262.769989,0,-0.01644,261.880005,265.290009,261.829987,46695900,0.002021,0.041539,0.061911,0.06054


Saved dashboard: data\features_dashboard.json


In [4]:
import os, json, time, warnings
from pathlib import Path
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score,
                             precision_score, recall_score, roc_auc_score)
from sklearn.isotonic import IsotonicRegression
import xgboost as xgb

warnings.filterwarnings("ignore")

# ==================== CONFIG ====================
# Per-asset input: feature file location and the calendar used to compute
# the prediction date ("crypto" = every day, "equity" = weekdays only).
ASSETS = {
    "BTC": dict(features="data/BTC_features.parquet", calendar="crypto"),
    "ETH": dict(features="data/ETH_features.parquet", calendar="crypto"),
    "TSLA": dict(features="data/TSLA_features.parquet", calendar="equity"),
    "AAPL": dict(features="data/AAPL_features.parquet", calendar="equity"),
}

# Output locations for prediction CSVs and saved models; created if missing.
DATA_DIR = Path("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Tag embedded in the names of exported prediction and model files.
MODEL_VERSION = "hybrid_xgb_multi_tplus1_classic_gate_v10_1_v11ux"
# Seed shared by the XGBoost models and the adversarial-split permutation.
RANDOM_STATE = 42

# --- split (sizes are in rows) ---
TEST_LEN = 180        # trailing hold-out rows
VAL_LEN = 120         # validation rows taken from the end of the train pool
EMBARGO_DAYS = 5      # rows purged between training and validation
LOOKBACK_YEARS = 2    # training window length (365-day years) before the test start

# --- epsilon-labeling and stationarity ---
EPSILON = 0.0016      # |next_return| below this is excluded from classifier labels
Z_WIN = 180           # rolling window for z-scores
Z_MINP = 90           # minimum observations for a rolling z-score

# --- sample weights (v10.1) ---
HALF_LIFE_DAYS = 540  # half-life of the time-decay weight
MAG_ALPHA = 60.0      # slope of the return-magnitude weight
MAG_CAP = 0.03        # cap on |return| used in the magnitude weight

# --- decision constraints (same as in the BTC script) ---
MIN_POS_RATE = 0.25   # lower bound on the predicted-positive rate
MAX_POS_RATE = 0.75   # upper bound on the predicted-positive rate
TRADE_GATE_Q = 0.70   # quantile of |return score| used as trade gate (as in the old BTC code)
REGIME_EMA = 200      # risk-on when Close > EMA200; 0 = disable

# --- UX ---
UX_SOFT_THR = 0.29    # up-probability at or above this sets the soft-buy flag
UX_CONF_BAND = 0.05   # half-width of the "Neutral" band around 0.5

# models
# XGBoost parameters for the direction classifier.
CLF_PARAMS = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.02,
    "max_depth": 2,
    "min_child_weight": 8,
    "subsample": 0.65,
    "colsample_bytree": 0.65,
    "reg_lambda": 9.0,
    "reg_alpha": 1.0,
    "gamma": 2.0,
    "tree_method": "hist",
    "seed": RANDOM_STATE,
}

# XGBoost parameters for the next-return regressor.
REG_PARAMS = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "learning_rate": 0.02,
    "max_depth": 3,
    "min_child_weight": 6,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "reg_lambda": 7.0,
    "reg_alpha": 0.5,
    "gamma": 1.0,
    "tree_method": "hist",
    "seed": RANDOM_STATE,
}

# Boosting-round ceilings and early-stopping patience shared by both models.
N_ROUNDS_CLF = 7000
N_ROUNDS_REG = 7000
EARLY_STOP = 300

# ==================== HELPERS ====================
def load_features(path_parquet_or_csv: str) -> pd.DataFrame:
    """Load a feature table from parquet or CSV, sorted by ``close_time``.

    A ``.parquet`` path that does not exist falls back to the sibling file
    with a ``.csv`` suffix. ``close_time`` is parsed as UTC datetimes when it
    is not already a datetime column.

    Raises:
        FileNotFoundError: Parquet path and its CSV fallback are both missing.
        ValueError: The suffix is neither ``.parquet`` nor ``.csv``.
        KeyError: ``close_time`` or ``close`` is absent.
    """
    p = Path(path_parquet_or_csv)
    suffix = p.suffix.lower()

    if suffix == ".parquet":
        if p.exists():
            df = pd.read_parquet(p)
        else:
            # Same base name with a .csv extension.
            csv = p.with_suffix(".csv")
            if not csv.exists():
                raise FileNotFoundError(f"No {p} and fallback {csv}")
            df = pd.read_csv(csv)
    elif suffix == ".csv":
        df = pd.read_csv(p)
    else:
        raise ValueError(f"Format not supported: {p}")

    if "close_time" not in df:
        raise KeyError("No 'close_time'")
    if not is_datetime64_any_dtype(df["close_time"]):
        df["close_time"] = pd.to_datetime(df["close_time"], utc=True, errors="coerce")

    df = df.sort_values("close_time").reset_index(drop=True)
    if "close" not in df.columns:
        raise KeyError("No 'close'")
    return df

def next_trading_day(dt: pd.Timestamp, calendar: str) -> pd.Timestamp:
    """Return midnight of the next trading day after ``dt``.

    For ``calendar == "crypto"`` every day trades, so this is simply the next
    calendar day. Any other calendar skips Saturdays and Sundays (holidays
    are not considered).
    """
    one_day = pd.Timedelta(days=1)
    if calendar == "crypto":
        return (dt + one_day).normalize()

    candidate = dt.normalize() + one_day
    # weekday(): Monday == 0 ... Sunday == 6.
    while candidate.weekday() >= 5:
        candidate = candidate + one_day
    return candidate

def rolling_z(s: pd.Series, win=Z_WIN, minp=Z_MINP):
    """Z-score of ``s`` against rolling mean/std of the preceding rows.

    The statistics are shifted by one row so the current value is not part
    of its own reference window. Infinite scores (zero std) become NaN.
    """
    window = s.rolling(win, min_periods=minp)
    prior_mean = window.mean().shift(1)
    prior_std = window.std().shift(1)
    scores = (s - prior_mean) / prior_std
    return scores.replace([np.inf, -np.inf], np.nan)

def rolling_mad(x):
    """Median absolute deviation of the array ``x`` around its median."""
    center = np.median(x)
    deviations = np.abs(x - center)
    return np.median(deviations)

def rolling_robust_z(s: pd.Series, win=Z_WIN, minp=Z_MINP):
    """Robust z-score of ``s`` using rolling median and scaled MAD.

    Median and MAD come from the preceding rows (shifted by one). The MAD is
    multiplied by 1.4826; a zero MAD is treated as missing, and infinite
    scores are replaced by NaN.
    """
    window = s.rolling(win, min_periods=minp)
    prior_median = window.median().shift(1)
    prior_mad = window.apply(rolling_mad, raw=True).shift(1)
    scale = 1.4826 * prior_mad.replace(0, np.nan)
    scores = (s - prior_median) / scale
    return scores.replace([np.inf, -np.inf], np.nan)

def cum_return(x):
    """Compounded return of a sequence of simple returns (0.0 when empty)."""
    returns = np.asarray(x, float)
    if not returns.size:
        return 0.0
    return float(np.prod(1.0 + returns) - 1.0)

def sharpe(x, days_per_year=365):
    """Annualised Sharpe ratio of per-period returns (zero risk-free rate).

    Args:
        x: Sequence of simple returns.
        days_per_year: Number of periods used for annualisation.

    Returns:
        ``mean / std * sqrt(days_per_year)`` with the sample standard
        deviation (``ddof=1``). Returns 0.0 when fewer than two observations
        are given, because the sample standard deviation is undefined there
        (it would otherwise produce NaN and a RuntimeWarning).
    """
    x = np.asarray(x, float)
    if x.size < 2:
        return 0.0
    mu = np.mean(x)
    sd = np.std(x, ddof=1)
    # Small epsilon avoids division by zero for constant returns.
    return float(mu / (sd + 1e-12) * np.sqrt(days_per_year))

def win_rate_on_trades(x):
    """Fraction of strictly positive returns in ``x`` (0.0 when empty)."""
    trades = np.asarray(x, float)
    if not trades.size:
        return 0.0
    return float((trades > 0).mean())

def priorsafe_threshold(scores, y_true, min_pos=MIN_POS_RATE, max_pos=MAX_POS_RATE):
    """Score threshold whose predicted-positive rate matches the class prior.

    The share of ``y_true == 1`` is clipped to ``[min_pos, max_pos]`` and the
    threshold is the ``1 - share`` quantile of ``scores``.
    """
    base_rate = float((y_true == 1).mean())
    target_rate = float(np.clip(base_rate, min_pos, max_pos))
    return float(np.quantile(scores, 1.0 - target_rate))

# ==================== MAIN ====================
dashboard = {}

for SYM, cfg in ASSETS.items():
    try:
        calendar = cfg["calendar"]
        fp = cfg["features"]
        print(f"\n==================== {SYM} ====================")
        df = load_features(fp)

        # EMA200 for risk-on
        if REGIME_EMA:
            df["ema_200"] = df["close"].ewm(span=REGIME_EMA, adjust=False).mean()
        else:
            df["ema_200"] = np.nan

        time_col = "close_time"
        for col in ["y_class","next_return","ret_1d"]:
            if col not in df.columns: raise KeyError(f"{SYM}: no '{col}'")

        # ===== Stationarization =====
        ban_cols = {"y_class","next_return", time_col, "close", "open_time", "close_time", "ignore", "symbol"}
        num_cols = [c for c in df.columns if c not in ban_cols and np.issubdtype(df[c].dtype, np.number)]

        Z = pd.DataFrame(index=df.index)
        for c in num_cols: Z[f"z{Z_WIN}_{c}"] = rolling_z(df[c].astype(float))
        heavy = [c for c in ["volume","num_trades","taker_buy_base","taker_buy_quote",
                             "volume_log","num_trades_log","taker_buy_base_log","taker_buy_quote_log"]
                 if c in df.columns]
        for c in heavy: Z[f"rz{Z_WIN}_{c}"] = rolling_robust_z(df[c].astype(float))

        work = pd.concat([df[[time_col,"y_class","next_return","close","ema_200"]], Z], axis=1).dropna().reset_index(drop=True)

        # ===== Labels & splits (purged) =====
        y_raw   = work["y_class"].astype(int).to_numpy()
        y_ret   = work["next_return"].to_numpy()
        times   = work[time_col].to_numpy()

        y_eps = np.full_like(y_raw, -1)
        y_eps[y_ret >=  EPSILON] = 1
        y_eps[y_ret <= -EPSILON] = 0

        feat_cols_all = [c for c in work.columns if c not in {time_col,"y_class","next_return"}]
        X_all = work[feat_cols_all].to_numpy(dtype=np.float32)

        if len(work) < (TEST_LEN + VAL_LEN + 300 + EMBARGO_DAYS):
            warnings.warn(f"{SYM}: not enough data for reliable validation.")

        test_start = len(work) - TEST_LEN
        cutoff_time = work.iloc[test_start][time_col] - pd.Timedelta(days=365*LOOKBACK_YEARS)
        train_mask_time = work[time_col] >= cutoff_time
        train_mask_time.iloc[test_start:] = False

        pool_idx = np.where(train_mask_time & (y_eps != -1))[0]
        if len(pool_idx) < (VAL_LEN + 300): warnings.warn(f"{SYM}: not enough train data after ε-filter.")
        val_idx   = pool_idx[-VAL_LEN:]
        emb_start = max(0, pool_idx[-VAL_LEN] - EMBARGO_DAYS)
        train_idx = pool_idx[pool_idx < emb_start]
        test_idx  = np.arange(test_start, len(work))

        X_tr0, y_tr0, t_tr0, r_tr0 = X_all[train_idx], y_eps[train_idx], times[train_idx], y_ret[train_idx]
        X_va0, y_va0, t_va0, r_va0 = X_all[val_idx],   y_eps[val_idx],   times[val_idx],   y_ret[val_idx]

        # ===== Adversarial pruning + IPW (as in BTC code) =====
        def train_adv(X_train, X_test, feats):
            y_train = np.zeros(len(X_train), dtype=int)
            y_test  = np.ones(len(X_test), dtype=int)
            X = np.vstack([X_train, X_test]); y = np.concatenate([y_train, y_test])
            rng = np.random.default_rng(RANDOM_STATE)
            perm = rng.permutation(len(y)); X, y = X[perm], y[perm]
            split = int(0.8*len(y))
            dtr = xgb.DMatrix(X[:split], label=y[:split], feature_names=feats)
            dva = xgb.DMatrix(X[split:], label=y[split:], feature_names=feats)
            params = dict(objective="binary:logistic", eval_metric="auc", max_depth=2,
                          min_child_weight=6, eta=0.05, subsample=0.8, colsample_bytree=0.8,
                          reg_lambda=4.0, reg_alpha=0.5, tree_method="hist", seed=RANDOM_STATE)
            bst = xgb.train(params, dtr, num_boost_round=800,
                            evals=[(dtr,"train"), (dva,"valid")],
                            early_stopping_rounds=100, verbose_eval=False)
            from sklearn.metrics import roc_auc_score as _auc
            auc = _auc(y[split:], bst.predict(dva, iteration_range=(0, getattr(bst,"best_iteration",0)+1)))
            imp = bst.get_score(importance_type="gain")
            return auc, imp, bst

        feat_cols = feat_cols_all.copy()
        X_adv_train = X_all[np.where(train_mask_time)[0]]
        X_adv_test  = X_all[test_idx]  # as in BTC version
        ADV_TARGET_AUC  = 0.85
        SHIFT_DROP_MAX  = 60
        MIN_FEATS       = 15

        adv_auc, adv_imp, adv_bst = train_adv(X_adv_train, X_adv_test, feat_cols)
        adv_auc_initial = float(adv_auc)
        drop_list, iters = [], 0
        while adv_auc > ADV_TARGET_AUC and iters < SHIFT_DROP_MAX and len(feat_cols) > MIN_FEATS:
            if not adv_imp: break
            top_feat = sorted(adv_imp.items(), key=lambda x: -x[1])[0][0]
            drop_list.append((top_feat, float(adv_auc)))
            feat_cols.remove(top_feat)
            cols_idx_tmp = [feat_cols_all.index(f) for f in feat_cols]
            adv_auc, adv_imp, adv_bst = train_adv(X_adv_train[:, cols_idx_tmp], X_adv_test[:, cols_idx_tmp], feat_cols)
            iters += 1

        cols_idx = [feat_cols_all.index(f) for f in feat_cols]
        X_tr0, X_va0 = X_tr0[:, cols_idx], X_va0[:, cols_idx]
        X_test_full  = X_all[test_idx][:, cols_idx]
        adv_auc_final = float(adv_auc)

        # IPW
        dadv_pool = xgb.DMatrix(X_tr0, feature_names=feat_cols)
        p_testlike = adv_bst.predict(dadv_pool, iteration_range=(0, getattr(adv_bst,"best_iteration",0)+1))
        ipw = p_testlike / np.clip(1.0 - p_testlike, 1e-6, None)
        ipw = np.clip(ipw, 0.2, 5.0); ipw = ipw / ipw.mean()

        # ===== Weights =====
        days_from_end = (t_tr0.max() - t_tr0).astype('timedelta64[D]').astype(int)
        w_time = 0.5 ** (days_from_end / HALF_LIFE_DAYS)
        w_mag  = 1.0 + MAG_ALPHA * np.clip(np.abs(r_tr0), 0, MAG_CAP)
        w_tr   = (w_time * w_mag * ipw).astype(np.float32)

        # ===== Classifier =====
        pos, neg = int((y_tr0==1).sum()), int((y_tr0==0).sum())
        scale_pos_weight = (neg / max(pos,1)) if pos>0 else 1.0
        dtr_clf = xgb.DMatrix(X_tr0, label=y_tr0, weight=w_tr, feature_names=feat_cols)
        dva_clf = xgb.DMatrix(X_va0, label=y_va0, feature_names=feat_cols)
        clf_params = {**CLF_PARAMS, "scale_pos_weight": scale_pos_weight}
        bst_clf = xgb.train(clf_params, dtr_clf, num_boost_round=N_ROUNDS_CLF,
                            evals=[(dtr_clf,"train"), (dva_clf,"valid")],
                            early_stopping_rounds=EARLY_STOP, verbose_eval=False)
        it_clf = getattr(bst_clf, "best_iteration", None)
        proba_va_raw = bst_clf.predict(dva_clf, iteration_range=(0, int(it_clf)+1) if it_clf is not None else (0,0))

        # Isotonic calibration (as in BTC)
        cw0 = (len(y_va0) / (2*max(1,(y_va0==0).sum()))); cw1 = (len(y_va0) / (2*max(1,(y_va0==1).sum())))
        val_w = np.where(y_va0==1, cw1, cw0).astype(np.float32)
        iso = IsotonicRegression(out_of_bounds="clip"); iso.fit(proba_va_raw, y_va0, sample_weight=val_w)
        proba_va = iso.transform(proba_va_raw)

        # ===== Regressor (magnitude) =====
        reg_idx_all = np.where(train_mask_time)[0]
        X_reg_all = X_all[reg_idx_all][:, cols_idx]; y_reg_all = y_ret[reg_idx_all]
        X_va_reg  = X_all[val_idx][:, cols_idx];     y_va_reg = y_ret[val_idx]
        dtr_reg = xgb.DMatrix(X_reg_all[:-(VAL_LEN+EMBARGO_DAYS)], label=y_reg_all[:-(VAL_LEN+EMBARGO_DAYS)], feature_names=feat_cols)
        dva_reg = xgb.DMatrix(X_va_reg, label=y_va_reg, feature_names=feat_cols)
        bst_reg = xgb.train(REG_PARAMS, dtr_reg, num_boost_round=N_ROUNDS_REG,
                            evals=[(dtr_reg,"train"), (dva_reg,"valid")],
                            early_stopping_rounds=EARLY_STOP, verbose_eval=False)
        it_reg = getattr(bst_reg, "best_iteration", None)
        ret_va_pred = bst_reg.predict(dva_reg, iteration_range=(0, int(it_reg)+1) if it_reg is not None else (0,0))
        scale_vol = max(1e-6, np.std(y_va_reg))
        ret_score_va = np.tanh(ret_va_pred / (3*scale_vol))  # [-1..1]

        # ===== Threshold search (as in BTC) =====
        alphas = np.linspace(0.0, 1.0, 11)
        thr_grid = np.linspace(0.2, 0.8, 121)
        best = {"bal_acc": -1, "alpha": 0.0, "thr": 0.5}

        for a in alphas:
            blend = (1-a)*proba_va + a*(ret_score_va*0.5 + 0.5)
            for thr in thr_grid:
                y_hat = (blend >= thr).astype(int)
                pos_rate = y_hat.mean()
                if pos_rate < MIN_POS_RATE or pos_rate > MAX_POS_RATE:
                    continue
                ba = balanced_accuracy_score(y_va0, y_hat)
                if ba > best["bal_acc"]:
                    best = {"bal_acc": float(ba), "alpha": float(a), "thr": float(thr)}
            # priorsafe
            thr_p = priorsafe_threshold(blend, y_va0)
            y_hat_p = (blend >= thr_p).astype(int)
            pos_rate_p = y_hat_p.mean()
            if MIN_POS_RATE <= pos_rate_p <= MAX_POS_RATE:
                ba_p = balanced_accuracy_score(y_va0, y_hat_p)
                if ba_p > best["bal_acc"]:
                    best = {"bal_acc": float(ba_p), "alpha": float(a), "thr": float(thr_p)}

        alpha_final = best["alpha"]; thr_final = best["thr"]

        # ===== HOLDOUT =====
        dtest = xgb.DMatrix(X_test_full, feature_names=feat_cols)
        proba_test_raw = bst_clf.predict(dtest, iteration_range=(0, int(it_clf)+1) if it_clf is not None else (0,0))
        proba_test = iso.transform(proba_test_raw)
        ret_test_pred = bst_reg.predict(dtest, iteration_range=(0, int(it_reg)+1) if it_reg is not None else (0,0))
        ret_score_test = np.tanh(ret_test_pred / (3*scale_vol))
        blend_test = (1-alpha_final) * proba_test + alpha_final * (ret_score_test*0.5 + 0.5)

        y_test_raw = y_raw[test_idx]
        dates_test = pd.to_datetime(times[test_idx])

        # ===== classic trade-gate as in BTC =====
        gate = float(np.quantile(np.abs(ret_score_va), TRADE_GATE_Q))
        trade_mask = (np.abs(ret_score_test) >= gate)

        # regime filter (EMA200)
        if REGIME_EMA:
            close_test = work.loc[test_idx, "close"].to_numpy()
            ema200_test = work.loc[test_idx, "ema_200"].to_numpy()
            regime_on = close_test > ema200_test
        else:
            regime_on = np.ones_like(trade_mask, dtype=bool)

        # final decision
        decision_long = ((blend_test >= thr_final) & trade_mask & regime_on).astype(int)

        # ===== Metrics =====
        auc_test = roc_auc_score(y_test_raw, blend_test) if len(np.unique(y_test_raw))==2 else None
        metrics_raw = {
            "acc": float(accuracy_score(y_test_raw, decision_long)),
            "bal_acc": float(balanced_accuracy_score(y_test_raw, decision_long)),
            "f1": float(f1_score(y_test_raw, decision_long, zero_division=0)),
            "precision": float(precision_score(y_test_raw, decision_long, zero_division=0)),
            "recall": float(recall_score(y_test_raw, decision_long, zero_division=0)),
            "auc": float(auc_test) if auc_test is not None else None,
            "pred_pos_rate": float(decision_long.mean())
        }

        # ===== Business (as in BTC code) =====
        r_test = y_ret[test_idx].astype(float)
        exec_mask_all  = (decision_long == 1)
        ret_exec_all   = r_test[exec_mask_all]

        ret_bh    = r_test
        ret_strat = decision_long * r_test

        def cum_return(x):
            x = np.asarray(x, float)
            return float(np.prod(1.0 + x) - 1.0) if x.size else 0.0

        def sharpe(x, days_per_year=365):
            x = np.asarray(x, float)
            mu, sd = np.mean(x), np.std(x, ddof=1)
            return float(mu / (sd + 1e-12) * np.sqrt(days_per_year)) if x.size else 0.0

        def win_rate_on_trades(x):
            x = np.asarray(x, float)
            return float((x > 0).mean()) if x.size else 0.0

        bh_total     = cum_return(ret_bh)
        strat_total  = cum_return(ret_strat)
        excess_total = strat_total - bh_total

        exec_mask_conf = trade_mask & exec_mask_all
        ret_exec_conf  = r_test[exec_mask_conf]
        bh_conf_total  = cum_return(r_test[trade_mask])
        conf_total     = cum_return(ret_exec_conf)
        excess_conf    = conf_total - bh_conf_total

        biz = {
            "symbol": SYM,
            "test_period": [str(dates_test.min()), str(dates_test.max())],
            "Strategy Return (Long-only)": strat_total,
            "Buy&Hold Return": bh_total,
            "Excess Return": excess_total,
            "Confident Return (gated trades only)": conf_total,
            "Confident Excess Return": excess_conf,
            "Win Rate (executed)": win_rate_on_trades(ret_exec_all),
            "Win Rate (confident executed)": win_rate_on_trades(ret_exec_conf),
            "Sharpe (all, long-only)": sharpe(ret_strat),
            "Sharpe (confident executed)": sharpe(ret_exec_conf),
            "Coverage (gate)": float(trade_mask.mean()),
            "Executed trades (all)": int(exec_mask_all.sum()),
            "Executed trades (confident)": int(exec_mask_conf.sum())
        }

        # ===== UX export =====
        up_prob_full = proba_test
        ux_soft_buy  = (up_prob_full >= UX_SOFT_THR).astype(int)
        def ux_verdict(p):
            if p >= 0.5 + UX_CONF_BAND: return "Buy"
            if p <= 0.5 - UX_CONF_BAND: return "Caution"
            return "Neutral"
        ux_labels = np.array([ux_verdict(p) for p in up_prob_full])

        # ===== EXPORT (per-asset files) =====
        HORIZON_DAYS = 1
        pred_trading_path = DATA_DIR / f"predictions_trading_{SYM}_{MODEL_VERSION}.csv"
        pred_ux_path      = DATA_DIR / f"predictions_ux_{SYM}_{MODEL_VERSION}.csv"
        clf_path          = MODELS_DIR / f"{SYM}_{MODEL_VERSION}.clf.json"
        reg_path          = MODELS_DIR / f"{SYM}_{MODEL_VERSION}.reg.json"
        meta_path         = MODELS_DIR / f"{SYM}_{MODEL_VERSION}.meta.json"

        # Both CSVs carry the same target dates, so resolve them once instead
        # of running every test date through next_trading_day() twice.
        pred_dates = [next_trading_day(dt, calendar) for dt in dates_test]

        # Trading view: blended score, gate/regime flags, the resulting long
        # decision and the realized return column.
        pd.DataFrame({
            "symbol": SYM,
            "horizon_days": HORIZON_DAYS,
            "asof_time": dates_test,
            "pred_date": pred_dates,
            "blend_score": blend_test,
            "gate": trade_mask.astype(int),
            "regime_on": regime_on.astype(int),
            "decision_long": decision_long,
            "actual_ret_tplus1": r_test
        }).to_csv(pred_trading_path, index=False)

        # UX view: the probability plus its soft-buy flag and verdict label.
        pd.DataFrame({
            "symbol": SYM,
            "horizon_days": HORIZON_DAYS,
            "asof_time": dates_test,
            "pred_date": pred_dates,
            "up_prob": up_prob_full,
            "ux_soft_buy": ux_soft_buy,
            "ux_verdict": ux_labels
        }).to_csv(pred_ux_path, index=False)

        # Persist both models plus a JSON sidecar describing how they were
        # produced. `ts` is the current Unix time truncated to whole seconds.
        ts = int(time.time())
        bst_clf.save_model(clf_path)
        bst_reg.save_model(reg_path)
        with open(meta_path, "w", encoding="utf-8") as f:
            # drop_list entries are indexed with [0] below — presumably
            # (feature_name, ...) tuples; verify against where drop_list is built.
            # NOTE(review): json.dump needs plain Python containers here;
            # confirm feat_cols and metrics_raw are JSON-serializable as-is.
            json.dump({
                "symbol": SYM,
                "version": MODEL_VERSION,
                "created_at": ts,
                "horizon_days": HORIZON_DAYS,
                "alpha": float(alpha_final), "thr": float(thr_final),
                "gate_q": TRADE_GATE_Q, "regime_ema": REGIME_EMA,
                "features": feat_cols, "n_features": len(feat_cols),
                "adv_auc_initial": float(adv_auc_initial), "adv_auc_final": float(adv_auc_final),
                "shift_dropped": [d[0] for d in drop_list],
                "holdout_metrics_raw": metrics_raw,
                "business": biz
            }, f, ensure_ascii=False, indent=2)

        # Echo a per-symbol summary to stdout. Float values in `biz` are
        # rounded to 4 decimals for display only; the meta file stores the
        # unrounded dict.
        print(json.dumps({
            "symbol": SYM,
            "val_best": {"alpha": float(alpha_final), "thr": float(thr_final)},
            "holdout_metrics_raw": metrics_raw,
            "business": {k: (round(v,4) if isinstance(v,float) else v) for k,v in biz.items()},
            "artifacts": {
                "pred_trading_csv": str(pred_trading_path),
                "pred_ux_csv": str(pred_ux_path),
                "clf_model": str(clf_path),
                "reg_model": str(reg_path),
                "meta": str(meta_path)
            }
        }, indent=2, ensure_ascii=False))

        # Compact entry for the dashboard JSON written at the end of the
        # script. "auc" may be None, in which case it is passed through as None.
        dashboard[SYM] = {
            "symbol": SYM,
            "holdout_bal_acc": round(metrics_raw["bal_acc"], 4),
            "holdout_auc": round(metrics_raw["auc"], 4) if metrics_raw["auc"] is not None else None,
            "pos_rate": round(metrics_raw["pred_pos_rate"], 4),
        }

    except Exception as e:
        print(f"[ERROR] {SYM}: {e}")

# Persist the per-symbol dashboard entries as one JSON file, then announce
# where it was written.
dashboard_path = DATA_DIR / f"dashboard_{MODEL_VERSION}.json"
with open(dashboard_path, "w", encoding="utf-8") as out:
    out.write(json.dumps(dashboard, ensure_ascii=False, indent=2))

print("\nALL DONE. Dashboard:", dashboard_path)



{
  "symbol": "BTC",
  "val_best": {
    "alpha": 0.0,
    "thr": 0.31000000000000005
  },
  "holdout_metrics_raw": {
    "acc": 0.5166666666666667,
    "bal_acc": 0.5235294117647059,
    "f1": 0.4662576687116564,
    "precision": 0.5588235294117647,
    "recall": 0.4,
    "auc": 0.5328173374613002,
    "pred_pos_rate": 0.37777777777777777
  },
  "business": {
    "symbol": "BTC",
    "test_period": [
      "2025-04-19 23:59:59.999000+00:00",
      "2025-10-15 23:59:59.999000+00:00"
    ],
    "Strategy Return (Long-only)": 0.306,
    "Buy&Hold Return": 0.3048,
    "Excess Return": 0.0012,
    "Confident Return (gated trades only)": 0.306,
    "Confident Excess Return": 0.0012,
    "Win Rate (executed)": 0.5588,
    "Win Rate (confident executed)": 0.5588,
    "Sharpe (all, long-only)": 2.2155,
    "Sharpe (confident executed)": 3.6285,
    "Coverage (gate)": 1.0,
    "Executed trades (all)": 68,
    "Executed trades (confident)": 68
  },
  "artifacts": {
    "pred_trading_csv": "data