In [8]:
import sys, pathlib
import pandas as pd
import numpy as np
import joblib

# Añadir src/ al path para poder importar config
PROJECT_ROOT = pathlib.Path().resolve().parent.parent  # estamos en /notebooks/lstm5d
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src import config as cfg
from sklearn.preprocessing import StandardScaler   

In [9]:
# Leer precios desde parquet
df = pd.read_parquet(cfg.DATA / "raw" / "prices.parquet")
df = df.ffill().dropna()

# Cargar VIX y alinearlo con los precios
vix = pd.read_parquet(cfg.DATA / "raw" / "vix.parquet").squeeze()
vix = vix.ffill().reindex(df.index).ffill()
vix_t1 = vix.shift(1).rename('vix_t-1')

# Calcular retornos logarítmicos
ret = np.log(df / df.shift(1)).dropna()
ret.head()

Ticker,AAPL,ABT,ADBE,AMZN,BAC,BTC-USD,COST,CRM,CSCO,CVX,...,PFE,PG,T,TSLA,UNH,V,VZ,WFC,WMT,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-05-21,0.056626,0.00389,0.022111,0.019725,-0.027439,0.0,0.00948,0.023556,0.012071,0.012415,...,-0.00133,-0.002049,-0.000892,0.042968,0.028485,0.031288,-0.004585,0.014758,0.009724,0.006972
2012-05-22,-0.007708,0.002908,0.0,-0.012828,0.021725,0.0,-0.004189,0.00268,0.003593,-0.003719,...,-0.007571,-0.003793,-0.00268,0.068181,0.003055,0.0185,0.001209,0.008562,0.010886,-0.001097
2012-05-23,0.024107,-0.010052,0.005297,0.009015,0.026856,0.0,0.002464,0.007732,-0.002394,-0.003227,...,-0.012596,-0.012108,-0.005381,0.007118,-0.006482,0.006567,-0.002661,0.002208,0.013249,0.001097
2012-05-24,-0.009226,0.016965,-0.020089,-0.009433,-0.004193,0.0,0.013947,-0.02965,-0.018138,0.01065,...,0.002261,0.002881,0.008358,-0.024145,0.015415,0.005022,0.002661,0.002203,0.007559,0.006924
2012-05-25,-0.005374,-0.002728,0.001901,-0.010978,0.0014,0.0,0.0,0.005389,-0.003667,-0.012065,...,-0.000452,-0.001279,0.001485,-0.015644,-0.00178,-0.003346,0.001449,0.00157,0.003682,-0.006436


In [11]:
# ╔════════════════════════════════════════════════════════════════╗
# FEATURES AVANZADAS MULTI-ESCALA PARA LSTM-5d MEJORADO
# ╚════════════════════════════════════════════════════════════════╝
print("🔧 Generando features avanzadas multi-temporales...")

# ── RETORNOS MULTI-HORIZONTE (4×40=160) ─────────────────────────
ret_1d = ret.copy()
ret_3d = ret.rolling(3).sum()
ret_5d = ret.rolling(5).sum()
ret_10d = ret.rolling(10).sum()

print("✅ Retornos multi-horizonte: 1d, 3d, 5d, 10d")

# ── VOLATILIDADES JERÁRQUICAS (3×40=120) ───────────────────────
vol_5d = ret.rolling(5).std()   # Volatilidad corta
vol_20d = ret.rolling(20).std()  # Volatilidad media
vol_60d = ret.rolling(60).std()  # Volatilidad larga

print("✅ Volatilidades jerárquicas: 5d, 20d, 60d")

# ── MOMENTUM TEMPORAL (2×40=80) ──────────────────────────────────
mom_3_20 = (ret_3d / (vol_5d + 1e-6)).clip(-10, 10)   # Momentum 3d vs vol 5d
mom_5_60 = (ret_5d / (vol_20d + 1e-6)).clip(-10, 10)   # Momentum 5d vs vol 20d

print("✅ Momentum temporal: corto y medio plazo")

# ── VIX FEATURES AVANZADAS (4×1=4) ──────────────────────────────
vix_level = (vix / 100)
vix_change_1d = vix.pct_change(1).clip(-1, 1)
vix_change_5d = vix.pct_change(5).clip(-2, 2)
vix_trend = (vix.rolling(5).mean() / vix.rolling(20).mean()).clip(0.5, 2.0)

print("✅ VIX features: nivel, cambios 1d/5d, tendencia")

# ── CORRELACIONES CROSS-ASSET (1×40=40) ─────────────────────────
market_ret = ret.mean(axis=1)  # Retorno promedio del mercado
market_corr = ret.rolling(20).corrwith(market_ret).clip(-1, 1)

print("✅ Correlaciones cross-asset con mercado")

# ── RATIOS TÉCNICOS (2×40=80) ────────────────────────────────────
ret_ratio_3_10 = (ret_3d / (ret_10d.abs() + 1e-6)).clip(-5, 5)  # Aceleración
vol_ratio_5_20 = (vol_5d / (vol_20d + 1e-6)).clip(0, 10)        # Régimen volatilidad

print("✅ Ratios técnicos: aceleración y régimen volatilidad")

# ── TARGET: RETORNO 5d FUTURO (MANTENER ORIGINAL) ───────────────
y = ret_5d.shift(-5).dropna()

print(f"🎯 Target y shape: {y.shape}")
print(f"🚀 Total features creadas: {4*40 + 3*40 + 2*40 + 4 + 1*40 + 2*40} = 484 features")
print(f"   vs features originales: 81 → incremento 6x")


🔧 Generando features avanzadas multi-temporales...
✅ Retornos multi-horizonte: 1d, 3d, 5d, 10d
✅ Volatilidades jerárquicas: 5d, 20d, 60d
✅ Momentum temporal: corto y medio plazo
✅ VIX features: nivel, cambios 1d/5d, tendencia


AttributeError: 'Rolling' object has no attribute 'corrwith'

In [None]:
# ╔════════════════════════════════════════════════════════════════╗
# FEATURES AVANZADAS MULTI-ESCALA PARA LSTM-5d MEJORADO
# ╚════════════════════════════════════════════════════════════════╝
print("🔧 Generando features avanzadas multi-temporales...")

# ── RETORNOS MULTI-HORIZONTE (4×40=160) ─────────────────────────
ret_1d = ret.copy()
ret_3d = ret.rolling(3).sum()
ret_5d = ret.rolling(5).sum()
ret_10d = ret.rolling(10).sum()

print("✅ Retornos multi-horizonte: 1d, 3d, 5d, 10d")

# ── VOLATILIDADES JERÁRQUICAS (3×40=120) ───────────────────────
vol_5d = ret.rolling(5).std()   # Volatilidad corta
vol_20d = ret.rolling(20).std()  # Volatilidad media
vol_60d = ret.rolling(60).std()  # Volatilidad larga

print("✅ Volatilidades jerárquicas: 5d, 20d, 60d")

# ── MOMENTUM TEMPORAL (2×40=80) ──────────────────────────────────
mom_3_20 = (ret_3d / (vol_5d + 1e-6)).clip(-10, 10)   # Momentum 3d vs vol 5d
mom_5_60 = (ret_5d / (vol_20d + 1e-6)).clip(-10, 10)   # Momentum 5d vs vol 20d

print("✅ Momentum temporal: corto y medio plazo")

# ── VIX FEATURES AVANZADAS (4×1=4) ──────────────────────────────
vix_level = (vix / 100)
vix_change_1d = vix.pct_change(1).clip(-1, 1)
vix_change_5d = vix.pct_change(5).clip(-2, 2)
vix_trend = (vix.rolling(5).mean() / vix.rolling(20).mean()).clip(0.5, 2.0)

print("✅ VIX features: nivel, cambios 1d/5d, tendencia")

# ── CORRELACIONES CROSS-ASSET (1×40=40) ─────────────────────────
market_ret = ret.mean(axis=1)  # Retorno promedio del mercado
market_corr = ret.rolling(20).corrwith(market_ret).clip(-1, 1)

print("✅ Correlaciones cross-asset con mercado")

# ── RATIOS TÉCNICOS (2×40=80) ────────────────────────────────────
ret_ratio_3_10 = (ret_3d / (ret_10d.abs() + 1e-6)).clip(-5, 5)  # Aceleración
vol_ratio_5_20 = (vol_5d / (vol_20d + 1e-6)).clip(0, 10)        # Régimen volatilidad

print("✅ Ratios técnicos: aceleración y régimen volatilidad")

# ── TARGET: RETORNO 5d FUTURO (MANTENER ORIGINAL) ───────────────
y = ret_5d.shift(-5).dropna()

print(f"🎯 Target y shape: {y.shape}")
print(f"🚀 Total features creadas: {4*40 + 3*40 + 2*40 + 4 + 1*40 + 2*40} = 484 features")
print(f"   vs features originales: 81 → incremento 6x")


🔧 Generando features avanzadas multi-temporales...
✅ Retornos multi-horizonte: 1d, 3d, 5d, 10d
✅ Volatilidades jerárquicas: 5d, 20d, 60d
✅ Momentum temporal: corto y medio plazo
✅ VIX features: nivel, cambios 1d/5d, tendencia


AttributeError: 'Rolling' object has no attribute 'corrwith'

In [None]:
# 🔧 CAMBIO CRÍTICO: TARGET DIARIO (como LSTM-1d)
print("🔧 CAMBIO: Target de retorno acumulado 5d → retorno diario")

# Retorno acumulado 5 días (solo para momentum)
ret5 = ret.rolling(5).sum()

# Volatilidad 5 días (solo para momentum)
vol5 = ret.rolling(5).std()

# Momentum = retorno / volatilidad
momentum = (ret5 / (vol5 + 1e-6)).clip(-10, 10)

# 🎯 TARGET DIARIO: retorno 1 día futuro (como LSTM-1d) 
# ✅ CRITICAL FIX: Asegurar que NO HAY LOOK-AHEAD BIAS
y = ret.shift(-1).dropna()  # Predecir retorno t+1 usando info hasta t-1

# Alinear features al target
ret = ret.shift(1).reindex(y.index)
momentum = momentum.shift(1).reindex(y.index)
vix_t1 = vix_t1.reindex(y.index)

print(f"✅ Nuevo target: retorno diario")
print(f"   Target shape: {y.shape}")
print(f"   Target range: [{y.min().min():.4f}, {y.max().max():.4f}]")
print(f"   Target mean: {y.mean().mean():.6f}")

# Visual check
ret.tail(3), momentum.tail(3), y[-3:]


🔧 CAMBIO: Target de retorno acumulado 5d → retorno diario
✅ Nuevo target: retorno diario
   Target shape: (4519, 40)
   Target range: [-0.5507, 6.1254]
   Target mean: 0.000533


(Ticker          AAPL       ABT      ADBE      AMZN       BAC   BTC-USD  \
 Date                                                                     
 2025-06-23  0.000000  0.000000  0.000000  0.000000  0.000000 -0.012500   
 2025-06-24  0.002484  0.002028  0.008454 -0.005835  0.017430  0.044455   
 2025-06-25 -0.005973  0.035531  0.005823  0.020417  0.007745  0.004422   
 
 Ticker          COST       CRM      CSCO       CVX  ...       PFE        PG  \
 Date                                                ...                       
 2025-06-23  0.000000  0.000000  0.000000  0.000000  ...  0.000000  0.000000   
 2025-06-24  0.024377  0.010610  0.015857 -0.018151  ...  0.002500  0.012183   
 2025-06-25 -0.002552  0.029034  0.007541 -0.022796  ...  0.011585 -0.004169   
 
 Ticker             T      TSLA       UNH         V        VZ       WFC  \
 Date                                                                     
 2025-06-23  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000

In [None]:
# ╔════════════════════════════════════════════════════════════════╗
# CONSTRUCCIÓN DE SECUENCIAS CON FEATURES AVANZADAS (484 features)
# ╚════════════════════════════════════════════════════════════════╝

# ── ALINEAR TODAS LAS FEATURES AL TARGET ────────────────────────
print("🔧 Alineando features avanzadas al target...")

# ✅ CRITICAL: Features hasta t-1, target en t+1 (NO LOOK-AHEAD)
# Alinear features t-1 al índice del target  
ret_1d = ret_1d.shift(2).reindex(y.index)  # ✅ shift(2): usa ret hasta t-2 para predecir t+1
ret_3d = ret_3d.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia
ret_5d_feat = ret_5d.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia
ret_10d = ret_10d.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia
vol_5d = vol_5d.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia
vol_20d = vol_20d.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia 
vol_60d = vol_60d.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia
mom_3_20 = mom_3_20.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia
mom_5_60 = mom_5_60.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia
ret_ratio_3_10 = ret_ratio_3_10.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia
vol_ratio_5_20 = vol_ratio_5_20.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia
market_corr = market_corr.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia

# VIX features alineadas
vix_level = vix_level.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia
vix_change_1d = vix_change_1d.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia
vix_change_5d = vix_change_5d.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia
vix_trend = vix_trend.shift(2).reindex(y.index)  # ✅ shift(2) por consistencia

print("✅ Features alineadas al target")

# ── CONSTRUIR SECUENCIAS TEMPORALES ─────────────────────────────
X_advanced = []
y_out = []
dates = []

print("🔧 Construyendo secuencias temporales avanzadas...")

for i in range(cfg.WINDOW, len(y)):
    # Extraer ventanas de 60 días para cada grupo de features
    ventana_ret_1d = ret_1d.iloc[i - cfg.WINDOW:i].values     # (60, 40)
    ventana_ret_3d = ret_3d.iloc[i - cfg.WINDOW:i].values     # (60, 40)
    ventana_ret_5d = ret_5d_feat.iloc[i - cfg.WINDOW:i].values # (60, 40)
    ventana_ret_10d = ret_10d.iloc[i - cfg.WINDOW:i].values   # (60, 40)
    
    ventana_vol_5d = vol_5d.iloc[i - cfg.WINDOW:i].values     # (60, 40)
    ventana_vol_20d = vol_20d.iloc[i - cfg.WINDOW:i].values   # (60, 40)
    ventana_vol_60d = vol_60d.iloc[i - cfg.WINDOW:i].values   # (60, 40)
    
    ventana_mom_3_20 = mom_3_20.iloc[i - cfg.WINDOW:i].values # (60, 40)
    ventana_mom_5_60 = mom_5_60.iloc[i - cfg.WINDOW:i].values # (60, 40)
    
    ventana_ratio_3_10 = ret_ratio_3_10.iloc[i - cfg.WINDOW:i].values # (60, 40)
    ventana_vol_ratio = vol_ratio_5_20.iloc[i - cfg.WINDOW:i].values   # (60, 40)
    ventana_market_corr = market_corr.iloc[i - cfg.WINDOW:i].values    # (60, 40)
    
    # VIX features (4 features × 1 = 4)
    ventana_vix_level = vix_level.iloc[i - cfg.WINDOW:i].values.reshape(-1, 1)     # (60, 1)
    ventana_vix_chg_1d = vix_change_1d.iloc[i - cfg.WINDOW:i].values.reshape(-1, 1) # (60, 1)
    ventana_vix_chg_5d = vix_change_5d.iloc[i - cfg.WINDOW:i].values.reshape(-1, 1) # (60, 1)
    ventana_vix_trend = vix_trend.iloc[i - cfg.WINDOW:i].values.reshape(-1, 1)     # (60, 1)
    
    # Concatenar todas las features: 484 total
    bloque = np.concatenate([
        # Retornos (160)
        ventana_ret_1d, ventana_ret_3d, ventana_ret_5d, ventana_ret_10d,
        # Volatilidades (120)
        ventana_vol_5d, ventana_vol_20d, ventana_vol_60d,
        # Momentum (80)
        ventana_mom_3_20, ventana_mom_5_60,
        # Ratios técnicos (80)
        ventana_ratio_3_10, ventana_vol_ratio,
        # Cross-asset (40)
        ventana_market_corr,
        # VIX features (4)
        ventana_vix_level, ventana_vix_chg_1d, ventana_vix_chg_5d, ventana_vix_trend
    ], axis=1)  # (60, 484)

    X_advanced.append(bloque)
    y_out.append(y.iloc[i])
    dates.append(y.index[i])

X = np.array(X_advanced, dtype=np.float32)
y = np.array(y_out, dtype=np.float32)
dates = pd.to_datetime(dates)

print(f"🚀 LSTM-5d AVANZADO:")
print(f"   ✅ X shape: {X.shape} (vs original: (4455, 60, 81))")
print(f"   ✅ y shape: {y.shape}")
print(f"   📊 Features: {X.shape[2]} (incremento {X.shape[2]/81:.1f}x)")
print(f"   🎯 Target: retornos 5-días (mantener división /5 en backtest)")


In [4]:
# 🔧 CREAR VENTANAS DE DATOS CON TARGET DIARIO
print("\n🔧 Creando ventanas de datos con target diario...")

X = []
y_out = []
dates = []

for i in range(cfg.WINDOW, len(ret)):
    ventana_ret = ret.iloc[i - cfg.WINDOW:i].values
    ventana_mom = momentum.iloc[i - cfg.WINDOW:i].values
    ventana_vix = vix_t1.iloc[i - cfg.WINDOW:i].values.reshape(-1, 1)
    bloque = np.concatenate([ventana_ret, ventana_mom, ventana_vix], axis=1)

    X.append(bloque)
    y_out.append(y.iloc[i])      # ← target diario
    dates.append(y.index[i])     # ← fecha del target

X = np.array(X, dtype=np.float32)
y = np.array(y_out, dtype=np.float32)
dates = pd.to_datetime(dates)

print(f"✅ X shape: {X.shape}")
print(f"✅ y shape: {y.shape}")
print(f"✅ Features por timestep: {X.shape[2]} [retornos(40) + momentum(40) + VIX(1)]")
print(f"✅ Ventanas de datos: {X.shape[0]} muestras")
print(f"✅ Período: {dates.min().date()} → {dates.max().date()}")



🔧 Creando ventanas de datos con target diario...
✅ X shape: (4459, 60, 81)
✅ y shape: (4459, 40)
✅ Features por timestep: 81 [retornos(40) + momentum(40) + VIX(1)]
✅ Ventanas de datos: 4459 muestras
✅ Período: 2012-08-15 → 2025-06-25


In [5]:
# Ver una muestra
pd.DataFrame(X[0]).iloc[:5]  # primeros 5 días de la primera muestra

# Ver un target
pd.Series(y[0], index=df.columns).sort_values(ascending=False).head()


Ticker
CSCO    0.091898
TSLA    0.030153
HD      0.023539
ADBE    0.022237
NVDA    0.020507
dtype: float32

In [6]:
# ─────────────────────────────────────────────
# 4. ESCALADO (features y target)
# ─────────────────────────────────────────────
from pathlib import Path      #  ← AÑADE ESTA LÍNEA

n_feat = X.shape[2]

# ---- escalador de X ----
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X.reshape(-1, n_feat)).reshape(X.shape)

# ---- escalador de y ----
scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y)     # y.shape = (n_muestras, n_assets)

# ---- guarda los escaladores ----
Path(cfg.MODELS).mkdir(parents=True, exist_ok=True)
joblib.dump(scaler_X, cfg.MODELS / "scaler_X_lstm5d.pkl")
joblib.dump(scaler_y, cfg.MODELS / "scaler_y_lstm5d.pkl")
print("✅ Escaladores guardados (lstm5d con VIX integrado)")


✅ Escaladores guardados (lstm5d con VIX integrado)


In [7]:
joblib.dump(
    {
        "X": X_scaled.astype(np.float32),
        "y": y_scaled.astype(np.float32),
        "tickers": df.columns.tolist(),
        "dates": dates,
        "target_type": "daily",  # ← NUEVO: indicar que es target diario
        "features": "ret_momentum_vix"  # ← NUEVO: indicar features incluidas
    },
    cfg.DATA / "processed" / "lstm5d_data.pkl"
)
print("✅ Datos LSTM-5d MEJORADOS guardados (target diario + features completas)")


✅ Datos LSTM-5d MEJORADOS guardados (target diario + features completas)
