In [7]:
# ╔════════════════════════════════════════════════════════════════╗
# 0 · IMPORTS Y SET-UP
# ╚════════════════════════════════════════════════════════════════╝
import sys, pathlib
from pathlib import Path                      # ← NUEVO
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

# ── project path ──────────────────────────────────────────────────
# Take the directory two levels above the current working directory and put
# it at the front of sys.path (once) so the `src` package becomes importable.
PROJECT_ROOT = Path().resolve().parent.parent
_root_str = str(PROJECT_ROOT)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)

from src import config as cfg       # project configuration module

# ── load the preprocessed dataset ─────────────────────────────────
# The pickle is a mapping with the keys "X", "y" and "dates".
# The recorded run below shows X as (4455, 60, 81) and y as (4455, 40).
dataset_path = cfg.DATA / "processed" / "lstm5d_vix.pkl"
data         = joblib.load(dataset_path)
X            = data["X"]
y            = data["y"]
fechas_X     = pd.to_datetime(data["dates"])

print("Datos brutos  ➜  X:", X.shape, "  y:", y.shape)

Datos brutos  ➜  X: (4455, 60, 81)   y: (4455, 40)


In [9]:
# ╔════════════════════════════════════════════════════════════════╗
# 2 · SPLITS  (train / val / test)
# ╚════════════════════════════════════════════════════════════════╝
# Chronological split: everything before VAL_START is train, the half-open
# interval [VAL_START, TEST_START) is validation, the rest is test.
VAL_START  = "2019-01-01"
TEST_START = "2021-01-01"

train_mask = fechas_X < VAL_START
val_mask   = (fechas_X >= VAL_START) & (fechas_X < TEST_START)
test_mask  = fechas_X >= TEST_START

# Apply each boolean mask to features, targets and dates alike.
X_train = X[train_mask]
y_train = y[train_mask]
X_val   = X[val_mask]
y_val   = y[val_mask]
X_test  = X[test_mask]
y_test  = y[test_mask]

fechas_train = fechas_X[train_mask]
fechas_val   = fechas_X[val_mask]
fechas_test  = fechas_X[test_mask]

# Report the shape and the covered date range of every split.
print(f"Train: {X_train.shape}  {fechas_train.min()} → {fechas_train.max()}")
print(f"Val  : {X_val.shape}    {fechas_val.min()} → {fechas_val.max()}")
print(f"Test : {X_test.shape}   {fechas_test.min()} → {fechas_test.max()}")

Train: (2091, 60, 81)  2012-08-15 00:00:00 → 2018-12-31 00:00:00
Val  : (731, 60, 81)    2019-01-01 00:00:00 → 2020-12-31 00:00:00
Test : (1633, 60, 81)   2021-01-01 00:00:00 → 2025-06-21 00:00:00


In [11]:
# ╔════════════════════════════════════════════════════════════════╗
# 3 · LIMPIAR TRAIN DE NaNs / Inf
# ╚════════════════════════════════════════════════════════════════╝
# Keep only the training windows whose features AND targets are entirely finite.
# np.isfinite rejects NaN as well as ±Inf, which is what the cell title promises;
# the previous np.isnan test let infinite values through to the scalers.
# Assumes X_train is 3-D (samples, steps, features) and y_train is 2-D, as the
# shapes printed when the data was loaded indicate.
mask_valid          = (
    np.isfinite(X_train).all(axis=(1, 2))
    & np.isfinite(y_train).all(axis=1)
)
X_train, y_train    = X_train[mask_valid], y_train[mask_valid]
# Filter the dates with the same mask so they stay aligned row-by-row with X_train.
fechas_train        = fechas_train[mask_valid]
print("Train limpio:", X_train.shape)


Train limpio: (2084, 60, 81)


In [13]:
# ╔════════════════════════════════════════════════════════════════╗
# 4 · ESCALADO  (features y target)
# ╚════════════════════════════════════════════════════════════════╝
# ── Features (X) ─────────────────────────────────────────────────
def _scale_windows(scaler_fn, windows):
    """Apply a 2-D scaler method to a 3-D array.

    The array is flattened to (rows, n_features) using its last axis as the
    feature axis, passed through ``scaler_fn`` and reshaped back to the
    original shape.
    """
    flat = windows.reshape(-1, windows.shape[2])
    return scaler_fn(flat).reshape(windows.shape)


scaler_X = StandardScaler()

# Statistics are fitted on the training split only and reused for val/test.
X_train_scaled = _scale_windows(scaler_X.fit_transform, X_train)
X_val_scaled   = _scale_windows(scaler_X.transform, X_val)
X_test_scaled  = _scale_windows(scaler_X.transform, X_test)

# ── Target (y) ───────────────────────────────────────────────────
scaler_y       = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train)
y_val_scaled   = scaler_y.transform(y_val)
y_test_scaled  = scaler_y.transform(y_test)

# ── Persist the scalers ──────────────────────────────────────────
models_dir = Path(cfg.MODELS)
models_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(scaler_X, cfg.MODELS / "scaler_X_lstm5d_vix.pkl")
joblib.dump(scaler_y, cfg.MODELS / "scaler_y_lstm5d_vix.pkl")
print("✅ Escaladores guardados")

✅ Escaladores guardados


In [22]:
# ╔═════════════════════════════════════════════════════════════╗
# LIMPIEZA FINAL  ·  elimina filas con NaNs / Inf en X o y
# ╚═════════════════════════════════════════════════════════════╝
def _filtra_nan_inf(X_set, y_set, nombre):
    mask =  np.isfinite(X_set).all(axis=(1, 2)) \
          & np.isfinite(y_set).all(axis=1)          # True si la fila es sana
    n_drop = (~mask).sum()
    if n_drop:
        print(f"⚠️  {nombre}: se descartan {n_drop} muestras con NaN/Inf")
    return X_set[mask], y_set[mask]

# Raw (unscaled) splits.
X_train, y_train = _filtra_nan_inf(X_train, y_train, "Train")
X_val,   y_val   = _filtra_nan_inf(X_val,   y_val,   "Val")
X_test,  y_test  = _filtra_nan_inf(X_test,  y_test,  "Test")

# The scaled copies were built before this cell and are what the model and the
# evaluation actually consume, so they must be filtered too; otherwise the rows
# dropped above would still reach training. StandardScaler keeps NaNs in place
# when transforming, so the same finiteness rule is expected to select the same
# rows (checked by the length asserts below).
X_train_scaled, y_train_scaled = _filtra_nan_inf(X_train_scaled, y_train_scaled, "Train (escalado)")
X_val_scaled,   y_val_scaled   = _filtra_nan_inf(X_val_scaled,   y_val_scaled,   "Val (escalado)")
X_test_scaled,  y_test_scaled  = _filtra_nan_inf(X_test_scaled,  y_test_scaled,  "Test (escalado)")

# Check that everything is clean now
assert np.isfinite(X_train).all() and np.isfinite(y_train).all(), "Train aún tiene NaN/Inf"
assert np.isfinite(X_val  ).all() and np.isfinite(y_val  ).all(), "Val   aún tiene NaN/Inf"
assert np.isfinite(X_test ).all() and np.isfinite(y_test ).all(), "Test  aún tiene NaN/Inf"

# Same guarantee for the arrays that are fed to the network.
assert np.isfinite(X_train_scaled).all() and np.isfinite(y_train_scaled).all(), "Train escalado aún tiene NaN/Inf"
assert np.isfinite(X_val_scaled  ).all() and np.isfinite(y_val_scaled  ).all(), "Val escalado aún tiene NaN/Inf"
assert np.isfinite(X_test_scaled ).all() and np.isfinite(y_test_scaled ).all(), "Test escalado aún tiene NaN/Inf"

# Raw and scaled versions must stay aligned sample by sample.
assert len(X_train) == len(X_train_scaled) == len(y_train_scaled), "Train: crudo y escalado desalineados"
assert len(X_val)   == len(X_val_scaled)   == len(y_val_scaled),   "Val: crudo y escalado desalineados"
assert len(X_test)  == len(X_test_scaled)  == len(y_test_scaled),  "Test: crudo y escalado desalineados"
print("✅ Todos los sets están limpios")


✅ Todos los sets están limpios


In [15]:
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation, dropout and
# batch shuffling are repeatable across "Restart & Run All" executions.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Take the network dimensions from the arrays that are actually passed to fit().
n_steps, n_features = X_train_scaled.shape[1:]
n_outputs           = y_train_scaled.shape[1]
assert n_steps == cfg.WINDOW, (
    f"cfg.WINDOW={cfg.WINDOW} no coincide con la ventana de los datos ({n_steps})"
)

# Two stacked LSTM layers with dropout in between, followed by a linear head
# that emits one value per target column.
model = models.Sequential([
    layers.Input(shape=(n_steps, n_features)),   # window length × features per step
    layers.LSTM(64, return_sequences=True),      # full sequence is passed to the next LSTM
    layers.Dropout(0.2),
    layers.LSTM(32),
    layers.Dense(n_outputs)                      # one output per target column
])

model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss="mse")
model.summary()

# Stop after 5 epochs without val_loss improvement and roll back to the best weights.
early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

history = model.fit(
    X_train_scaled, y_train_scaled,
    validation_data=(X_val_scaled, y_val_scaled),
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)


Epoch 1/50
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 62ms/step - loss: 0.9956 - val_loss: 2.0562
Epoch 2/50
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 62ms/step - loss: 0.9003 - val_loss: 2.1060
Epoch 3/50
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 66ms/step - loss: 0.8503 - val_loss: 2.1275
Epoch 4/50
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 64ms/step - loss: 0.7667 - val_loss: 2.1576
Epoch 5/50
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 60ms/step - loss: 0.7336 - val_loss: 2.1756
Epoch 6/50
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 64ms/step - loss: 0.6802 - val_loss: 2.1947


In [17]:
# Make sure the results directory exists before anything is written into it
# (the models directory is created by the scaling cell, the results one was not).
Path(cfg.RESULT).mkdir(parents=True, exist_ok=True)

model.save(cfg.MODELS / "lstm5d_vix.keras")
print("✅ Modelo guardado")
joblib.dump(history.history, cfg.RESULT / "history_lstm5d_vix.pkl")

# Evaluate on the test split. Both operands are in the standardized target
# space (y_test_scaled comes from scaler_y), so the RMSE is expressed in
# standardized units, not in the original units of y.
y_pred      = model.predict(X_test_scaled, verbose=0)
rmse_vec    = np.sqrt(((y_test_scaled - y_pred) ** 2).mean(axis=0))   # one RMSE per output column
rmse_mean   = rmse_vec.mean()

joblib.dump(
    {"rmse_by_asset": rmse_vec, "rmse_mean": rmse_mean},
    cfg.RESULT / "rmse_lstm5d_vix.pkl"
)
print("RMSE medio:", rmse_mean)

✅ Modelo guardado
RMSE medio: 1.1982268


El valor de RMSE medio obtenido (≈ 1.20, medido sobre el target estandarizado) se justifica por la mayor complejidad del modelo LSTM al trabajar con ventanas de 60 días y 81 variables por muestra, lo cual introduce alta dimensionalidad y mayor varianza en los errores. Aunque el rendimiento es inferior al de otras configuraciones con menos variables (RMSE ≈ 0.14), este modelo captura dinámicas más ricas, lo que puede resultar valioso al combinarse con métodos evolutivos en la fase de optimización de carteras.