In [12]:
# ────────────────────────────────────────────────────────────────────
# CHECK DE DESFASE  |  ¿hay “look-ahead” en las fechas?
# ────────────────────────────────────────────────────────────────────
import sys, pathlib, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Project path ───────────────────────────────────────────────────────
# Put the repository root at the front of sys.path so `src` can be imported.
# NOTE(review): the root is taken to be two levels above the current working
# directory — confirm the kernel is started from the notebook's own folder.
PROJECT_ROOT = pathlib.Path.cwd().resolve().parent.parent
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(PROJECT_ROOT))
from src import config as cfg

# Load the processed bundle that was used for training.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts produced by this project.
d = joblib.load(cfg.DATA / "processed" / "lstm5d_data.pkl")
X = d["X"]                              # original note: X.shape = (n, win, n_feats)
y = d["y"]
dates = pd.to_datetime(d["dates"])      # original note: "representative" date = last day of the window — TODO confirm

# Real daily log-returns from the raw price table; any row with a NaN is dropped.
df_px = pd.read_parquet(cfg.DATA / "raw" / "prices.parquet")
ret1d = np.log(df_px.div(df_px.shift(1))).dropna()

# Real "future" target (t+1): after shift(-1) each date holds the following
# row's return. The result is then looked up on the sample dates and rows
# with missing values are discarded.
# NOTE(review): NaN rows were dropped before the shift, so "following row"
# means the next surviving row, which may not be the next calendar day.
y_check = ret1d.shift(-1)
y_check = y_check.reindex(dates)
y_check = y_check.dropna()

# ── 1) Keep only the dates present in both series ─────────────────────
common_dates = dates.intersection(y_check.index)
# The mask has one entry per element of `dates`, so the filtered arrays keep
# the original sample order.
mask_common = np.isin(dates, common_dates)

# Apply the same mask to every per-sample object so they stay row-aligned.
X_common, y_common, dates_common = X[mask_common], y[mask_common], dates[mask_common]
# Label-based lookup: rows come back in the order of dates_common.
y_check_comm = y_check.loc[dates_common]

print("¿Índices iguales?", dates_common.equals(y_check_comm.index))
print("Longitudes:", len(dates_common), "/", len(y_check_comm))

# ── 2) Drop samples with NaN / Inf in X (after alignment!) ─────────────
# A sample is kept only when every value across its last two axes is finite,
# i.e. it contains neither NaN nor +/-Inf.
mask_clean = np.isfinite(X_common).all(axis=(1, 2))

X_ok, y_ok, dates_ok = (
    X_common[mask_clean],
    y_common[mask_clean],
    dates_common[mask_clean],
)
# Label-based lookup keeps the reference target aligned with dates_ok.
y_check_ok = y_check_comm.loc[dates_ok]

print("Final shapes  X:", X_ok.shape, " y:", y_ok.shape)
print("¿Índices finales iguales?", dates_ok.equals(y_check_ok.index))


¿Índices iguales? True
Longitudes: 2641 / 2641
Final shapes  X: (2634, 60, 80)  y: (2634, 40)
¿Índices finales iguales? True


In [16]:
# ------------------------------------------------------------
# (1)   DATES   ── convert them to a pandas datetime object
# ------------------------------------------------------------
dates = pd.to_datetime(d["dates"])             # expected to be 1-to-1 with X and y

# ------------------------------------------------------------
# (2)   GLOBAL CLEAN-UP  ── discard samples with NaN/Inf in X
# ------------------------------------------------------------
# A sample is kept only when every value across its last two axes is finite.
mask_good = np.isfinite(X).all(axis=(1, 2))

# NOTE(review): X_ok / y_ok / dates_ok are rebound here; an earlier cell in
# this file assigns the same names to a different, date-aligned subset.
X_ok, y_ok, dates_ok = X[mask_good], y[mask_good], dates[mask_good]

print("Dims OK:", X_ok.shape, y_ok.shape, len(dates_ok))   # leading sizes must agree

# ------------------------------------------------------------
# (3)   SPLITS (train/val/test) built from dates_ok ONLY
# ------------------------------------------------------------
dates_ok = pd.DatetimeIndex(dates_ok)          # guarantees datetime comparisons below

# Chronological half-open ranges:
#   train: date < 2019-01-01
#   val  : 2019-01-01 <= date < 2021-01-01
#   test : date >= 2021-01-01
train_mask = dates_ok < "2019-01-01"
val_mask = (dates_ok < "2021-01-01") & (dates_ok >= "2019-01-01")
test_mask = dates_ok >= "2021-01-01"

# The same boolean mask indexes X and y so each split stays row-aligned.
X_train = X_ok[train_mask]
y_train = y_ok[train_mask]
X_val = X_ok[val_mask]
y_val = y_ok[val_mask]
X_test = X_ok[test_mask]
y_test = y_ok[test_mask]

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)

# ------------------------------------------------------------
# (4)   NEW StandardScaler fitted on TRAIN ONLY
# ------------------------------------------------------------
# Collapse the leading axes so every row has X_train.shape[2] columns; the
# scaler then learns one mean/scale per column of that last axis.
sc_new = StandardScaler()
sc_new.fit(X_train.reshape(-1, X_train.shape[2]))

# Load the production scaler.
# NOTE(review): joblib.load unpickles the file — only load trusted artifacts.
sc_prod = joblib.load(cfg.MODELS / "scaler_X_lstm5d.pkl")

print("\n─ COMPARACIÓN DE ESCALADORES ─")
print("Media  new :", sc_new.mean_[:5])
print("Media prod:", sc_prod.mean_[:5])
print("Std    new :", sc_new.scale_[:5])
print("Std    prod:", sc_prod.scale_[:5])

# The slices above show only the first 5 entries, so a difference further
# along the vectors would go unnoticed. Compare every entry; the shape guard
# avoids a broadcasting error when the two scalers have different sizes.
same_shape = np.shape(sc_new.mean_) == np.shape(sc_prod.mean_)
means_match = bool(same_shape and np.allclose(sc_new.mean_, sc_prod.mean_))
scales_match = bool(same_shape and np.allclose(sc_new.scale_, sc_prod.scale_))
print("¿Mismo nº de features?", same_shape)
print("¿Medias iguales (todas)?", means_match)
print("¿Stds iguales (todas)?  ", scales_match)


Dims OK: (4448, 60, 80) (4448, 40) 4448
Train: (2084, 60, 80) Val: (731, 60, 80) Test: (1633, 60, 80)

─ COMPARACIÓN DE ESCALADORES ─
Media  new : [-0.00842533  0.00633255  0.02304093  0.0168192   0.01190116]
Media prod: [-0.00842533  0.00633255  0.02304093  0.0168192   0.01190116]
Std    new : [0.89325047 0.88864062 0.81899326 0.93818502 0.87703659]
Std    prod: [0.89325047 0.88864062 0.81899326 0.93818502 0.87703659]
