In [3]:
# ───────────────────────────────────────────────────────────────────────────────
# PREPROCESS CNN-5d   (notebook cell)
# ───────────────────────────────────────────────────────────────────────────────
import sys, pathlib, joblib, pandas as pd, numpy as np
# PROJECT_ROOT is the directory two levels above the current working directory.
# It is prepended to sys.path (once) so that the `src` package imported just
# below can be resolved -- presumably the notebook lives two folders deep in the
# repository; verify if the notebook is moved.
PROJECT_ROOT = pathlib.Path().resolve().parent.parent
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

from src import config as cfg     # contiene WINDOW, rutas, etc.

# 1. Clean prices ----------------------------------------------------------------
# Read the raw price table, forward-fill gaps, then drop every row that still
# contains a NaN (forward-fill cannot fill rows that precede a column's first
# observation).
prices_path = cfg.DATA / "raw" / "prices.parquet"
df = pd.read_parquet(prices_path)
df = df.ffill()
df = df.dropna()

# 2. Returns and features ---------------------------------------------------------
# Daily log-returns; the first row (no previous price) is dropped.
ret = np.log(df / df.shift(1)).dropna()

# 5-day rolling aggregates of the daily log-returns.
roll5 = ret.rolling(5)
ret5 = roll5.sum()      # cumulative 5-day log-return ending at each row
vol5 = roll5.std()      # 5-day rolling standard deviation

# Volatility-scaled 5-day return; 1e-6 avoids division by zero and the result
# is bounded to [-10, 10].
momentum = ret5.div(vol5 + 1e-6).clip(lower=-10, upper=10)

# 3. Target = future 5-day return -------------------------------------------------
# Row t receives ret5 from row t+5, i.e. the sum of the next five daily
# returns; rows without a complete future window are dropped.
y_future = ret5.shift(-5).dropna()

# 4. Lag features by 1 day (avoid look-ahead) -------------------------------------
# After the shift, row t holds the value observed at row t-1. Both frames are
# then aligned to the target's index.
# NOTE(review): the windows built afterwards stop at row i-1 of these lagged
# frames, so the newest feature for target row i comes from row i-2 of the
# unlagged series -- confirm this extra one-day gap is intended.
target_index = y_future.index
ret = ret.shift(1).reindex(target_index)
momentum = momentum.shift(1).reindex(target_index)

# 5. Sliding windows (channels_last) ----------------------------------------------
# Sample i pairs the cfg.WINDOW feature rows immediately preceding row i
# (rows i-WINDOW .. i-1 of `ret` and `momentum`) with row i of `y_future`.
# The two features are stacked on the last axis as channels.

# Pull the raw arrays once instead of slicing the DataFrames on every iteration.
ret_values    = ret.to_numpy()          # (T, n_assets)
mom_values    = momentum.to_numpy()     # (T, n_assets)
target_values = y_future.to_numpy()     # (T, n_assets)

X, y_list, dates = [], [], []
for i in range(cfg.WINDOW, len(y_future)):
    win_ret = ret_values[i - cfg.WINDOW:i]                # (W, n_assets)
    win_mom = mom_values[i - cfg.WINDOW:i]                # (W, n_assets)
    bloque  = np.stack([win_ret, win_mom], axis=-1)       # (W, n_assets, 2)
    target  = target_values[i]                            # (n_assets,)

    # Keep only fully finite samples. np.isfinite rejects NaN *and* +/-inf
    # (e.g. the log of a zero price ratio), which a NaN-only check would let
    # through into the saved dataset; the target row is validated as well.
    if np.isfinite(bloque).all() and np.isfinite(target).all():
        X.append(bloque.astype(np.float32))
        y_list.append(target.astype(np.float32))
        dates.append(y_future.index[i])

X_arr  = np.asarray(X, dtype=np.float32)                 # (N, W, n_assets, 2)
y_arr  = np.asarray(y_list, dtype=np.float32)            # (N, n_assets)
dates  = pd.to_datetime(dates)

# Features, targets and dates must stay aligned sample by sample.
assert len(X_arr) == len(y_arr) == len(dates)

print(f"✅ X shape  : {X_arr.shape}")
print(f"✅ y shape  : {y_arr.shape}")
print(f"✅ Nº fechas: {len(dates)}")


✅ X shape  : (4450, 60, 40, 2)
✅ y shape  : (4450, 40)
✅ Nº fechas: 4450


In [5]:

# 6. Save dataset ------------------------------------------------------------------
# Persist the arrays together with the ticker order (the column order of `df`,
# from which the asset axis of X and y is derived) and the date of each sample.
out_path = cfg.DATA / "processed" / "cnn5d_data.pkl"

# Create the target directory first: the dump fails with FileNotFoundError if
# the "processed" folder is missing (e.g. on a fresh checkout).
pathlib.Path(out_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(
    {"X": X_arr, "y": y_arr, "tickers": df.columns.tolist(), "dates": dates},
    out_path,
)
print("✅ Dataset cnn5d_data.pkl guardado.")


✅ Dataset cnn5d_data.pkl guardado.
