In [4]:
import sys, pathlib
import pandas as pd
import numpy as np
import joblib

# Make the project importable: the directory two levels above the current
# working directory is placed at the front of sys.path (only once), so the
# `src` package imported below can be located.
PROJECT_ROOT = pathlib.Path.cwd().resolve().parent.parent
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))

from src import config as cfg

# === 1. Load prices ===
# Forward-fill gaps, then drop every row that still contains a missing value
# (after ffill these can only be leading rows of some column).
df = pd.read_parquet(cfg.DATA / "raw" / "prices.parquet")
df = df.ffill().dropna()

# === 2. Returns and features ===
# Number of rows used both for the rolling feature statistics and for the
# forward-looking target below.
HORIZON = 5

ret = np.log(df / df.shift(1)).dropna()  # row-over-row log returns, per column
ret5 = ret.rolling(HORIZON).sum()        # cumulative log return over the last HORIZON rows
vol5 = ret.rolling(HORIZON).std()        # rolling standard deviation of returns
# Volatility-scaled rolling return. The 1e-6 keeps the denominator away from
# zero; the result is clipped to [-10, 10] to bound extreme ratios.
momentum = (ret5 / (vol5 + 1e-6)).clip(-10, 10)

# === 3. Target ===
# y at row t equals ret5 at row t + HORIZON, i.e. the sum of the HORIZON log
# returns that follow row t. The trailing rows without a full future window
# become NaN after the shift and are dropped.
y = ret5.shift(-HORIZON).dropna()

# === 4. Align features with the target ===
# Lag the features by one row and restrict them to the rows that have a target.
# NOTE(review): combined with the window below (which ends at row i - 1,
# exclusive of i), the newest return inside a sample comes from two rows before
# y.index[i]. There is no look-ahead, but confirm the extra lag is intended.
ret = ret.shift(1).reindex(y.index)
momentum = momentum.shift(1).reindex(y.index)

# === 5. Build windows ===
# Stack the two feature groups side by side once (returns first, then
# momentum) instead of re-slicing and concatenating DataFrames on every
# iteration; each window is then a plain row slice of this matrix.
features = np.concatenate([ret.to_numpy(), momentum.to_numpy()], axis=1)
targets = y.to_numpy()

X, y_clean, dates = [], [], []

for i in range(cfg.WINDOW, len(y)):
    bloque = features[i - cfg.WINDOW:i]
    objetivo = targets[i]
    # Keep only samples that are entirely finite. np.isnan alone would let
    # +/-inf through (np.log of a non-positive price ratio yields inf, which
    # dropna does not remove), so both window and target are checked with
    # np.isfinite.
    if np.isfinite(bloque).all() and np.isfinite(objetivo).all():
        X.append(bloque)
        y_clean.append(objetivo)
        dates.append(y.index[i])

# Final arrays: X is (samples, cfg.WINDOW, 2 * n_columns), y is (samples, n_columns).
X = np.array(X, dtype=np.float32)
y = np.array(y_clean, dtype=np.float32)
dates = pd.to_datetime(dates)

print(f"✅ X shape: {X.shape}")
print(f"✅ y shape: {y.shape}")
print(f"✅ Nº fechas: {len(dates)}")

✅ X shape: (4450, 60, 80)
✅ y shape: (4450, 40)
✅ Nº fechas: 4450


In [6]:
# === 6. Save processed data ===
# Bundle the feature/target arrays with the price-frame column labels and the
# per-sample dates into a single joblib file.
# NOTE(review): assumes cfg.DATA is a pathlib.Path (it is combined with "/"
# above) — confirm in src/config.
out_path = cfg.DATA / "processed" / "cnn5d_data.pkl"
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists (no-op when it is already there).
out_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump({
    "X": X,
    "y": y,
    "tickers": df.columns.tolist(),
    "dates": dates
}, out_path)

print("✅ Dataset CNN5d guardado.")

✅ Dataset CNN5d guardado.
