In [15]:
import sys, pathlib
import pandas as pd
import numpy as np
import joblib

# Añadir src/ al path para poder importar config
PROJECT_ROOT = pathlib.Path().resolve().parent.parent  # estamos en /notebooks/lstm5d
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src import config as cfg
from sklearn.preprocessing import StandardScaler   

In [16]:
# Leer precios desde parquet
df = pd.read_parquet(cfg.DATA / "raw" / "prices.parquet")
df = df.ffill().dropna()

# Cargar VIX y alinearlo con los precios
vix = pd.read_parquet(cfg.DATA / "raw" / "vix.parquet").squeeze()
vix = vix.ffill().reindex(df.index).ffill()
vix_t1 = vix.shift(1).rename('vix_t-1')

# Calcular retornos logarítmicos
ret = np.log(df / df.shift(1)).dropna()
ret.head()

Ticker,AAPL,ABT,ADBE,AMZN,BAC,BTC-USD,COST,CRM,CSCO,CVX,...,PFE,PG,T,TSLA,UNH,V,VZ,WFC,WMT,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-05-21,0.056626,0.00389,0.022111,0.019725,-0.027439,0.0,0.00948,0.023556,0.012071,0.012415,...,-0.00133,-0.002049,-0.000892,0.042968,0.028485,0.031288,-0.004585,0.014758,0.009724,0.006972
2012-05-22,-0.007708,0.002908,0.0,-0.012828,0.021725,0.0,-0.004189,0.00268,0.003593,-0.003719,...,-0.007571,-0.003793,-0.00268,0.068181,0.003055,0.0185,0.001209,0.008562,0.010886,-0.001097
2012-05-23,0.024107,-0.010052,0.005297,0.009015,0.026856,0.0,0.002464,0.007732,-0.002394,-0.003227,...,-0.012596,-0.012108,-0.005381,0.007118,-0.006482,0.006567,-0.002661,0.002208,0.013249,0.001097
2012-05-24,-0.009226,0.016965,-0.020089,-0.009433,-0.004193,0.0,0.013947,-0.02965,-0.018138,0.01065,...,0.002261,0.002881,0.008358,-0.024145,0.015415,0.005022,0.002661,0.002203,0.007559,0.006924
2012-05-25,-0.005374,-0.002728,0.001901,-0.010978,0.0014,0.0,0.0,0.005389,-0.003667,-0.012065,...,-0.000452,-0.001279,0.001485,-0.015644,-0.00178,-0.003346,0.001449,0.00157,0.003682,-0.006436


In [19]:
# Retorno acumulado 5 días
ret5 = ret.rolling(5).sum()

# Volatilidad 5 días
vol5 = ret.rolling(5).std()

# Momentum = retorno / volatilidad
momentum = (ret5 / (vol5 + 1e-6)).clip(-10, 10)

# Target: retorno 5d futuro (desde t+1 a t+5)
y = ret5.shift(-5).dropna()

# Alinear features al target
ret = ret.shift(1).reindex(y.index)
momentum = momentum.shift(1).reindex(y.index)
vix_t1 = vix_t1.reindex(y.index)

# Visual check
ret.tail(3), momentum.tail(3), y[-3:]


(Ticker          AAPL       ABT      ADBE      AMZN       BAC   BTC-USD  \
 Date                                                                     
 2025-06-19  0.004793  0.000982 -0.012199 -0.010764  0.018592  0.002694   
 2025-06-20  0.000000  0.000000  0.000000  0.000000  0.000000 -0.001900   
 2025-06-21  0.022235  0.004371 -0.002967 -0.013406  0.009717 -0.013219   
 
 Ticker          COST       CRM      CSCO       CVX  ...       PFE        PG  \
 Date                                                ...                       
 2025-06-19 -0.002888 -0.010900  0.007776 -0.004444  ... -0.005013 -0.003159   
 2025-06-20  0.000000  0.000000  0.000000  0.000000  ...  0.000000  0.000000   
 2025-06-21  0.005513  0.004345  0.007264  0.009136  ...  0.003762  0.006686   
 
 Ticker             T      TSLA       UNH         V       VZ       WFC  \
 Date                                                                    
 2025-06-19  0.000362  0.017858 -0.005421 -0.050023 -0.00431  0.030429   

In [23]:
# Target: retorno 5d futuro (desde t+1 a t+5)
y = ret5.shift(-5).dropna()

# Alinear features al target
ret = ret.shift(1).reindex(y.index)
momentum = momentum.shift(1).reindex(y.index)

X = []
y_out = []
dates = []

for i in range(cfg.WINDOW, len(ret)):
    ventana_ret = ret.iloc[i - cfg.WINDOW:i].values
    ventana_mom = momentum.iloc[i - cfg.WINDOW:i].values
    ventana_vix = vix_t1.iloc[i - cfg.WINDOW:i].values.reshape(-1, 1)
    bloque = np.concatenate([ventana_ret, ventana_mom, ventana_vix], axis=1)

    X.append(bloque)
    y_out.append(y.iloc[i])      # ← ahora sí funciona
    dates.append(y.index[i])     # ← fecha del target

X = np.array(X, dtype=np.float32)
y = np.array(y_out, dtype=np.float32)
dates = pd.to_datetime(dates)



print(f"✅ X shape: {X.shape}")
print(f"✅ y shape: {y.shape}")


✅ X shape: (4455, 60, 81)
✅ y shape: (4455, 40)


In [25]:
# Ver una muestra
pd.DataFrame(X[0]).iloc[:5]  # primeros 5 días de la primera muestra

# Ver un target
pd.Series(y[0], index=df.columns).sort_values(ascending=False).head()


Ticker
CSCO    0.102358
AAPL    0.058553
BAC     0.043512
NFLX    0.033269
CRM     0.030677
dtype: float32

In [28]:
# ─────────────────────────────────────────────
# 4. ESCALADO (features y target)
# ─────────────────────────────────────────────
from pathlib import Path      #  ← AÑADE ESTA LÍNEA

n_feat = X.shape[2]

# ---- escalador de X ----
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X.reshape(-1, n_feat)).reshape(X.shape)

# ---- escalador de y ----
scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y)     # y.shape = (n_muestras, n_assets)

# ---- guarda los escaladores ----
Path(cfg.MODELS).mkdir(parents=True, exist_ok=True)
joblib.dump(scaler_X, cfg.MODELS / "scaler_X_lstm5d_vix.pkl")
joblib.dump(scaler_y, cfg.MODELS / "scaler_y_lstm5d_vix.pkl")
print("✅ Escaladores guardados (lstm5d)")


✅ Escaladores guardados (lstm5d)


In [30]:
joblib.dump(
    {
        "X": X_scaled.astype(np.float32),
        "y": y_scaled.astype(np.float32),
        "tickers": df.columns.tolist(),
        "dates": dates                      # la lista creada en el bucle
    },
    cfg.DATA / "processed" / "lstm5d_vix.pkl"
)
print("✅ Datos lstm5d listos y guardados")


✅ Datos lstm5d listos y guardados


📝 Preprocesamiento LSTM-5d con momentum
En este notebook se preparan los datos para entrenar un modelo LSTM que predice el retorno acumulado a 5 días, en lugar del retorno diario del modelo baseline. Además, se añade una nueva feature de momentum, calculada como:


Esto permite al modelo captar tendencias de corto plazo y reducir el ruido.

Se genera:

X: secuencia de 60 días de retornos + momentum por activo + VIX.

y: retorno acumulado futuro (5 días).

Se guarda en lstm5d_vix.pkl para su posterior entrenamiento.

Este paso es necesario porque el modelo usa un objetivo y unas entradas diferentes al baseline y no sería válido reutilizar sus datos.