In [10]:
import sys, pathlib
import pandas as pd
import numpy as np
import joblib

# Añadir src/ al path para poder importar config
PROJECT_ROOT = pathlib.Path().resolve().parent.parent  # estamos en /notebooks/lstm5d
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src import config as cfg


In [12]:
# Leer precios desde parquet
df = pd.read_parquet(cfg.DATA / "raw" / "prices.parquet")
df = df.ffill().dropna()

# Calcular retornos logarítmicos
ret = np.log(df / df.shift(1)).dropna()
ret.head()

Ticker,AAPL,ABT,ADBE,AMZN,BAC,BTC-USD,COST,CRM,CSCO,CVX,...,PFE,PG,T,TSLA,UNH,V,VZ,WFC,WMT,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-05-21,0.056626,0.00389,0.022111,0.019725,-0.027439,0.0,0.00948,0.023556,0.012071,0.012415,...,-0.00133,-0.002049,-0.000892,0.042968,0.028485,0.031288,-0.004585,0.014758,0.009724,0.006972
2012-05-22,-0.007708,0.002908,0.0,-0.012828,0.021725,0.0,-0.004189,0.00268,0.003593,-0.003719,...,-0.007571,-0.003793,-0.00268,0.068181,0.003055,0.0185,0.001209,0.008562,0.010886,-0.001097
2012-05-23,0.024107,-0.010052,0.005297,0.009015,0.026856,0.0,0.002464,0.007732,-0.002394,-0.003227,...,-0.012596,-0.012108,-0.005381,0.007118,-0.006482,0.006567,-0.002661,0.002208,0.013249,0.001097
2012-05-24,-0.009226,0.016965,-0.020089,-0.009433,-0.004193,0.0,0.013947,-0.02965,-0.018138,0.01065,...,0.002261,0.002881,0.008358,-0.024145,0.015415,0.005022,0.002661,0.002203,0.007559,0.006924
2012-05-25,-0.005374,-0.002728,0.001901,-0.010978,0.0014,0.0,0.0,0.005389,-0.003667,-0.012065,...,-0.000452,-0.001279,0.001485,-0.015644,-0.00178,-0.003346,0.001449,0.00157,0.003682,-0.006436


In [25]:
# Retorno acumulado 5 días
ret5 = ret.rolling(5).sum()

# Volatilidad 5 días
vol5 = ret.rolling(5).std()

# Momentum = retorno / volatilidad
momentum = (ret5 / (vol5 + 1e-6)).clip(-10, 10)

# Target: retorno 5d futuro (desde t+1 a t+5)
y = ret5.shift(-5).dropna()

# Alinear features al target
ret = ret.shift(1).reindex(y.index)
momentum = momentum.shift(1).reindex(y.index)

# Visual check
ret.tail(3), momentum.tail(3), y[-3:]


(Ticker          AAPL       ABT      ADBE      AMZN       BAC   BTC-USD  \
 Date                                                                     
 2025-06-14  0.002111  0.008612  0.002033  0.000188 -0.002462 -0.025699   
 2025-06-15 -0.013901 -0.006101 -0.054648 -0.005360 -0.011949  0.001527   
 2025-06-16  0.000000  0.000000  0.000000  0.000000  0.000000 -0.005848   
 
 Ticker          COST       CRM      CSCO       CVX  ...       PFE        PG  \
 Date                                                ...                       
 2025-06-14  0.005932  0.003754  0.014077  0.001311  ...  0.014599  0.006579   
 2025-06-15 -0.012545 -0.032403 -0.015636  0.006463  ... -0.012151 -0.017932   
 2025-06-16  0.000000  0.000000  0.000000  0.000000  ...  0.000000  0.000000   
 
 Ticker             T      TSLA       UNH         V        VZ       WFC  \
 Date                                                                     
 2025-06-14 -0.002473 -0.022680  0.025245 -0.005130 -0.013349 -0.012354

In [27]:
X = []
for i in range(cfg.WINDOW, len(y)):
    ventana_ret = ret.iloc[i - cfg.WINDOW:i].values       # (60, N)
    ventana_mom = momentum.iloc[i - cfg.WINDOW:i].values  # (60, N)
    bloque = np.concatenate([ventana_ret, ventana_mom], axis=1)  # (60, 2N)
    X.append(bloque)

X = np.stack(X)  # shape = (n_muestras, 60, 2N)
y = y.iloc[cfg.WINDOW:].values  # shape = (n_muestras, N)

print(f"✅ X shape: {X.shape}")
print(f"✅ y shape: {y.shape}")


✅ X shape: (4450, 60, 80)
✅ y shape: (4450, 40)


In [29]:
# Ver una muestra
pd.DataFrame(X[0]).iloc[:5]  # primeros 5 días de la primera muestra

# Ver un target
pd.Series(y[0], index=df.columns).sort_values(ascending=False).head()


Ticker
CSCO    0.109661
NFLX    0.060320
BAC     0.051358
ADBE    0.049813
AAPL    0.037853
dtype: float64

In [36]:
joblib.dump({
    "X": X,
    "y": y,
    "tickers": df.columns.tolist()
}, cfg.DATA / "processed" / "lstm5d_data.pkl")

loaded = joblib.load(cfg.DATA / "processed" / "lstm5d_data.pkl")
print("Tickers guardados:", len(loaded["tickers"]))  # ✅ Debe ser 80



Tickers guardados: 40


📝 Preprocesamiento LSTM-5d con momentum
En este notebook se preparan los datos para entrenar un modelo LSTM que predice el retorno acumulado a 5 días, en lugar del retorno diario del modelo baseline. Además, se añade una nueva feature de momentum, calculada como:


Esto permite al modelo captar tendencias de corto plazo y reducir el ruido.

Se genera:

X: secuencia de 60 días de retornos + momentum por activo.

y: retorno acumulado futuro (5 días).

Se guarda en lstm5d_data.pkl para su posterior entrenamiento.

Este paso es necesario porque el modelo usa un objetivo y unas entradas diferentes al baseline y no sería válido reutilizar sus datos.