In [2]:
# ─── Importaciones base ─────────────────────────────
import sys, pathlib
# Put the parent of the current working directory at the front of sys.path
# (only once) so that project packages can be imported from this notebook.
PROJECT_ROOT = pathlib.Path().resolve().parent
root_entry = str(PROJECT_ROOT)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

from src import config as cfg

import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler

# ─── Load prices ────────────────────────────────────
# Read the raw price table and order its rows by index.
prices_path = cfg.DATA / "raw" / "prices.parquet"
df_prices = pd.read_parquet(prices_path)
df_prices = df_prices.sort_index()

print("✅ Precios cargados con shape:", df_prices.shape)


✅ Precios cargados con shape: (5120, 40)


In [4]:
# Keep only the rows (dates) that have a non-null price for at least
# 80% of the columns (assets); sparser rows are discarded.
n_assets = len(df_prices.columns)
min_valid_assets = int(n_assets * 0.8)
df_filtered = df_prices.dropna(axis=0, thresh=min_valid_assets)

print("✅ Fechas válidas:", df_filtered.shape)


✅ Fechas válidas: (3894, 40)


In [6]:
# Fill short gaps (e.g. market holidays): forward-fill first, then back-fill,
# each limited to 5 consecutive missing rows per column.
# NOTE(review): bfill copies a *later* price into earlier dates, i.e. it uses
# future information — confirm this is acceptable for the downstream model.
df_filled = df_filtered.ffill(limit=5).bfill(limit=5)

# Drop assets that still have too many gaps: a column is kept only when at
# least 83% of its rows are non-null (more than ~17% missing -> dropped).
min_valid_rows = int(len(df_filled) * 0.83)
df_filled = df_filled.dropna(axis=1, thresh=min_valid_rows)

tickers_original = df_prices.columns.tolist()
tickers_final    = df_filled.columns.tolist()
# Build the dropped list in the original column order. A plain set difference
# has an arbitrary order that can change between kernel sessions, which makes
# the printed report non-reproducible.
kept_tickers = set(tickers_final)
tickers_dropped = [t for t in tickers_original if t not in kept_tickers]

print("❌ Activos eliminados por huecos excesivos:", tickers_dropped)

print("✅ Sin huecos graves. Dimensión final:", df_filled.shape)


❌ Activos eliminados por huecos excesivos: []
✅ Sin huecos graves. Dimensión final: (3894, 40)


In [8]:
# Log returns between consecutive rows. shift(1) leaves the first row as NaN,
# and dropna() removes every row that contains at least one NaN.
df_ret = np.log(df_filled / df_filled.shift(1)).dropna()
print("✅ Retornos calculados:", df_ret.shape)

# Fit the scaler ONLY on the pre-cutoff window. Fitting on the full history
# would leak the mean/std of later observations into the features (and into
# the persisted scaler). The whole frame is then transformed with those
# statistics, so later dates are scaled consistently without influencing them.
# NOTE(review): assumes data from 2019-01-01 onwards is held out — confirm.
TRAIN_END = "2019-01-01"  # exclusive; must match the cut applied when building the LSTM windows
train_mask = df_ret.index < TRAIN_END

scaler = StandardScaler()
scaler.fit(df_ret[train_mask])

df_scaled = pd.DataFrame(
    scaler.transform(df_ret),
    index=df_ret.index,
    columns=df_ret.columns,
)

print("✅ Datos normalizados (media ≈ 0, std ≈ 1)")



✅ Retornos calculados: (3299, 40)
✅ Datos normalizados (media ≈ 0, std ≈ 1)


In [10]:
# Persist the fitted scaler under the processed-data folder.
# The call stays as the last expression so the written path is displayed.
scaler_path = cfg.DATA / "processed" / "ret_scaler.pkl"
joblib.dump(scaler, scaler_path)

['C:\\Users\\ferra\\Documents\\TFM\\data\\processed\\ret_scaler.pkl']

In [12]:
# Truncate both frames to dates strictly before 2019-01-01
# (i.e. through 2018-12-31).
df_ret = df_ret[df_ret.index < "2019-01-01"]
df_scaled = df_scaled.loc[df_ret.index]  # keep the same date window as df_ret

WINDOW = cfg.WINDOW
HORIZON = cfg.TARGET_HORIZON

vals = df_scaled.values   # scaled returns    -> inputs (X)
targets = df_ret.values   # unscaled returns  -> targets (y)

# For each end position i, the input is rows [i - WINDOW, i) of the scaled
# returns and the target is the unscaled row at i + HORIZON - 1, so a window
# never contains its own target row.
sample_ends = range(WINDOW, len(vals) - HORIZON + 1)
if len(sample_ends) == 0:
    # Without this guard np.array([]) would yield shape-(0,) arrays that lose
    # the (samples, WINDOW, assets) layout and only fail later, far from here.
    raise ValueError(
        f"Not enough rows ({len(vals)}) to build a single sample with "
        f"WINDOW={WINDOW} and HORIZON={HORIZON}."
    )

X = np.array([vals[i - WINDOW:i] for i in sample_ends], dtype=np.float32)
y = np.array([targets[i + HORIZON - 1] for i in sample_ends], dtype=np.float32)

print("✅ Tensores generados:", "X:", X.shape, "| y:", y.shape)

✅ Tensores generados: X: (1609, 60, 40) | y: (1609, 40)


In [14]:
# Bundle the tensors with the column (ticker) order they were built from and
# write everything to a single pickle in the processed-data folder.
lstm_data_path = cfg.DATA / "processed" / "lstm_data.pkl"
lstm_payload = {"X": X, "y": y, "tickers": df_ret.columns.tolist()}
joblib.dump(lstm_payload, lstm_data_path)

print("✅ Datos listos y guardados en:", lstm_data_path)


✅ Datos listos y guardados en: C:\Users\ferra\Documents\TFM\data\processed\lstm_data.pkl
