In [1]:
import torch, lightning, pytorch_forecasting as pf, tensorflow as tf, numpy
# Report the installed version of each core library so the environment
# behind this run is recorded alongside the results.
for label, module in (
    ("Torch      :", torch),
    ("Lightning  :", lightning),
    ("Forecasting:", pf),
    ("NumPy      :", numpy),
    ("TensorFlow :", tf),
):
    print(label, module.__version__)

Torch      : 2.1.2+cpu
Lightning  : 2.5.2
Forecasting: 1.4.0
NumPy      : 1.26.4
TensorFlow : 2.16.2


In [5]:
import pandas as pd, numpy as np, pathlib, sys, warnings
from sklearn.preprocessing import StandardScaler
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data.encoders import TorchNormalizer
import torch, pickle

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# ----------------------------------------------------------
# Path configuration
# ----------------------------------------------------------
# The project root is taken to be two levels above the current working
# directory; it is prepended to sys.path only when not already listed.
ROOT = pathlib.Path().resolve().parent.parent
if not any(entry == str(ROOT) for entry in sys.path):
    sys.path.insert(0, str(ROOT))

from src import config as cfg

# ----------------------------------------------------------
# 1. Load base data
# ----------------------------------------------------------
raw_dir = cfg.DATA / "raw"
prices = pd.read_parquet(raw_dir / "prices.parquet")
vix = pd.read_parquet(raw_dir / "vix.parquet")
# NOTE(review): the file is called "fedfunds" but the recorded output shows a
# single '^IRX' column — confirm which rate series this actually holds.
ff = pd.read_parquet(raw_dir / "fedfunds.parquet")

# Make sure the index of every frame carries the same name
for frame in (prices, vix, ff):
    frame.index.name = "date"

# Show the columns to confirm the real names
print("VIX columns:", vix.columns)
print("Fed Funds columns:", ff.columns)


VIX columns: Index(['^VIX'], dtype='object', name='Ticker')
Fed Funds columns: Index(['^IRX'], dtype='object', name='Ticker')


In [7]:

# ----------------------------------------------------------
# 2. Compute log-returns and features
# ----------------------------------------------------------
# One-step log-return log(P_t / P_{t-1}); any row containing a NaN is dropped.
price_ratio = prices / prices.shift(1)
rets = np.log(price_ratio).dropna()

# 50-observation rolling mean, and the price's distance from that mean
# expressed in units of the 50-observation rolling standard deviation.
window_50 = prices.rolling(50)
ma50 = window_50.mean()
zscore = (prices - ma50) / window_50.std()

# ------------ Pasar a formato largo ----------------
def _to_long(df_wide, value_name):
    df_long = df_wide.stack().reset_index()
    df_long.columns = ["date", "ticker", value_name]
    return df_long

# Reshape each wide feature table to long format.
rets_long = _to_long(rets, "ret")
ma50_long = _to_long(ma50, "ma50")
zscore_long = _to_long(zscore, "zscore")

# ------------ Merge the features ----------------
# merge() defaults to an inner join, so only (date, ticker) pairs present in
# all three tables survive.
merge_keys = ["date", "ticker"]
df = rets_long.merge(ma50_long, on=merge_keys)
df = df.merge(zscore_long, on=merge_keys)

# ----------------------------------------------------------
# 2b. Add exogenous covariates
# ----------------------------------------------------------
# Take the first column of each exogenous frame, whatever its label is, and
# give it a fixed name.
vix_col = vix.columns[0]
ff_col = ff.columns[0]
vix_named = vix.rename(columns={vix_col: "vix"})
irx_named = ff.rename(columns={ff_col: "irx"})

# Left-join each covariate by matching df["date"] against the covariate
# frame's index, then discard rows left with any missing value.
df = df.merge(vix_named, left_on="date", right_index=True, how="left")
df = df.merge(irx_named, left_on="date", right_index=True, how="left")
df = df.dropna()

# ----------------------------------------------------------
# 3. Encode the indices required by TFT
# ----------------------------------------------------------
# One series per ticker.
df["group_id"] = df["ticker"]

# TimeSeriesDataSet expects time_idx to advance by exactly 1 between
# consecutive observations. Calendar-day offsets leave a gap for every pair of
# non-consecutive dates (e.g. weekends in market data), and those gaps get
# padded with synthetic rows when allow_missing_timesteps=True. Numbering the
# sorted unique dates instead yields 0, 1, 2, ... over the dates that actually
# occur in df, shared across all tickers.
df["time_idx"] = pd.factorize(df["date"], sort=True)[0].astype("int64")

# Confirm the final columns
print("Final DataFrame columns:", df.columns)

# ----------------------------------------------------------
# 4. Define the TimeSeriesDataSet
# ----------------------------------------------------------
context_length     = 60  # encoder window: number of past time steps fed to the model
prediction_length  = 5   # decoder window: number of future time steps to forecast

training = TimeSeriesDataSet(
    df,
    time_idx="time_idx",
    target="ret",
    group_ids=["group_id"],
    max_encoder_length=context_length,
    max_prediction_length=prediction_length,
    # "Known" reals are handed to the decoder for the forecast horizon, so only
    # inputs whose future values are available at prediction time belong here.
    time_varying_known_reals=["time_idx"],
    # vix and irx are observed market series: their values over the forecast
    # horizon are not available when the forecast is made, so listing them as
    # "known" would leak future information into the decoder.
    time_varying_unknown_reals=["ret", "ma50", "zscore", "vix", "irx"],
    static_categoricals=["group_id"],
    # Keep the target on its original scale (no normalisation applied).
    target_normalizer=TorchNormalizer(method="identity"),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    # Fill gaps in time_idx on the fly instead of rejecting the data.
    allow_missing_timesteps=True,
)


# ----------------------------------------------------------
# 5. Save the serialized dataset
# ----------------------------------------------------------
# Create the destination folder (and any missing parents) before writing.
processed_dir = cfg.DATA / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)
out_path = processed_dir / "tft_data.pkl"

# NOTE: pickle files must only be loaded back from trusted locations.
with out_path.open("wb") as fh:
    pickle.dump(training, fh)

print("✅ Dataset TFT guardado en:", out_path)


Final DataFrame columns: Index(['date', 'ticker', 'ret', 'ma50', 'zscore', 'vix', 'irx', 'group_id',
       'time_idx'],
      dtype='object')
✅ Dataset TFT guardado en: C:\Users\ferra\Documents\TFM\data\processed\tft_data.pkl
