In [12]:
# PASO 1: Cargar precios correctamente y validar
import sys, pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# A√±adir src/ al path
PROJECT_ROOT = pathlib.Path().resolve().parent.parent  # notebooks/xgb/ ‚Üí proyecto ra√≠z
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src import config as cfg

# 1. Cargar datos
df = pd.read_parquet(cfg.DATA / "raw" / "prices.parquet")
df = df.ffill().dropna()

# 2. Info general
print(f"üìà Dimensiones: {df.shape}")
print(f"üóìÔ∏è Rango de fechas: {df.index.min().date()} a {df.index.max().date()}")
print(f"üìä Activos disponibles: {len(df.columns)}")

# 3. Checks
# 3. Checks
assert df.index.is_monotonic_increasing, "‚ùå Fechas no ordenadas"

if df.index.inferred_freq != "B":
    print("‚ö†Ô∏è Las fechas no est√°n exactamente en frecuencia 'B' (puede haber feriados).")

n_nulos = df.isna().sum().sum()
print(f"‚ùì Valores nulos: {n_nulos}")

üìà Dimensiones: (4521, 40)
üóìÔ∏è Rango de fechas: 2012-05-18 a 2025-06-26
üìä Activos disponibles: 40
‚ö†Ô∏è Las fechas no est√°n exactamente en frecuencia 'B' (puede haber feriados).
‚ùì Valores nulos: 0


In [16]:
# PASO 2: calcular retornos, momentum y target
# --------------------------------------------

# 1. Retornos logar√≠tmicos diarios
ret = np.log(df / df.shift(1)).dropna()

# 2. Retorno acumulado 5 d√≠as
ret5 = ret.rolling(5).sum()

# 3. Volatilidad 5 d√≠as
vol5 = ret.rolling(5).std()

# 4. Momentum = retorno / volatilidad
momentum = (ret5 / (vol5 + 1e-6)).clip(-10, 10)

# 5. Target: retorno futuro (t+1 a t+5)
y = ret5.shift(-5).dropna()

# 6. Alinear features al target (ventana t-1)
ret = ret.shift(1).reindex(y.index)
momentum = momentum.shift(1).reindex(y.index)

# Visual quick check
print("üìè ret shape:", ret.shape)
print("üìè momentum shape:", momentum.shape)
print("üéØ target shape (y):", y.shape)


üìè ret shape: (4515, 40)
üìè momentum shape: (4515, 40)
üéØ target shape (y): (4515, 40)


In [18]:
# PASO 3: Construcci√≥n de dataset plano por ticker/fecha
# ------------------------------------------------------

df_list = []

for ticker in df.columns:
    df_feat = pd.DataFrame(index=ret.index)
    df_feat["ret_1d"] = ret[ticker]
    df_feat["ret_5d"] = ret5[ticker].reindex(ret.index)
    df_feat["vol_5d"] = vol5[ticker].reindex(ret.index)
    df_feat["momentum"] = momentum[ticker]
    df_feat["target_5d"] = y[ticker]
    
    df_feat["ticker"] = ticker
    df_feat["date"] = df_feat.index

    df_list.append(df_feat.dropna())

# Unir todos los tickers
df_xgb = pd.concat(df_list).reset_index(drop=True)

# Ordenar por fecha
df_xgb = df_xgb.sort_values(["date", "ticker"]).reset_index(drop=True)

# Quick check
print(f"‚úÖ Dataset total: {df_xgb.shape[0]} muestras √ó {df_xgb.shape[1]} columnas")
df_xgb.head()


‚úÖ Dataset total: 180400 muestras √ó 7 columnas


Unnamed: 0,ret_1d,ret_5d,vol_5d,momentum,target_5d,ticker,date
0,-0.005374,0.019391,0.015723,2.040688,-0.016634,AAPL,2012-05-29
1,-0.002728,0.005325,0.010027,1.104154,-0.030058,ABT,2012-05-29
2,0.001901,0.001249,0.012598,0.612189,-0.042709,ADBE,2012-05-29
3,-0.010978,-0.015525,0.010986,-0.310394,-0.007197,AMZN,2012-05-29
4,0.0014,0.085547,0.018234,0.842262,-0.045431,BAC,2012-05-29


In [22]:
import joblib
from pathlib import Path

# Ruta de guardado
output_path = cfg.DATA / "processed" / "xgb_data.pkl"

# Guardar
joblib.dump(df_xgb, output_path)

print(f"üíæ Dataset guardado en: {output_path}")
print(f"üì¶ Filas: {len(df_xgb):,} | Columnas: {df_xgb.shape[1]}")


üíæ Dataset guardado en: C:\Users\ferra\Documents\TFM\data\processed\xgb_data.pkl
üì¶ Filas: 180,400 | Columnas: 7
