In [11]:
import sys, pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from sklearn.preprocessing import StandardScaler
import joblib

# Añadir src/ al path
PROJECT_ROOT = pathlib.Path().resolve().parent.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src import config as cfg

print(" XGBoost Enhanced - Preprocessing Iniciado")
print(f" Datos: {cfg.DATA}")
print(f" Modelos: {cfg.MODELS}")


 XGBoost Enhanced - Preprocessing Iniciado
 Datos: C:\Users\1M72763\Desktop\TFM\data
 Modelos: C:\Users\1M72763\Desktop\TFM\models


In [12]:
# 1. CARGAR DATOS BASE
df_prices = pd.read_parquet(cfg.DATA / "raw" / "prices.parquet").ffill().dropna()
df_ret = np.log(df_prices / df_prices.shift(1)).dropna()

print(f" Precios: {df_prices.shape} | Retornos: {df_ret.shape}")
print(f" Período: {df_prices.index.min().date()} → {df_prices.index.max().date()}")

# 2. DESCARGAR VIX para features macro
print(" Descargando VIX...")
vix_raw = yf.download("^VIX", start=df_prices.index.min(), end=df_prices.index.max(), progress=False)
#  FIX YFINANCE COLUMNS
if 'Adj Close' in vix_raw.columns:
    vix = vix_raw["Adj Close"].reindex(df_prices.index, method="ffill")
elif 'Close' in vix_raw.columns:
    vix = vix_raw["Close"].reindex(df_prices.index, method="ffill")
else:
    vix = vix_raw.iloc[:, -1].reindex(df_prices.index, method="ffill")

# 3. DESCARGAR SPY para correlaciones
print(" Descargando SPY...")
spy_raw = yf.download("SPY", start=df_prices.index.min(), end=df_prices.index.max(), progress=False)
# FIX YFINANCE COLUMNS
if 'Adj Close' in spy_raw.columns:
    spy_prices = spy_raw["Adj Close"].reindex(df_prices.index, method="ffill")
elif 'Close' in spy_raw.columns:
    spy_prices = spy_raw["Close"].reindex(df_prices.index, method="ffill")
else:
    spy_prices = spy_raw.iloc[:, -1].reindex(df_prices.index, method="ffill")
spy_ret = np.log(spy_prices / spy_prices.shift(1)).dropna()

print(f" VIX shape: {vix.shape} | SPY shape: {spy_ret.shape}")
print(" Datos base cargados correctamente")


 Precios: (4521, 40) | Retornos: (4520, 40)
 Período: 2012-05-18 → 2025-06-26
Descargando VIX...
 Descargando SPY...
 VIX shape: (4521, 1) | SPY shape: (4520, 1)
 Datos base cargados correctamente


  vix_raw = yf.download("^VIX", start=df_prices.index.min(), end=df_prices.index.max(), progress=False)
  spy_raw = yf.download("SPY", start=df_prices.index.min(), end=df_prices.index.max(), progress=False)


In [13]:
# 4. FUNCIONES TÉCNICAS AVANZADAS
def calculate_rsi(prices, window=14):
    """Relative Strength Index"""
    delta = prices.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def bollinger_position(prices, window=20, num_std=2):
    """Posición dentro de las Bollinger Bands (0=banda inferior, 1=banda superior)"""
    rolling_mean = prices.rolling(window).mean()
    rolling_std = prices.rolling(window).std()
    upper_band = rolling_mean + (rolling_std * num_std)
    lower_band = rolling_mean - (rolling_std * num_std)
    position = (prices - lower_band) / (upper_band - lower_band)
    return position.clip(0, 1)

def calculate_beta(asset_ret, market_ret, window=60):
    """Calcular beta dinámico de una acción vs mercado"""
    covariance = asset_ret.rolling(window).cov(market_ret)
    market_variance = market_ret.rolling(window).var()
    beta = covariance / (market_variance + 1e-8)
    return beta.fillna(0).clip(-3, 3)

print("📐 Funciones técnicas cargadas:")
print("  • RSI (Relative Strength Index)")
print("  • Bollinger Bands Position")  
print("  • Beta dinámico vs mercado")


📐 Funciones técnicas cargadas:
  • RSI (Relative Strength Index)
  • Bollinger Bands Position
  • Beta dinámico vs mercado


In [16]:
# 5. FEATURE ENGINEERING AVANZADO - 24 FEATURES TÉCNICAS
print("🔧 Calculando 24+ features financieras avanzadas...")

features_list = []
feature_names = []

for ticker in df_prices.columns:
    if ticker in ['BTC-USD', 'ETH-USD']:  # Skip crypto por compatibilidad
        continue
        
    print(f"  📊 {ticker}...")
    
    prices = df_prices[ticker]
    returns = df_ret[ticker]
    
    # === RETORNOS MULTI-PERIODO ===
    ret_1d = returns
    ret_5d = returns.rolling(5).sum()
    ret_10d = returns.rolling(10).sum()
    ret_20d = returns.rolling(20).sum()
    
    # === VOLATILIDADES MULTI-ESCALA ===
    vol_5d = returns.rolling(5).std()
    vol_10d = returns.rolling(10).std()
    vol_20d = returns.rolling(20).std()
    vol_60d = returns.rolling(60).std()
    
    # === MOMENTUM & MEAN REVERSION ===
    momentum_5d = ret_5d / (vol_5d + 1e-6)
    momentum_20d = ret_20d / (vol_20d + 1e-6)
    rsi_14 = calculate_rsi(prices, 14) / 100  # Normalizado 0-1
    
    # === MOVING AVERAGES & RATIOS ===
    ma_5 = prices.rolling(5).mean()
    ma_20 = prices.rolling(20).mean()
    ma_50 = prices.rolling(50).mean()
    ma_200 = prices.rolling(200).mean()
    
    ma_ratio_5_20 = (ma_5 / ma_20 - 1).clip(-0.5, 0.5)
    price_to_ma_50 = (prices / ma_50 - 1).clip(-0.5, 0.5)
    price_to_ma_200 = (prices / ma_200 - 1).clip(-0.5, 0.5)
    
    # === TECHNICAL INDICATORS ===
    bollinger_pos = bollinger_position(prices, 20)
    price_deviation = ((prices - ma_20) / ma_20).clip(-0.3, 0.3)
    
    # Z-score de retornos (normalización estadística)
    returns_mean = returns.rolling(60).mean()
    returns_std = returns.rolling(60).std()
    returns_z_score = ((returns - returns_mean) / (returns_std + 1e-8)).clip(-3, 3)
    
    # === CROSS-ASSET FEATURES ===
    corr_spy_20d = returns.rolling(20).corr(spy_ret.reindex(returns.index)).fillna(0)
    beta_to_market = calculate_beta(returns, spy_ret.reindex(returns.index), 60)
    
    # VIX features
    vix_ret = vix.pct_change()
    corr_vix_20d = returns.rolling(20).corr(vix_ret.reindex(returns.index)).fillna(0)
    
    # === MACRO FEATURES ===
    vix_level = (vix / 100).clip(0, 1)  # VIX normalizado
    vix_change_5d = vix.pct_change(5).clip(-1, 1)
    
    # === VOLUME PROXY FEATURES ===
    vol_ratio_20d = (vol_5d / (vol_20d + 1e-8)).clip(0, 5)
    vol_spike = (vol_5d > returns.rolling(20).std().rolling(20).quantile(0.8)).astype(float)
    
    # === CREAR DATASET TICKER ===
    # ✅ FIX DIMENSIONAL: Usar común index truncado + conversión BRUTAL a 1D
    common_index = returns.index[200:]  # Skip primeros 200 días para estabilizar rolling windows
    
    # ✅ RADICAL APPROACH: Crear DataFrame paso a paso con arrays numpy 1D puros
    data_dict = {}
    
    # Metadatos
    data_dict['ticker'] = ticker
    data_dict['date'] = common_index
    
    # Retornos (4) - Fuerza .flatten() para garantizar 1D
    data_dict['ret_1d'] = ret_1d.reindex(common_index).values.flatten()
    data_dict['ret_5d'] = ret_5d.reindex(common_index).values.flatten()
    data_dict['ret_10d'] = ret_10d.reindex(common_index).values.flatten()
    data_dict['ret_20d'] = ret_20d.reindex(common_index).values.flatten()
    
    # Volatilidades (4)
    data_dict['vol_5d'] = vol_5d.reindex(common_index).values.flatten()
    data_dict['vol_10d'] = vol_10d.reindex(common_index).values.flatten()
    data_dict['vol_20d'] = vol_20d.reindex(common_index).values.flatten()
    data_dict['vol_60d'] = vol_60d.reindex(common_index).values.flatten()
    
    # Momentum (3)
    data_dict['momentum_5d'] = momentum_5d.reindex(common_index).values.flatten()
    data_dict['momentum_20d'] = momentum_20d.reindex(common_index).values.flatten()
    data_dict['rsi_14'] = rsi_14.reindex(common_index).values.flatten()
    
    # Moving Averages (3)
    data_dict['ma_ratio_5_20'] = ma_ratio_5_20.reindex(common_index).values.flatten()
    data_dict['price_to_ma_50'] = price_to_ma_50.reindex(common_index).values.flatten()
    data_dict['price_to_ma_200'] = price_to_ma_200.reindex(common_index).values.flatten()
    
    # Technical (3)
    data_dict['bollinger_position'] = bollinger_pos.reindex(common_index).values.flatten()
    data_dict['price_deviation'] = price_deviation.reindex(common_index).values.flatten()
    data_dict['returns_z_score'] = returns_z_score.reindex(common_index).values.flatten()
    
    # Cross-Asset (3)
    data_dict['corr_spy_20d'] = corr_spy_20d.reindex(common_index).values.flatten()
    data_dict['beta_to_market'] = beta_to_market.reindex(common_index).values.flatten()
    data_dict['corr_vix_20d'] = corr_vix_20d.reindex(common_index).values.flatten()
    
    # Macro (2) - ✅ DOUBLE FLATTEN: .squeeze() + .flatten()
    data_dict['vix_level'] = vix_level.reindex(common_index).squeeze().values.flatten()
    data_dict['vix_change_5d'] = vix_change_5d.reindex(common_index).squeeze().values.flatten()
    
    # Volume (2)
    data_dict['vol_ratio_20d'] = vol_ratio_20d.reindex(common_index).values.flatten()
    data_dict['vol_spike'] = vol_spike.reindex(common_index).values.flatten()
    
    # Target (1)
    data_dict['target_5d'] = ret_5d.shift(-5).reindex(common_index).values.flatten()
    
    # ✅ CREAR DATAFRAME desde diccionario con arrays 1D garantizados
    df_ticker = pd.DataFrame(data_dict).dropna()
    
    features_list.append(df_ticker)

# COMBINAR TODOS LOS TICKERS
df_enhanced = pd.concat(features_list, ignore_index=True)
df_enhanced = df_enhanced.sort_values(['date', 'ticker']).reset_index(drop=True)

# NAMES DE FEATURES (para reference)
feature_names = [c for c in df_enhanced.columns if c not in ['ticker', 'date', 'target_5d']]

print(f" Dataset enhanced creado:")
print(f" Shape: {df_enhanced.shape}")
print(f" Features: {len(feature_names)}")
print(f" Tickers: {df_enhanced['ticker'].nunique()}")
print(f" Período: {df_enhanced['date'].min().date()} → {df_enhanced['date'].max().date()}")


🔧 Calculando 24+ features financieras avanzadas...
  📊 AAPL...
  📊 ABT...
  📊 ADBE...
  📊 AMZN...
  📊 BAC...
  📊 COST...
  📊 CRM...
  📊 CSCO...
  📊 CVX...
  📊 DIS...
  📊 GOOGL...
  📊 HD...
  📊 IBM...
  📊 INTC...
  📊 JNJ...
  📊 JPM...
  📊 KO...
  📊 LIN...
  📊 MA...
  📊 MCD...
  📊 META...
  📊 MRK...
  📊 MSFT...
  📊 NFLX...
  📊 NKE...
  📊 NVDA...
  📊 ORCL...
  📊 PEP...
  📊 PFE...
  📊 PG...
  📊 T...
  📊 TSLA...
  📊 UNH...
  📊 V...
  📊 VZ...
  📊 WFC...
  📊 WMT...
  📊 XOM...
 Dataset enhanced creado:
 Shape: (163970, 27)
 Features: 24
 Tickers: 38
 Período: 2013-03-11 → 2025-06-21


In [17]:
# 6. GUARDAR DATASET Y COMPARACIÓN
output_path = cfg.DATA / "processed" / "xgb_enhanced_data.pkl"
joblib.dump(df_enhanced, output_path)

print(f" Dataset enhanced guardado: {output_path}")
print(f" Shape final: {df_enhanced.shape}")
print(f" Features totales: {len(feature_names)}")

# Mostrar lista completa de features
print(f"\n FEATURES TÉCNICAS AVANZADAS ({len(feature_names)}):")
for i, feature in enumerate(feature_names, 1):
    category = ""
    if feature.startswith('ret_'):
        category = " Retornos"
    elif feature.startswith('vol_'):
        category = " Volatilidad"
    elif feature.startswith('momentum') or feature == 'rsi_14':
        category = " Momentum"
    elif 'ma_' in feature or 'price_to_' in feature:
        category = " Moving Avg"
    elif feature in ['bollinger_position', 'price_deviation', 'returns_z_score']:
        category = " Technical"
    elif 'corr_' in feature or 'beta_' in feature:
        category = " Cross-Asset"
    elif 'vix_' in feature:
        category = " Macro"
    elif 'vol_' in feature and ('ratio' in feature or 'spike' in feature):
        category = " Volume"
    
    print(f"{i:2d}. {category:<15} {feature}")

# Comparación con dataset básico (si existe)
try:
    df_basic = joblib.load(cfg.DATA / "processed" / "ridge.pkl")
    basic_features = [c for c in df_basic.columns if c not in ['ticker', 'date', 'target_5d']]
    
    print(f"\n COMPARACIÓN CON MODELOS BÁSICOS:")
    print(f"Ridge/XGB Básico:  {len(basic_features) if basic_features else 4} features")
    print(f"XGB Enhanced:      {len(feature_names)} features")
    print(f"Mejora:            +{len(feature_names) - (len(basic_features) if basic_features else 4)} features ({((len(feature_names) / (len(basic_features) if basic_features else 4)) - 1) * 100:.0f}% más)")
    
except:
    print(f"\n COMPARACIÓN:")
    print(f"Ridge Básico:      4 features (ret_1d, ret_5d, vol_5d, momentum)")
    print(f"XGB Enhanced:      {len(feature_names)} features")
    print(f"Mejora:            +{len(feature_names) - 4} features ({((len(feature_names) / 4) - 1) * 100:.0f}% más)")

print(f"\n PREPROCESAMIENTO XGB ENHANCED COMPLETADO")
print(f" Siguiente paso: Ejecutar entrenamiento 03_train_xgb_enhanced.ipynb")


 Dataset enhanced guardado: C:\Users\1M72763\Desktop\TFM\data\processed\xgb_enhanced_data.pkl
 Shape final: (163970, 27)
 Features totales: 24

📋 FEATURES TÉCNICAS AVANZADAS (24):
 1.  Retornos      ret_1d
 2.  Retornos      ret_5d
 3.  Retornos      ret_10d
 4.  Retornos      ret_20d
 5.  Volatilidad   vol_5d
 6.  Volatilidad   vol_10d
 7.  Volatilidad   vol_20d
 8.  Volatilidad   vol_60d
 9.  Momentum      momentum_5d
10.  Momentum      momentum_20d
11.  Momentum      rsi_14
12.  Moving Avg    ma_ratio_5_20
13.  Moving Avg    price_to_ma_50
14.  Moving Avg    price_to_ma_200
15.  Technical     bollinger_position
16.  Technical     price_deviation
17.  Technical     returns_z_score
18.  Cross-Asset   corr_spy_20d
19.  Cross-Asset   beta_to_market
20.  Cross-Asset   corr_vix_20d
21.  Macro         vix_level
22.  Macro         vix_change_5d
23.  Volatilidad   vol_ratio_20d
24.  Volatilidad   vol_spike

 COMPARACIÓN:
Ridge Básico:      4 features (ret_1d, ret_5d, vol_5d, momentum)
XGB En