In [2]:
import sys, pathlib
import pandas as pd
import numpy as np
import joblib
from lightgbm import LGBMRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# The project root is taken to be two directory levels above the current
# working directory; it is put at the front of sys.path (once) so that the
# `src` package can be imported from the notebook.
_cwd = pathlib.Path().resolve()
PROJECT_ROOT = _cwd.parent.parent
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

from src import config as cfg

# Start-up banner plus the data/model directories read from the config.
# NOTE(review): the banner says "XGBoost" but this notebook imports
# LGBMRegressor from lightgbm -- confirm which name is intended.
print(
    "🚀 XGBoost Enhanced - Entrenamiento Iniciado",
    f"📂 Datos: {cfg.DATA}",
    f"📊 Modelos: {cfg.MODELS}",
    sep="\n",
)


🚀 XGBoost Enhanced - Entrenamiento Iniciado
📂 Datos: C:\Users\1M72763\Desktop\TFM\data
📊 Modelos: C:\Users\1M72763\Desktop\TFM\models


In [3]:
# 1. LOAD THE ENHANCED DATASET
dataset_path = cfg.DATA / "processed" / "xgb_enhanced_data.pkl"
df = joblib.load(dataset_path)

date_col = df['date']
print(f"✅ Dataset cargado: {len(df):,} muestras")
print(f"📊 Tickers únicos: {df['ticker'].nunique()}")
print(f"📈 Rango temporal: {date_col.min().date()} → {date_col.max().date()}")

# Every column except the metadata columns and the target is used as a feature.
NON_FEATURE_COLS = ('ticker', 'date', 'target_5d')
feature_cols = [col for col in df.columns if col not in NON_FEATURE_COLS]
print(f"🔢 Features disponibles: {len(feature_cols)}")

# Show the first few feature names as a sanity check.
print(f"\n📋 Primeras 10 features:")
for rank, name in enumerate(feature_cols[:10], start=1):
    print(f"{rank:2d}. {name}")
print("   ...")

# Temporal 80/20 split: rows dated on or before the 0.8 quantile of the
# 'date' column go to train, strictly later rows go to test.
# NOTE(review): if target_5d is a 5-day forward-looking value, the targets of
# the last train rows overlap the start of the test window -- confirm whether
# an embargo gap is needed.
split_date = df['date'].quantile(0.8)
is_train = df['date'] <= split_date
is_test = df['date'] > split_date
df_train = df.loc[is_train].copy()
df_test = df.loc[is_test].copy()

print(f"\n🔄 Train: {len(df_train):,} | Test: {len(df_test):,}")
print(f"📅 Split date: {split_date.date()}")

# Summary statistics of the training target.
train_target = df_train['target_5d']
target_lo, target_hi = train_target.min(), train_target.max()
target_mean, target_std = train_target.mean(), train_target.std()
print(f"\n📊 Target distribution:")
print(f"Train target range: [{target_lo:.4f}, {target_hi:.4f}]")
print(f"Train target mean: {target_mean:.4f} ± {target_std:.4f}")


✅ Dataset cargado: 163,970 muestras
📊 Tickers únicos: 38
📈 Rango temporal: 2013-03-11 → 2025-06-21
🔢 Features disponibles: 24

📋 Primeras 10 features:
 1. ret_1d
 2. ret_5d
 3. ret_10d
 4. ret_20d
 5. vol_5d
 6. vol_10d
 7. vol_20d
 8. vol_60d
 9. momentum_5d
10. momentum_20d
   ...

🔄 Train: 131,176 | Test: 32,794
📅 Split date: 2023-02-09

📊 Target distribution:
Train target range: [-0.4809, 0.4192]
Train target mean: 0.0021 ± 0.0331


In [4]:
# 2. PER-TICKER TRAINING WITH ADVANCED FEATURES
# One LightGBM regressor is fitted per ticker on that ticker's train rows and
# scored (MAE) on its test rows. Results are collected in three dicts keyed by
# ticker: the fitted model, its test MAE, and its five most important features.
MIN_TRAIN_ROWS = 100  # tickers with fewer training rows are skipped

models_enhanced = {}
mae_scores = {}
feature_importance = {}

print("🔧 Entrenando modelos con 24+ features técnicas...")

for ticker in df['ticker'].unique():
    print(f"  🎯 {ticker}... ", end="")

    # Rows belonging to this ticker on each side of the temporal split
    train_data = df_train[df_train['ticker'] == ticker]
    test_data = df_test[df_test['ticker'] == ticker]

    if len(train_data) < MIN_TRAIN_ROWS:  # skip when there is too little data
        print("❌ Datos insuficientes")
        continue

    # A ticker with no rows after the split date cannot be scored; predicting
    # and computing the MAE on an empty frame would abort the whole loop.
    # Skipping it here keeps models_enhanced and mae_scores keyed by the
    # same set of tickers.
    if test_data.empty:
        print("❌ Sin datos de test")
        continue

    X_train = train_data[feature_cols]
    y_train = train_data['target_5d']
    X_test = test_data[feature_cols]
    y_test = test_data['target_5d']

    # LightGBM regressor with fixed hyper-parameters.
    # NOTE(review): in LightGBM, row subsampling (subsample / bagging_fraction)
    # only takes effect when subsample_freq (bagging_freq) is > 0; with the
    # default of 0 the subsample=0.8 below is inert. Left unchanged so the
    # recorded results stay reproducible -- confirm whether bagging is wanted.
    model = LGBMRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        num_leaves=31,
        min_child_samples=20,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=cfg.RANDOM_SEED,
        verbosity=-1,
        force_row_wise=True  # to avoid warnings
    )

    # Fit
    model.fit(X_train, y_train)

    # Evaluate on the held-out rows of the same ticker
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    # Store
    models_enhanced[ticker] = model
    mae_scores[ticker] = mae

    # Feature importance: keep the five (name, importance) pairs with the
    # highest importance reported by the fitted model
    importances = model.feature_importances_
    top_features = sorted(zip(feature_cols, importances), key=lambda x: x[1], reverse=True)[:5]
    feature_importance[ticker] = top_features

    print(f"✅ MAE: {mae:.5f}")

print(f"\n✅ Entrenamiento completado: {len(models_enhanced)} modelos")
print(f"📊 MAE promedio: {np.mean(list(mae_scores.values())):.5f}")


🔧 Entrenando modelos con 24+ features técnicas...
  🎯 AAPL... ✅ MAE: 0.02456
  🎯 ABT... ✅ MAE: 0.01936
  🎯 ADBE... ✅ MAE: 0.03219
  🎯 AMZN... ✅ MAE: 0.02659
  🎯 BAC... ✅ MAE: 0.02669
  🎯 COST... ✅ MAE: 0.01906
  🎯 CRM... ✅ MAE: 0.03006
  🎯 CSCO... ✅ MAE: 0.02015
  🎯 CVX... ✅ MAE: 0.02147
  🎯 DIS... ✅ MAE: 0.02568
  🎯 GOOGL... ✅ MAE: 0.02908
  🎯 HD... ✅ MAE: 0.02049
  🎯 IBM... ✅ MAE: 0.02252
  🎯 INTC... ✅ MAE: 0.04344
  🎯 JNJ... ✅ MAE: 0.01653
  🎯 JPM... ✅ MAE: 0.02393
  🎯 KO... ✅ MAE: 0.01643
  🎯 LIN... ✅ MAE: 0.01599
  🎯 MA... ✅ MAE: 0.01804
  🎯 MCD... ✅ MAE: 0.01733
  🎯 META... ✅ MAE: 0.03306
  🎯 MRK... ✅ MAE: 0.02112
  🎯 MSFT... ✅ MAE: 0.02391
  🎯 NFLX... ✅ MAE: 0.03518
  🎯 NKE... ✅ MAE: 0.03318
  🎯 NVDA... ✅ MAE: 0.04595
  🎯 ORCL... ✅ MAE: 0.03424
  🎯 PEP... ✅ MAE: 0.01689
  🎯 PFE... ✅ MAE: 0.02280
  🎯 PG... ✅ MAE: 0.01550
  🎯 T... ✅ MAE: 0.02231
  🎯 TSLA... ✅ MAE: 0.05917
  🎯 UNH... ✅ MAE: 0.03246
  🎯 V... ✅ MAE: 0.01841
  🎯 VZ... ✅ MAE: 0.02204
  🎯 WFC... ✅ MAE: 0.02974
  🎯 WMT..

In [5]:
# 3. SAVE MODELS AND ANALYSIS
processed_dir = cfg.DATA / "processed"

# Persist the per-ticker models
models_path = processed_dir / "xgb_enhanced_model.pkl"
joblib.dump(models_enhanced, models_path)

# Persist the per-ticker MAE scores for later comparison
scores_path = processed_dir / "mae_xgb_enhanced.pkl"
joblib.dump(mae_scores, scores_path)

print("💾 Modelos enhanced guardados")
print(f"📍 Ubicación: {models_path}")

# Comparison against the baseline scores, if they exist on disk.
# NOTE: joblib.load unpickles the file; only load files produced by this project.
basic_scores_path = processed_dir / "mae_xgb.pkl"
try:
    mae_basic = joblib.load(basic_scores_path)
except FileNotFoundError:
    # Best effort: the comparison is optional, so a missing file only warns.
    print("⚠️ No se encontraron modelos básicos para comparación")
except Exception as exc:
    # Any other load failure (e.g. a corrupt file) is reported as such instead
    # of being mislabelled as "baseline not found".
    print(f"⚠️ No se pudo cargar {basic_scores_path.name}: {exc!r}")
else:
    print(f"\n🆚 COMPARACIÓN DE PERFORMANCE:")
    print(f"{'Ticker':<8} {'Basic MAE':<12} {'Enhanced MAE':<15} {'Mejora %':<10}")
    print("-" * 50)

    total_improvement = 0
    valid_comparisons = 0

    for ticker in models_enhanced.keys():
        # Only tickers scored by both the baseline and the enhanced run
        if ticker not in mae_basic or ticker not in mae_scores:
            continue
        basic = mae_basic[ticker]
        enhanced = mae_scores[ticker]
        if basic == 0:
            # Relative improvement is undefined for a zero baseline
            continue
        # Positive value = the enhanced MAE is lower than the baseline MAE
        improvement = ((basic - enhanced) / basic) * 100
        total_improvement += improvement
        valid_comparisons += 1

        print(f"{ticker:<8} {basic:<12.5f} {enhanced:<15.5f} {improvement:>+7.1f}%")

    if valid_comparisons > 0:
        avg_improvement = total_improvement / valid_comparisons
        print(f"\n📈 MEJORA PROMEDIO: {avg_improvement:+.1f}%")

# Most important features (global).
# The average is taken over the full importance vector of every trained model.
# Averaging only over the tickers where a feature reached the per-ticker top 5
# would bias the ranking towards features that are rarely, but strongly, used.
print(f"\n🏆 TOP FEATURES MÁS IMPORTANTES:")
all_importances = {feat_name: [] for feat_name in feature_cols}
for ticker, model in models_enhanced.items():
    # Models were fitted on columns in feature_cols order, so the importance
    # vector lines up with feature_cols position by position.
    for feat_name, importance in zip(feature_cols, model.feature_importances_):
        all_importances[feat_name].append(importance)

# Mean importance per feature (features with no observations are dropped,
# which only happens when no model was trained)
avg_importances = {feat: np.mean(imps) for feat, imps in all_importances.items() if imps}
top_global_features = sorted(avg_importances.items(), key=lambda x: x[1], reverse=True)[:10]

for i, (feat, importance) in enumerate(top_global_features, 1):
    # Pick a display icon from the feature name; the first matching rule wins
    category = ""
    if feat.startswith('ret_'):
        category = "🔄"
    elif feat.startswith('vol_'):
        category = "📊"
    elif 'momentum' in feat or feat == 'rsi_14':
        category = "🚀"
    elif 'ma_' in feat or 'price_to_' in feat:
        category = "📈"
    elif feat in ['bollinger_position', 'price_deviation', 'returns_z_score']:
        category = "📐"
    elif 'corr_' in feat or 'beta_' in feat:
        category = "🔗"
    elif 'vix_' in feat:
        category = "🌍"

    print(f"{i:2d}. {category} {feat:<20} {importance:.4f}")

print(f"\n✅ XGB ENHANCED TRAINING COMPLETADO")
print(f"🔢 Features utilizadas: {len(feature_cols)}")
print(f"🎯 Modelos entrenados: {len(models_enhanced)}")
print(f"📊 MAE promedio: {np.mean(list(mae_scores.values())):.5f}")
print(f"🎯 Siguiente paso: Actualizar config.py a MODEL_TYPE = 'xgb_enhanced' y ejecutar backtest")


💾 Modelos enhanced guardados
📍 Ubicación: C:\Users\1M72763\Desktop\TFM\data\processed\xgb_enhanced_model.pkl

🆚 COMPARACIÓN DE PERFORMANCE:
Ticker   Basic MAE    Enhanced MAE    Mejora %  
--------------------------------------------------
AAPL     0.02468      0.02456            +0.5%
ABT      0.02005      0.01936            +3.4%
ADBE     0.03304      0.03219            +2.6%
AMZN     0.02809      0.02659            +5.3%
BAC      0.02688      0.02669            +0.7%
COST     0.01927      0.01906            +1.1%
CRM      0.03093      0.03006            +2.8%
CSCO     0.02016      0.02015            +0.0%
CVX      0.02183      0.02147            +1.6%
DIS      0.02553      0.02568            -0.6%
GOOGL    0.02917      0.02908            +0.3%
HD       0.02149      0.02049            +4.6%
IBM      0.02156      0.02252            -4.4%
INTC     0.04561      0.04344            +4.8%
JNJ      0.01649      0.01653            -0.2%
JPM      0.02263      0.02393            -5.7%
KO      