In [3]:
# PASO 1: Cargar datos preprocesados y preparar
# ---------------------------------------------

import sys, pathlib
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split

# AÃ±adir src/ al path
PROJECT_ROOT = pathlib.Path().resolve().parent.parent  # notebooks/xgb â†’ raÃ­z
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src import config as cfg

# Cargar dataset procesado
df = joblib.load(cfg.DATA / "processed" / "xgb_data.pkl")

print(f"âœ… Datos cargados: {df.shape[0]:,} muestras | {len(df['ticker'].unique())} tickers")

# Separar train/test por fecha (80% - 20%)
split_date = df["date"].quantile(0.8)
df_train = df[df["date"] <= split_date].copy()
df_test  = df[df["date"] > split_date].copy()

print(f"ðŸ“Š Train: {df_train.shape[0]:,} muestras | Test: {df_test.shape[0]:,}")


âœ… Datos cargados: 180,400 muestras | 40 tickers
ðŸ“Š Train: 144,320 muestras | Test: 36,080


In [5]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from collections import defaultdict

modelos = {}
mae_scores = {}

features = ["ret_1d", "ret_5d", "vol_5d", "momentum"]

tickers = df["ticker"].unique()

for ticker in tickers:
    df_tr = df_train[df_train["ticker"] == ticker]
    df_te = df_test[df_test["ticker"] == ticker]
    
    X_train, y_train = df_tr[features], df_tr["target_5d"]
    X_test, y_test   = df_te[features], df_te["target_5d"]
    
    model = LGBMRegressor(n_estimators=200, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    
    modelos[ticker] = model
    mae_scores[ticker] = mae

    print(f"âœ… {ticker:5} | MAE: {mae:.5f} | Train: {len(X_train):5} | Test: {len(X_test):5}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 3608, number of used features: 4
[LightGBM] [Info] Start training from score 0.002758
âœ… AAPL  | MAE: 0.02468 | Train:  3608 | Test:   902
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 3608, number of used features: 4
[LightGBM] [Info] Start training from score 0.002121
âœ… ABT   | MAE: 0.02005 | Train:  3608 | Test:   902
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info

In [7]:
# PASO 3: Guardar modelos y mÃ©tricas
# ----------------------------------

# Guardar modelos
joblib.dump(modelos, cfg.DATA / "processed" / "xgb_model.pkl")
print("ðŸ’¾ Modelos guardados en xgb_model.pkl")

# Guardar errores MAE
joblib.dump(mae_scores, cfg.DATA / "processed" / "mae_xgb.pkl")
print("ðŸ“ˆ MAE por ticker guardado en mae_xgb.pkl")


ðŸ’¾ Modelos guardados en xgb_model.pkl
ðŸ“ˆ MAE por ticker guardado en mae_xgb.pkl
