## Entrenamiento Ridge

In [1]:
import sys, pathlib
import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

# Make the project importable: put the directory two levels above the current
# working directory at the front of sys.path so that `src.config` resolves.
PROJECT_ROOT = pathlib.Path.cwd().resolve().parents[1]
_project_root_str = str(PROJECT_ROOT)
if _project_root_str not in sys.path:
    sys.path.insert(0, _project_root_str)

from src import config as cfg

In [2]:
# Load the processed XGB dataset.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts produced by a trusted pipeline.
df = joblib.load(cfg.DATA / 'processed' / 'xgb_data.pkl')
print(f'‚úÖ Datos cargados: {len(df):,} muestras')

# Initial NaN report: count nulls once and show only columns that have any.
print('üìä NaN por columna antes de limpiar:')
nan_per_column = df.isnull().sum()
print(nan_per_column[nan_per_column > 0])

# Drop rows with NaN in any feature or in the target.
features_all = ['ret_1d','ret_5d','vol_5d','momentum', 'target_5d']
df_clean = df[features_all + ['ticker', 'date']].dropna()
n_discarded = len(df) - len(df_clean)
# Guard the percentage so an empty input frame does not raise ZeroDivisionError.
pct_discarded = n_discarded / len(df) * 100 if len(df) else 0.0
print(f'üßπ Datos despu√©s de limpiar NaN: {len(df_clean):,} muestras')
print(f'üìâ Datos descartados: {n_discarded:,} ({pct_discarded:.1f}%)')

# Chronological train/test split: rows dated at or before the 0.8 quantile of
# 'date' go to train, strictly later rows go to test.
split_date = df_clean['date'].quantile(0.8)
is_train = df_clean['date'] <= split_date
df_train = df_clean[is_train].copy()
df_test  = df_clean[~is_train].copy()
print(f'üìä Train: {len(df_train):,} | Test: {len(df_test):,}')

‚úÖ Datos cargados: 180,400 muestras
üìä NaN por columna antes de limpiar:
Series([], dtype: int64)
üßπ Datos despu√©s de limpiar NaN: 180,400 muestras
üìâ Datos descartados: 0 (0.0%)
üìä Train: 144,320 | Test: 36,080


In [3]:
features = ['ret_1d','ret_5d','vol_5d','momentum']
MIN_TRAIN_ROWS = 10  # a ticker with fewer clean training rows is skipped
models = {}          # ticker -> fitted Ridge model
mae_scores = {}      # ticker -> test MAE (NaN when no clean test rows exist)
total_discarded = 0  # training rows dropped because of NaN, across all tickers


def _drop_nan_rows(X, y):
    """Return (X, y) restricted to the rows where neither X nor y has a NaN."""
    keep = ~(X.isnull().any(axis=1) | y.isnull())
    return X[keep], y[keep]


# Train one Ridge model per ticker on its training rows and score it on its test rows.
for ticker in df_clean['ticker'].unique():
    tr = df_train[df_train['ticker'] == ticker]
    te = df_test[df_test['ticker'] == ticker]

    # A ticker needs rows on both sides of the split to be trained and evaluated.
    if len(tr) == 0 or len(te) == 0:
        print(f'‚ö†Ô∏è {ticker:5} | Sin datos suficientes')
        continue

    X_tr, y_tr = tr[features], tr['target_5d']

    # Per-ticker NaN check; nan_count counts NaN cells (not rows) for the message.
    nan_count = X_tr.isnull().sum().sum() + y_tr.isnull().sum()
    if nan_count > 0:
        print(f'‚ö†Ô∏è {ticker:5} | {nan_count} NaN detectados, limpiando...')
        rows_before = len(X_tr)
        X_tr, y_tr = _drop_nan_rows(X_tr, y_tr)
        total_discarded += rows_before - len(X_tr)

    # Make sure enough data is left after cleaning.
    if len(X_tr) < MIN_TRAIN_ROWS:
        print(f'‚ùå {ticker:5} | Datos insuficientes despu√©s de limpiar')
        continue

    # Invariants: the training data must be NaN-free before fitting.
    assert not X_tr.isnull().any().any(), f"Still NaN in X_tr for {ticker}"
    assert not y_tr.isnull().any(), f"Still NaN in y_tr for {ticker}"

    model = Ridge(alpha=1.0)
    model.fit(X_tr, y_tr)

    # Evaluate on NaN-free test rows. `te` is known to be non-empty here, so the
    # only way to end up without a score is every test row containing a NaN.
    X_te, y_te = _drop_nan_rows(te[features], te['target_5d'])
    if len(X_te) > 0:
        mae = mean_absolute_error(y_te, model.predict(X_te))
    else:
        mae = np.nan

    models[ticker] = model
    mae_scores[ticker] = mae
    # Both counts are the rows actually used (after NaN removal).
    print(f'‚úÖ {ticker:5} | MAE: {mae:.5f} | Train: {len(X_tr):4} | Test: {len(X_te):4}')

# Average only the defined scores; fall back to NaN instead of triggering
# numpy's "Mean of empty slice" warning when there is nothing to average.
valid_maes = [score for score in mae_scores.values() if not np.isnan(score)]
mean_mae = np.mean(valid_maes) if valid_maes else np.nan

print(f'\nüìä Resumen:')
print(f'‚úÖ Modelos entrenados: {len(models)}')
print(f'üßπ Total muestras descartadas por NaN: {total_discarded}')
print(f'üìà MAE promedio: {mean_mae:.5f}')

‚úÖ AAPL  | MAE: 0.02234 | Train: 3608 | Test:  902
‚úÖ ABT   | MAE: 0.01778 | Train: 3608 | Test:  902
‚úÖ ADBE  | MAE: 0.02949 | Train: 3608 | Test:  902
‚úÖ AMZN  | MAE: 0.02568 | Train: 3608 | Test:  902
‚úÖ BAC   | MAE: 0.02448 | Train: 3608 | Test:  902
‚úÖ BTC-USD | MAE: 0.04288 | Train: 3608 | Test:  902
‚úÖ COST  | MAE: 0.01776 | Train: 3608 | Test:  902
‚úÖ CRM   | MAE: 0.02844 | Train: 3608 | Test:  902
‚úÖ CSCO  | MAE: 0.01796 | Train: 3608 | Test:  902
‚úÖ CVX   | MAE: 0.01982 | Train: 3608 | Test:  902
‚úÖ DIS   | MAE: 0.02366 | Train: 3608 | Test:  902
‚úÖ ETH-USD | MAE: 0.05176 | Train: 3608 | Test:  902
‚úÖ GOOGL | MAE: 0.02713 | Train: 3608 | Test:  902
‚úÖ HD    | MAE: 0.01952 | Train: 3608 | Test:  902
‚úÖ IBM   | MAE: 0.01980 | Train: 3608 | Test:  902
‚úÖ INTC  | MAE: 0.04303 | Train: 3608 | Test:  902
‚úÖ JNJ   | MAE: 0.01509 | Train: 3608 | Test:  902
‚úÖ JPM   | MAE: 0.02061 | Train: 3608 | Test:  902
‚úÖ KO    | MAE: 0.01292 | Train: 3608 | Test:  902
‚úÖ LIN 

In [4]:
# Final smoke test before saving: each checked model must be able to predict
# on one NaN-free example row.
print('üîç Verificaci√≥n final de modelos:')
sample_row = [0.01, 0.05, 0.02, 0.5]  # example values, one per feature
# Only the first 5 models are checked; list(...) takes a copy so deleting from
# `models` inside the loop is safe.
for ticker, model in list(models.items())[:5]:
    try:
        # Models fitted on a DataFrame remember their column names; predicting
        # with a bare ndarray makes scikit-learn warn about missing feature
        # names, so build the probe with the model's own names when available.
        feature_names = getattr(model, 'feature_names_in_', None)
        if feature_names is not None:
            test_data = pd.DataFrame([sample_row], columns=feature_names)
        else:
            test_data = np.array([sample_row])
        pred = model.predict(test_data)
        print(f'‚úÖ {ticker}: Predicci√≥n OK = {pred[0]:.5f}')
    except Exception as e:
        print(f'‚ùå {ticker}: Error = {e}')
        del models[ticker]  # drop the model that failed the check

joblib.dump(models, cfg.MODELS / 'ridge.pkl')
print(f'\nüíæ {len(models)} modelos guardados en models/ridge.pkl')

üîç Verificaci√≥n final de modelos:
‚úÖ AAPL: Predicci√≥n OK = 0.00045
‚úÖ ABT: Predicci√≥n OK = -0.00087
‚úÖ ADBE: Predicci√≥n OK = 0.00043
‚úÖ AMZN: Predicci√≥n OK = 0.00276
‚úÖ BAC: Predicci√≥n OK = 0.00049

üíæ 40 modelos guardados en models/ridge.pkl


