# Entrenamiento de Modelos - California Housing Dataset

Este notebook se encarga del entrenamiento y la comparación de múltiples modelos de Machine Learning.

**Objetivo**: Entrenar, comparar y seleccionar el mejor modelo para la predicción de precios de viviendas.

**Modelos a entrenar**:
- Random Forest Regressor
- XGBoost Regressor
- LightGBM Regressor

**Autor**: MLOps Pipeline Project  
**Fecha**: Noviembre 2025

## 1. Importar Librerías

In [None]:
import pandas as pd
import numpy as np
import json
import warnings
import time
from datetime import datetime
warnings.filterwarnings('ignore')

# Feature Engineering
from ft_engineering import prepare_data_for_training

# Machine Learning Models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Model Persistence
import joblib
import os

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: seaborn dark-grid theme for matplotlib plus the "husl" palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Confirm the imports succeeded and report the core library versions.
print("‚úì Librer√≠as importadas exitosamente")
for library_label, library in (("pandas", pd), ("numpy", np)):
    print(f"Versi√≥n de {library_label}: {library.__version__}")

## 2. Cargar Configuración

In [None]:
# Load the project configuration from config.json, one directory above the
# notebook. The encoding is given explicitly so decoding does not depend on
# the platform default (which is not UTF-8 on every system).
with open('../config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

# Echo every configuration entry, keys left-aligned in a 20-character column.
print("=" * 80)
print("CONFIGURACI√ìN DEL PROYECTO")
print("=" * 80)
for key, value in config.items():
    print(f"{key:20s}: {value}")
print("=" * 80)

# Derived settings, with fallbacks for keys missing from the config file.
RANDOM_STATE = config.get('random_state', 42)
MODEL_OUTPUT_DIR = f"../{config.get('model_output_dir', 'models/')}"
# Make sure the output directory exists before any model is written to it.
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

## 3. Preparar Datos con Feature Engineering

In [None]:
# Run the feature-engineering pipeline from the project-local ft_engineering
# module. The call returns the train/test splits, the preprocessor object and
# the list of feature names used by the cells below.
# NOTE(review): test_size=0.2 presumably holds out 20% of the rows for testing
# and save_preprocessor=True presumably persists the fitted preprocessor —
# confirm against prepare_data_for_training.
X_train, X_test, y_train, y_test, preprocessor, feature_names = prepare_data_for_training(
    config_path='../config.json',
    test_size=0.2,
    save_preprocessor=True
)

In [None]:
# Report the dimensions of the prepared datasets and preview the feature names.
banner = "=" * 80
print("\n" + banner)
print("DATOS PREPARADOS PARA ENTRENAMIENTO")
print(banner)
for dataset_label, dataset in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{dataset_label} shape: {dataset.shape}")

total_features = len(feature_names)
print(f"Total features: {total_features}")

# List at most the first ten feature names, numbered from 1.
print("\nPrimeros 10 features:")
for position, feature in enumerate(feature_names[:10], start=1):
    print(f"  {position:2d}. {feature}")

# Mention how many names were left out of the preview, if any.
remaining = total_features - 10
if remaining > 0:
    print(f"  ... y {remaining} m√°s")
print(banner)

## 4. Definir Modelos a Entrenar

In [None]:
# Candidate regressors, keyed by the display name used in reports and plots.
# Every model is seeded with RANDOM_STATE and configured with n_jobs=-1.
models = {
    'Random Forest': RandomForestRegressor(
        n_estimators=100,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbose=0
    ),
    'XGBoost': XGBRegressor(
        n_estimators=100,
        max_depth=7,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbosity=0
    ),
    'LightGBM': LGBMRegressor(
        n_estimators=100,
        max_depth=7,
        learning_rate=0.1,
        subsample=0.8,
        # LightGBM only performs row subsampling (bagging) when
        # subsample_freq > 0; with its default of 0 the subsample=0.8 above
        # would be ignored. Resample at every boosting iteration.
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbose=-1
    )
}

# Print the display name and estimator class of each configured model.
print("=" * 80)
print("MODELOS CONFIGURADOS")
print("=" * 80)
for name, model in models.items():
    print(f"\n{name}:")
    print(f"  {model.__class__.__name__}")
print("\n" + "=" * 80)

## 5. Función de Evaluación de Modelos

In [None]:
def evaluate_model(y_true, y_pred, model_name="Model"):
    """
    Compute a set of regression metrics for one model's predictions.

    Args:
        y_true: Observed target values (array-like).
        y_pred: Predicted target values (array-like, same number of values
            as y_true).
        model_name: Label stored under the 'Model' key of the result.

    Returns:
        dict: Keys 'Model', 'MAE', 'MSE', 'RMSE', 'R2' and 'MAPE (%)'.
            'MAPE (%)' is averaged over the samples whose true value is
            non-zero, and is NaN when there is no such sample.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    # MAPE (Mean Absolute Percentage Error). Work on flat float arrays so the
    # subtraction is strictly positional: pandas objects would align on index
    # labels, and an (n,) vs (n, 1) pair would broadcast to (n, n).
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    # A zero target makes the relative error undefined (division by zero),
    # so those samples are left out of the average.
    nonzero = actual != 0
    if nonzero.any():
        relative_errors = np.abs(
            (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
        )
        mape = np.mean(relative_errors) * 100
    else:
        mape = float('nan')

    metrics = {
        'Model': model_name,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2,
        'MAPE (%)': mape
    }

    return metrics

print("‚úì Funci√≥n de evaluaci√≥n definida")

## 6. Entrenar y Evaluar Modelos

In [None]:
# Accumulators for everything produced while training:
#   train_metrics / test_metrics: one evaluate_model() dict per model
#   trained_models:               fitted estimator per model name
#   predictions:                  train/test predictions per model name
#   training_times:               fit duration in seconds per model name
results = {
    'train_metrics': [],
    'test_metrics': [],
    'trained_models': {},
    'predictions': {},
    'training_times': {}
}

print("=" * 80)
print("ENTRENANDO MODELOS")
print("=" * 80)

for model_name, model in models.items():
    print(f"\n{'='*80}")
    print(f"Entrenando: {model_name}")
    print(f"{'='*80}")

    # Time only the fit. perf_counter is a monotonic, high-resolution clock,
    # so the measured duration is not affected by system clock adjustments.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time
    results['training_times'][model_name] = training_time

    print(f"‚úì Entrenamiento completado en {training_time:.2f} segundos")

    # Predict on both splits so train and test metrics can be compared.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Keep the raw predictions for the plots produced later on.
    results['predictions'][model_name] = {
        'train': y_train_pred,
        'test': y_test_pred
    }

    # Metrics on the training split.
    train_metrics = evaluate_model(y_train, y_train_pred, model_name)
    results['train_metrics'].append(train_metrics)

    # Metrics on the held-out test split.
    test_metrics = evaluate_model(y_test, y_test_pred, model_name)
    results['test_metrics'].append(test_metrics)

    # Keep the fitted estimator so the best one can be selected and saved.
    results['trained_models'][model_name] = model

    # Per-model report for both splits.
    print(f"\nM√©tricas en TRAIN:")
    print(f"  MAE:  ${train_metrics['MAE']:,.2f}")
    print(f"  RMSE: ${train_metrics['RMSE']:,.2f}")
    print(f"  R¬≤:   {train_metrics['R2']:.4f}")

    print(f"\nM√©tricas en TEST:")
    print(f"  MAE:  ${test_metrics['MAE']:,.2f}")
    print(f"  RMSE: ${test_metrics['RMSE']:,.2f}")
    print(f"  R¬≤:   {test_metrics['R2']:.4f}")

print(f"\n{'='*80}")
print("‚úÖ TODOS LOS MODELOS ENTRENADOS")
print(f"{'='*80}")

## 7. Comparación de Resultados

In [None]:
# Tabulate the per-model metric dicts collected during training.
train_results_df, test_results_df = (
    pd.DataFrame(results[metrics_key])
    for metrics_key in ('train_metrics', 'test_metrics')
)

# Attach each model's fit duration to the training table, looked up by name.
fit_seconds = train_results_df['Model'].map(results['training_times'])
train_results_df['Training Time (s)'] = fit_seconds

rule = "=" * 80

# Training-set table.
print(rule)
print("RESULTADOS EN CONJUNTO DE ENTRENAMIENTO")
print(rule)
print(train_results_df.to_string(index=False))

# Test-set table, separated from the previous one by a blank line.
print("\n" + rule)
print("RESULTADOS EN CONJUNTO DE PRUEBA")
print(rule)
print(test_results_df.to_string(index=False))

### 7.1 Visualización de Comparación de Métricas

In [None]:
# 2x2 grid of bar charts, one test-set metric per panel.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Comparaci√≥n de Modelos - M√©tricas en Test Set', fontsize=16, fontweight='bold')

# (metric column, panel title, y-axis label, bar colour), in row-major order:
# top-left, top-right, bottom-left, bottom-right.
panel_specs = [
    ('MAE', 'Mean Absolute Error (MAE)', 'MAE ($)', 'skyblue'),
    ('RMSE', 'Root Mean Squared Error (RMSE)', 'RMSE ($)', 'lightcoral'),
    ('R2', 'R¬≤ Score', 'R¬≤', 'lightgreen'),
    ('MAPE (%)', 'Mean Absolute Percentage Error (MAPE)', 'MAPE (%)', 'plum'),
]

for panel, (metric, panel_title, y_label, bar_color) in zip(axes.flat, panel_specs):
    panel.bar(test_results_df['Model'], test_results_df[metric], color=bar_color, edgecolor='black')
    panel.set_title(panel_title, fontweight='bold')
    panel.set_ylabel(y_label)
    panel.tick_params(axis='x', rotation=45)
    panel.grid(True, alpha=0.3)
    if metric == 'R2':
        # The R2 panel is the only one with a fixed y-axis range.
        panel.set_ylim([0, 1])

plt.tight_layout()
plt.show()

## 8. Seleccionar el Mejor Modelo

In [None]:
# The winner is the row with the lowest RMSE on the test set.
best_model_idx = test_results_df['RMSE'].idxmin()
best_row = test_results_df.loc[best_model_idx]
best_model_name = best_row['Model']
best_model = results['trained_models'][best_model_name]

# Report the selected model together with its test-set metrics.
rule = "=" * 80
print(rule)
print("MEJOR MODELO SELECCIONADO")
print(rule)
print(f"\nModelo: {best_model_name}")
print(f"\nM√©tricas en Test Set:")
print(f"  MAE:  ${best_row['MAE']:,.2f}")
print(f"  RMSE: ${best_row['RMSE']:,.2f}")
print(f"  R¬≤:   {best_row['R2']:.4f}")
print(f"  MAPE: {best_row['MAPE (%)']:.2f}%")
print("\n" + rule)

## 9. Visualización de Predicciones del Mejor Modelo

In [None]:
# Test-set predictions of the selected model.
y_test_pred_best = results['predictions'][best_model_name]['test']

# Side-by-side figure: predicted-vs-actual scatter and error histogram.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
scatter_ax, hist_ax = axes

# Left panel: points lying on the dashed diagonal are perfect predictions.
target_range = [y_test.min(), y_test.max()]
scatter_ax.scatter(y_test, y_test_pred_best, alpha=0.5, s=20)
scatter_ax.plot(target_range, target_range, 'r--', lw=2, label='Predicci√≥n perfecta')
scatter_ax.set_xlabel('Valores Reales ($)', fontsize=12)
scatter_ax.set_ylabel('Predicciones ($)', fontsize=12)
scatter_ax.set_title(f'Predicciones vs Valores Reales - {best_model_name}', fontsize=14, fontweight='bold')
scatter_ax.legend()
scatter_ax.grid(True, alpha=0.3)

# Right panel: distribution of the errors (actual minus predicted), with a
# dashed reference line at zero.
errors = y_test - y_test_pred_best
hist_ax.hist(errors, bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(x=0, color='r', linestyle='--', linewidth=2, label='Error = 0')
hist_ax.set_xlabel('Error de Predicci√≥n ($)', fontsize=12)
hist_ax.set_ylabel('Frecuencia', fontsize=12)
hist_ax.set_title(f'Distribuci√≥n de Errores - {best_model_name}', fontsize=14, fontweight='bold')
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics of the errors.
print(f"\nEstad√≠sticas de Errores ({best_model_name}):")
for stat_label, stat_value in (
    ("Media de errores", errors.mean()),
    ("Desv. Std. de errores", errors.std()),
    ("Error m√≠nimo", errors.min()),
    ("Error m√°ximo", errors.max()),
):
    print(f"  {stat_label}: ${stat_value:,.2f}")

## 10. Importancia de Features (si aplica)

In [None]:
# Feature importances can only be shown for estimators that expose the
# feature_importances_ attribute; otherwise just say so.
if not hasattr(best_model, 'feature_importances_'):
    print(f"\nEl modelo {best_model_name} no tiene atributo 'feature_importances_'")
else:
    importances = best_model.feature_importances_

    # Pair each feature name with its importance, most important first.
    feature_importance_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': importances}
    )
    feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)

    # Horizontal bar chart of the 15 highest-ranked features.
    top_features = feature_importance_df.head(15)
    bar_positions = range(len(top_features))

    plt.figure(figsize=(12, 8))
    plt.barh(bar_positions, top_features['Importance'], align='center')
    plt.yticks(bar_positions, top_features['Feature'])
    plt.xlabel('Importancia', fontsize=12)
    plt.ylabel('Feature', fontsize=12)
    plt.title(f'Top 15 Features M√°s Importantes - {best_model_name}', fontsize=14, fontweight='bold')
    # Flip the y-axis so the most important feature is drawn at the top.
    plt.gca().invert_yaxis()
    plt.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    plt.show()

    # Text table of the ten highest-ranked features.
    print("\nTop 10 Features M√°s Importantes:")
    print(feature_importance_df.head(10).to_string(index=False))

## 11. Guardar el Mejor Modelo

In [None]:
# A single timestamp is embedded in both file names written by this cell.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Model file name: lower-cased model name with spaces turned into underscores.
model_slug = best_model_name.replace(' ', '_').lower()
model_filename = f"best_model_{model_slug}_{timestamp}.pkl"
model_path = os.path.join(MODEL_OUTPUT_DIR, model_filename)

# Persist the selected estimator.
joblib.dump(best_model, model_path)

rule = "=" * 80
print(rule)
print("MODELO GUARDADO")
print(rule)
print(f"Modelo: {best_model_name}")
print(f"Ruta: {model_path}")
# File size on disk, converted from bytes to KB.
print(f"Tama√±o: {os.path.getsize(model_path) / 1024:.2f} KB")
print(rule)

# Persist the training results next to the model.
results_filename = f"training_results_{timestamp}.pkl"
results_path = os.path.join(MODEL_OUTPUT_DIR, results_filename)

# Only the metric tables, feature names and timings are stored here.
results_to_save = dict(
    best_model_name=best_model_name,
    train_metrics=train_results_df,
    test_metrics=test_results_df,
    feature_names=feature_names,
    training_times=results['training_times'],
)

joblib.dump(results_to_save, results_path)
print(f"\n‚úì Resultados de entrenamiento guardados en: {results_path}")

## 12. Resumen Final

In [None]:
# Final recap of the run: dataset sizes, models trained, winner, saved files.
rule = "=" * 80
print(rule)
print("RESUMEN DEL ENTRENAMIENTO DE MODELOS")
print(rule)

# Dataset sizes.
n_train, n_test = len(y_train), len(y_test)
print(f"\nüìä Dataset:")
print(f"  ‚Ä¢ Total de muestras: {n_train + n_test:,}")
print(f"  ‚Ä¢ Muestras de entrenamiento: {n_train:,}")
print(f"  ‚Ä¢ Muestras de prueba: {n_test:,}")
print(f"  ‚Ä¢ Features totales: {len(feature_names)}")

# Names of the models that were trained.
print(f"\nü§ñ Modelos entrenados: {len(models)}")
for model_name in models:
    print(f"  ‚Ä¢ {model_name}")

# Headline test metrics and fit time of the selected model.
best_row = test_results_df.loc[best_model_idx]
best_fit_seconds = results['training_times'][best_model_name]
print(f"\nüèÜ Mejor Modelo: {best_model_name}")
print(f"  ‚Ä¢ RMSE: ${best_row['RMSE']:,.2f}")
print(f"  ‚Ä¢ R¬≤: {best_row['R2']:.4f}")
print(f"  ‚Ä¢ Tiempo de entrenamiento: {best_fit_seconds:.2f}s")

# Files written by the persistence step.
print(f"\nüíæ Archivos guardados:")
print(f"  ‚Ä¢ Modelo: {model_filename}")
print(f"  ‚Ä¢ Resultados: {results_filename}")

# Closing remarks.
print(f"\nüìà Observaciones:")
print(f"  ‚Ä¢ Todos los modelos han sido entrenados exitosamente")
print(f"  ‚Ä¢ El modelo con mejor desempe√±o ha sido guardado")
print(f"  ‚Ä¢ Los resultados est√°n listos para evaluaci√≥n detallada")

print(f"\n‚úÖ Entrenamiento completado exitosamente")
print(rule)