In [1]:
# Importar librerías
import json
import time
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [2]:
# Load the processed dataset and the list of selected feature names.
processed_data_path = '../../data/processed/final_processed_data.parquet'
features_path = '../../data/processed/final_features.json'

df = pd.read_parquet(processed_data_path)
# Decode the JSON explicitly as UTF-8 so the result does not depend on the
# platform's locale encoding (relevant if feature names contain non-ASCII
# characters).
with open(features_path, 'r', encoding='utf-8') as f:
    final_features = json.load(f)

print("Datos y características cargados.")

Datos y características cargados.


In [3]:
# Prepare data: select the feature matrix and the regression target.
target_column = 'precio_mxn'
X = df[final_features]
y = df[target_column]

# Keep only rows where every feature and the target are present. The mask is
# applied unconditionally: when nothing is missing it simply selects every row,
# so no separate "are there any nulls?" pre-check is needed.
mask = X.notna().all(axis=1) & y.notna()
X = X[mask]
y = y[mask]

# Hold out 20% of the rows for the final evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline and the hyperparameter grid.
# NOTE(review): both the regressor and GridSearchCV request n_jobs=-1. The
# XGBoost docs warn that parallelising both layers can cause thread contention;
# consider setting one of them to 1 if tuning is slower than expected.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', XGBRegressor(random_state=42, n_jobs=-1, objective='reg:squarederror'))
])

# Keys use the "<step name>__<parameter>" convention so they reach the
# 'regressor' step of the pipeline. 2 x 2 x 2 = 8 candidate combinations.
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [5, 7],
    'regressor__learning_rate': [0.1, 0.05]
}

# Hyperparameter search: 3-fold cross-validation scored by R².
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='r2', n_jobs=-1, verbose=2)

print("Iniciando ajuste de hiperparámetros para XGBoost...")
start_time = time.time()
grid_search.fit(X_train, y_train)
tuning_time = time.time() - start_time
print(f"Ajuste completado en {tuning_time:.2f} segundos.")

# Best parameters and mean cross-validated score.
print("Mejores parámetros encontrados:", grid_search.best_params_)
print("Mejor R² (CV):", grid_search.best_score_)

# Evaluate the best pipeline on the held-out test set.
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)
test_r2 = r2_score(y_test, y_pred_test)
print(f"R² en conjunto de prueba: {test_r2:.4f}")

# Persist the best pipeline. Create the target directory first so the dump
# does not fail with FileNotFoundError on a fresh checkout, after the whole
# search has already been paid for.
model_path = '../../models/xgboost_tuned_pipeline.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)
print(f"\nModelo optimizado guardado en: {model_path}")

Iniciando ajuste de hiperparámetros para XGBoost...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END regressor__learning_rate=0.1, regressor__max_depth=5, regressor__n_estimators=100; total time=   0.1s
[CV] END regressor__learning_rate=0.1, regressor__max_depth=5, regressor__n_estimators=100; total time=   0.1s
[CV] END regressor__learning_rate=0.1, regressor__max_depth=5, regressor__n_estimators=100; total time=   0.1s
[CV] END regressor__learning_rate=0.1, regressor__max_depth=5, regressor__n_estimators=200; total time=   0.1s
[CV] END regressor__learning_rate=0.1, regressor__max_depth=7, regressor__n_estimators=100; total time=   0.1s
[CV] END regressor__learning_rate=0.1, regressor__max_depth=5, regressor__n_estimators=200; total time=   0.1s
[CV] END regressor__learning_rate=0.05, regressor__max_depth=5, regressor__n_estimators=100; total time=   0.1s
[CV] END regressor__learning_rate=0.1, regressor__max_depth=7, regressor__n_estimators=200; total time=   0.2s