In [None]:
# Importar librerías
import json
import time
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the processed dataset and the list of feature column names used as model inputs.
processed_data_path = '../../data/processed/final_processed_data.parquet'
features_path = '../../data/processed/final_features.json'

df = pd.read_parquet(processed_data_path)
# Decode the JSON explicitly as UTF-8: without it open() falls back to the
# locale's preferred encoding, so non-ASCII column names could be read
# differently from one machine to another.
with open(features_path, 'r', encoding='utf-8') as f:
    final_features = json.load(f)

print("Datos y características cargados.")

In [None]:
# Build the feature matrix and the target vector from the loaded frame.
target_column = 'precio_mxn'
X = df[final_features]
y = df[target_column]

# Only filter when something is actually missing; a row is kept when every
# feature and the target are present.
if X.isnull().values.any() or y.isnull().any():
    mask = X.notnull().all(axis=1) & y.notnull()
    X = X[mask]
    y = y[mask]

# Hold out 20% of the rows for the final evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Modelling pipeline: standardise the features, then fit a random forest regressor.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(n_jobs=-1, random_state=42)),
])

# Candidate hyperparameter values (2 x 2 x 2 = 8 combinations). The
# ``regressor__`` prefix addresses the pipeline step named 'regressor'.
param_grid = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[10, 20],
    regressor__min_samples_leaf=[1, 2],
)

# Hyperparameter search with 3-fold cross-validation, scored by R².
# The regressor inside the pipeline already uses every core (n_jobs=-1), so the
# search evaluates one fit at a time: requesting all cores at both levels would
# start roughly cores x cores workers and oversubscribe the machine.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='r2', n_jobs=1, verbose=2)

print("Iniciando ajuste de hiperparámetros para Random Forest...")
# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments while the search is running.
start_time = time.perf_counter()
grid_search.fit(X_train, y_train)
tuning_time = time.perf_counter() - start_time
print(f"Ajuste completado en {tuning_time:.2f} segundos.")

print("Mejores parámetros encontrados:", grid_search.best_params_)
print(f"Mejor R² (CV): {grid_search.best_score_:.4f}")

# Take the best pipeline selected by the grid search and predict on both splits.
best_model = grid_search.best_estimator_
y_pred_train, y_pred_test = (
    best_model.predict(features) for features in (X_train, X_test)
)

def evaluate_model(y_true, y_pred):
    """Compute regression metrics for a set of predictions.

    Args:
        y_true: Observed target values, passed unchanged to the metric functions.
        y_pred: Predicted target values, passed unchanged to the metric functions.

    Returns:
        dict with keys 'mse' (mean squared error), 'rmse' (square root of the
        MSE), 'mae' (mean absolute error) and 'r2' (coefficient of determination).
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        'mse': squared_error,
        'rmse': np.sqrt(squared_error),  # same units as the target
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }

# Score both splits with the same metric set so they can be compared side by side.
train_metrics = evaluate_model(y_train, y_pred_train)
test_metrics = evaluate_model(y_test, y_pred_test)
print(f"R² en prueba (modelo optimizado): {test_metrics['r2']:.4f}")

# Persist the fitted pipeline and its metrics.
model_path = '../../models/random_forest_tuned_pipeline.joblib'
metrics_path = '../../models/random_forest_tuned_metrics.json'
# Create the output directories up front: otherwise a missing folder would
# raise FileNotFoundError only after the expensive search has already finished.
for output_path in (model_path, metrics_path):
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)
results = {'train_metrics': train_metrics, 'test_metrics': test_metrics, 'training_time': tuning_time, 'best_params': grid_search.best_params_}
with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(results, f)

print(f"\nModelo optimizado guardado en: {model_path}")
print(f"Métricas guardadas en: {metrics_path}")