# Modelo de Regresión

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# =================== Cargar Datos ===================
df = sns.load_dataset("diamonds")

# Definir variables predictoras y objetivo
X = df.drop(columns=["price"])
y = df["price"]

# Identificar tipos de columnas
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [2]:
# =================== Preprocesamiento ===================
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False)
)

preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_cols),
    (categorical_pipeline, categorical_cols)
)

In [3]:
# =================== División de Datos ===================
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# =================== Entrenamiento de Modelos ===================
modelos = {
    "Regresión Lineal": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

mejores_resultados = {}

for nombre, modelo in modelos.items():
    pipeline = make_pipeline(preprocessor, modelo)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    mejores_resultados[nombre] = {
        "Modelo": pipeline,
        "MAE": mae,
        "RMSE": rmse,
        "R²": r2
    }
    
    print(f"{nombre}: MAE={mae:.2f}, RMSE={rmse:.2f}, R²={r2:.2f}")
    

Regresión Lineal: MAE=737.15, RMSE=1288705.48, R²=0.92
Random Forest: MAE=270.16, RMSE=303125.65, R²=0.98
Gradient Boosting: MAE=398.09, RMSE=517483.05, R²=0.97


In [4]:
# =================== Selección del Mejor Modelo ===================
import os


mejor_modelo_nombre = min(mejores_resultados, key=lambda x: mejores_resultados[x]["MAE"])
mejor_modelo = mejores_resultados[mejor_modelo_nombre]["Modelo"]

print(f"\nMejor modelo seleccionado: {mejor_modelo_nombre}")

# =================== Crear carpeta models si no existe ===================
models_dir = "../models"
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
    print(f"Carpeta '{models_dir}' creada.")

# =================== Exportar el Modelo ===================
model_path = os.path.join(models_dir, "model_regression.joblib")
joblib.dump(mejor_modelo, model_path)
print(f"Modelo guardado como '{model_path}'")


Mejor modelo seleccionado: Random Forest
Modelo guardado como '../models/model_regression.joblib'
