In [1]:
import os

import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# =================== Load data ===================
# Fetch the "diamonds" dataset through seaborn's dataset loader.
df = sns.load_dataset("diamonds")

# Cast the categorical feature columns to plain strings.
categorical_cols = ['color', 'clarity']
df = df.astype({column: str for column in categorical_cols})

# Split into target and features: 'cut' is the label to predict, and every
# remaining column is kept as a feature.
y = df["cut"]
X = df.drop(columns=["cut"])

In [2]:
# =================== Preprocessing ===================
# Numeric features go through a StandardScaler; categorical features go through
# a OneHotEncoder configured with handle_unknown='ignore'.
numerical_pipeline = make_pipeline(StandardScaler())
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

numeric_features = ["carat", "depth", "table", "x", "y", "z", "price"]
categorical_features = ["color", "clarity"]

column_transformer = make_column_transformer(
    (numerical_pipeline, numeric_features),
    (categorical_pipeline, categorical_features),
)

# Hold out 20% of the rows as a test set, stratified on the target and seeded
# so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [3]:
# =================== Train models ===================
# Each candidate is a pipeline of the preprocessing step followed by a
# classifier. The preprocessing step is cloned for every pipeline so that no
# two pipelines hold (and refit) the very same transformer object.
modelos = {
    "Random Forest": make_pipeline(
        clone(column_transformer),
        RandomForestClassifier(n_estimators=100, random_state=42),
    ),
    "Gradient Boosting": make_pipeline(
        clone(column_transformer),
        GradientBoostingClassifier(n_estimators=100, random_state=42),
    ),
    # probability=True makes SVC shuffle the data internally to calibrate
    # probability estimates, so it is seeded like the other two models.
    # NOTE(review): a linear-kernel SVC with probability=True may be slow on
    # the full training split; confirm the runtime is acceptable.
    "SVM": make_pipeline(
        clone(column_transformer),
        SVC(kernel='linear', probability=True, random_state=42),
    ),
}

# Maps model name -> {"Modelo": fitted pipeline, "Accuracy": test-set accuracy}.
resultados = {}

for nombre, modelo in modelos.items():
    print(f"Entrenando {nombre}...")
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)

    # Accuracy and the per-class report are both computed on the held-out
    # test split.
    acc = accuracy_score(y_test, y_pred)
    print(f"Precisión de {nombre}: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    resultados[nombre] = {
        "Modelo": modelo,
        "Accuracy": acc
    }

Entrenando Random Forest...
Precisión de Random Forest: 0.7678
              precision    recall  f1-score   support

        Fair       0.90      0.84      0.87       322
        Good       0.77      0.68      0.72       981
       Ideal       0.82      0.92      0.87      4310
     Premium       0.73      0.81      0.77      2758
   Very Good       0.67      0.47      0.56      2417

    accuracy                           0.77     10788
   macro avg       0.78      0.74      0.76     10788
weighted avg       0.76      0.77      0.76     10788

Entrenando Gradient Boosting...
Precisión de Gradient Boosting: 0.7578
              precision    recall  f1-score   support

        Fair       0.89      0.90      0.89       322
        Good       0.84      0.66      0.74       981
       Ideal       0.82      0.92      0.87      4310
     Premium       0.67      0.85      0.75      2758
   Very Good       0.69      0.39      0.50      2417

    accuracy                           0.76     107

In [5]:
# =================== Select the best model ===================
# Pick the entry of `resultados` with the highest stored "Accuracy" and take
# its fitted pipeline.
mejor_modelo_nombre = max(resultados, key=lambda x: resultados[x]["Accuracy"])
mejor_modelo = resultados[mejor_modelo_nombre]["Modelo"]

print(f"\nMejor modelo seleccionado: {mejor_modelo_nombre}")

# =================== Create the models folder if it does not exist ===================
models_dir = "../models"
# The pre-check only decides whether to print the "created" message; the
# creation itself uses exist_ok=True, so it does not fail if the folder
# appears between the check and the call.
carpeta_nueva = not os.path.isdir(models_dir)
os.makedirs(models_dir, exist_ok=True)
if carpeta_nueva:
    print(f"Carpeta '{models_dir}' creada.")

# =================== Export the model ===================
# Persist the selected fitted pipeline with joblib.
model_path = os.path.join(models_dir, "model_classification.joblib")
joblib.dump(mejor_modelo, model_path)
print(f"Modelo guardado como '{model_path}'")


Mejor modelo seleccionado: Random Forest
Carpeta '../models' creada.
Modelo guardado como '../models/model_classification.joblib'
