In [None]:
from IPython.display import display, HTML

# Title banner rendered as raw HTML at the top of the notebook.
# The icon and the separators are written as HTML entities (&#129302; is the
# robot-face emoji, &middot; is the middle dot) so the markup stays pure ASCII
# and cannot be garbled by a wrong file encoding.
html = """
<div style="background:linear-gradient(135deg,#1a1a2e,#16213e,#0f3460);border-radius:16px;padding:40px;text-align:center;font-family:'Segoe UI',Arial,sans-serif;box-shadow:0 8px 32px rgba(0,0,0,0.4);">
  <div style="font-size:2.5em;">&#129302;</div>
  <h1 style="color:#e0e0e0;font-size:2em;margin:8px 0;">Modelado y Evaluacion</h1>
  <p style="color:#a0aec0;letter-spacing:2px;text-transform:uppercase;font-size:0.95em;">Clasificacion &middot; Balanceo &middot; GridSearchCV &middot; Evaluacion</p>
  <div style="background:rgba(255,255,255,0.06);border:1px solid rgba(255,255,255,0.1);border-radius:10px;padding:16px 24px;max-width:650px;margin:24px auto 0;text-align:left;color:#cbd5e0;font-size:0.93em;line-height:1.7;">
    Entrenamos y comparamos multiples modelos de clasificacion sobre los textos legales vectorizados.
    Manejamos el desbalance extremo de clases con agrupacion, Random Oversampling y SMOTE.
    Optimizamos el mejor modelo con GridSearchCV.
  </div>
</div>
"""
display(HTML(html))


## 1. Carga de Artefactos

Cargamos los artefactos generados por `02_preprocesamiento.ipynb`.

In [None]:
import joblib
import pandas as pd
import numpy as np

# Inputs for this notebook: two joblib pickles (features and labels) and the
# cleaned dataframe stored as CSV.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files
# produced by this project's own preprocessing step.
X = joblib.load("../models/X_tfidf.pkl")
y = joblib.load("../models/y_labels.pkl")
df = pd.read_csv("../data/raw/df_limpio.csv")

# Quick summary: matrix dimensions, number of distinct labels and the
# per-label counts.
print(
    f"X shape: {X.shape}",
    f"Clases: {y.nunique()}",
    y.value_counts(),
    sep="\n",
)


## 2. Split Estratificado

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split; stratifying on y keeps the label proportions the same
# in both partitions, and the fixed seed makes the split repeatable.
opciones_split = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **opciones_split)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Train: {n_train:,}  |  Test: {n_test:,}")


## 3. Comparación de Modelos

Comparamos cuatro algoritmos usando F1 Macro como métrica principal, ya que penaliza por igual el mal desempeño en clases minoritarias.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
import matplotlib.pyplot as plt

# Candidate classifiers, all fitted on the same training matrix.
# Every estimator that has a stochastic component is seeded so that re-running
# the notebook yields the same ranking.
modelos = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced", C=1.0),
    "Naive Bayes":         MultinomialNB(alpha=0.1),
    # LinearSVC shuffles samples in its dual solver; without random_state the
    # fitted coefficients (and therefore the scores) can vary between runs.
    "LinearSVC":           LinearSVC(class_weight="balanced", max_iter=2000, C=1.0, random_state=42),
    "Random Forest":       RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=42, n_jobs=-1),
}

# NOTE(review): the ranking below is computed on the held-out test split, so
# the same split is used both to pick a model family and to report its score.
# Consider ranking with cross-validation on the training split instead.
resultados = {}
for nombre, modelo in modelos.items():
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    acc      = np.mean(y_pred == y_test)
    f1_macro = f1_score(y_test, y_pred, average="macro", zero_division=0)
    f1_w     = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    resultados[nombre] = {"Accuracy": acc, "F1 Macro": f1_macro, "F1 Weighted": f1_w}
    print(f"\n{'='*40}\n  {nombre}\n  Accuracy: {acc:.4f}  F1 Macro: {f1_macro:.4f}")

# One row per model, best F1 Macro first.
df_res = pd.DataFrame(resultados).T.sort_values("F1 Macro", ascending=False)
print("\n=== Ranking ===")
print(df_res.round(4))

# Horizontal bar chart of F1 Macro per model, saved to ../reports.
import os

# savefig does not create missing directories; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs("../reports", exist_ok=True)

fig, ax = plt.subplots(figsize=(9, 5))
# Ascending order so the best model ends up as the top bar.
df_res_s = df_res.sort_values("F1 Macro")
ax.barh(df_res_s.index, df_res_s["F1 Macro"], color="#4299e1", alpha=0.85)
ax.set_xlabel("F1 Macro")
ax.set_title("Comparacion de Modelos - F1 Macro")
ax.xaxis.grid(True, alpha=0.4)
# Print the numeric score just to the right of each bar.
for i, v in enumerate(df_res_s["F1 Macro"]):
    ax.text(v + 0.003, i, f"{v:.4f}", va="center", fontsize=9)
plt.tight_layout()
plt.savefig("../reports/model_comparison.png", dpi=150, bbox_inches="tight")
plt.show()


## 4. Optimización con GridSearchCV

Buscamos el mejor hiperparámetro C para LinearSVC usando validación cruzada de 5 pliegues optimizando F1 Macro.

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over the regularisation strength C of a
# class-weighted LinearSVC, scored by macro-averaged F1.
# random_state is fixed so every candidate (and the refit best estimator) is
# trained deterministically; otherwise the selected C and the reported scores
# could change between runs.
grid_search = GridSearchCV(
    LinearSVC(class_weight="balanced", max_iter=3000, random_state=42),
    {"C": [0.01, 0.1, 1.0, 10.0]},
    cv=5, scoring="f1_macro", n_jobs=-1, verbose=1
)
grid_search.fit(X_train, y_train)

# best_estimator_ is the winning configuration refit on the whole training split.
mejor_modelo = grid_search.best_estimator_
y_pred_best  = mejor_modelo.predict(X_test)

print(f"Mejor C: {grid_search.best_params_}")
print(f"F1 Macro CV: {grid_search.best_score_:.4f}")
print("\n=== LinearSVC Optimizado ===")
print(classification_report(y_test, y_pred_best, zero_division=0))


## 5. Manejo de Desbalance

**Paso 1:** Agrupamos las 3 clases con menos de 200 ejemplos en "otros" (de 10 a 8 clases).

**Paso 2:** Aplicamos oversampling SOLO sobre train para balancear la representación.

In [None]:
import matplotlib.pyplot as plt

# Grouping: the listed outcomes are merged into a single "otros" label; every
# other outcome keeps its original name.
clases_min = ["affirmed", "approved", "related"]


def _agrupar_outcome(etiqueta):
    """Return "otros" for labels listed in clases_min, otherwise the label itself."""
    if etiqueta in clases_min:
        return "otros"
    return etiqueta


df["case_outcome_agrupado"] = df["case_outcome"].apply(_agrupar_outcome)

# Label frequencies before and after grouping.
dist_orig = df["case_outcome"].value_counts()
dist_agr  = df["case_outcome_agrupado"].value_counts()

# Side-by-side bar charts; each panel gets its own colour and title, and every
# bar is annotated with its count.
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
paneles = (
    (axes[0], dist_orig, "#5bc0de", "Original (10 clases)"),
    (axes[1], dist_agr,  "#5cb85c", "Agrupado (8 clases)"),
)
for eje, distribucion, color, titulo in paneles:
    eje.barh(distribucion.index, distribucion.values, color=color, alpha=0.85)
    eje.set_title(titulo)
    for posicion, conteo in enumerate(distribucion.values):
        eje.text(conteo + 30, posicion, str(conteo), va="center", fontsize=9)

plt.suptitle("Agrupamiento de Clases Minoritarias", fontsize=13)
plt.tight_layout()
plt.savefig("../reports/distribucion_clases.png", dpi=150, bbox_inches="tight")
plt.show()

# Imbalance ratio (largest class size divided by smallest, integer division).
ratio_antes   = dist_orig.max() // dist_orig.min()
ratio_despues = dist_agr.max() // dist_agr.min()
print(f"Ratio max/min antes:  {ratio_antes}x")
print(f"Ratio max/min despues: {ratio_despues}x")


## 6. Balanceo con Random Oversampling

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score

# Re-vectorise the cleaned text and pair it with the grouped labels.
# df was read back from CSV, where empty text cells are parsed as NaN;
# TfidfVectorizer rejects NaN documents, so they are mapped back to "".
# NOTE(review): the vectorizer is fit on the full corpus before the split, so
# test documents influence the vocabulary/IDF. It is left this way because
# vectorizer_ag is persisted later in the notebook; consider fitting on the
# training texts only.
vectorizer_ag = TfidfVectorizer(max_features=10000, ngram_range=(1,2), min_df=3, max_df=0.85, sublinear_tf=True)
X_ag = vectorizer_ag.fit_transform(df["clean_text"].fillna(""))
y_ag = df["case_outcome_agrupado"]

X_train_ag, X_test_ag, y_train_ag, y_test_ag = train_test_split(
    X_ag, y_ag, test_size=0.2, random_state=42, stratify=y_ag
)

# Oversample the training split only; the test split keeps its original
# class distribution.
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train_ag, y_train_ag)

print(f"Train antes:  {X_train_ag.shape[0]:,} | despues: {X_train_ros.shape[0]:,}")
print("Distribucion despues del oversampling:")
print(pd.Series(y_train_ros).value_counts())

# No class_weight here: the resampled training set is already balanced.
# random_state makes the fit (and the reported scores) reproducible.
modelo_ros = LinearSVC(max_iter=3000, C=1.0, random_state=42)
modelo_ros.fit(X_train_ros, y_train_ros)
y_pred_ros = modelo_ros.predict(X_test_ag)

f1_ros  = f1_score(y_test_ag, y_pred_ros, average="macro", zero_division=0)
acc_ros = accuracy_score(y_test_ag, y_pred_ros)
print(f"\n=== Agrupacion + Random Oversampling ===")
print(classification_report(y_test_ag, y_pred_ros, zero_division=0))
print(f"F1 Macro: {f1_ros:.4f}  |  Accuracy: {acc_ros:.4f}")


## 7. Balanceo con SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE on the grouped-label training split.
# The TF-IDF matrices are kept sparse: both SMOTE.fit_resample and LinearSVC
# accept sparse input, and densifying a documents x vocabulary matrix can need
# gigabytes of memory for no benefit.
print("Aplicando SMOTE...")

# k_neighbors=3 limits the neighbourhood used to interpolate synthetic samples.
smote = SMOTE(random_state=42, k_neighbors=3)
X_train_sm, y_train_sm = smote.fit_resample(X_train_ag, y_train_ag)

print(f"Train antes: {X_train_ag.shape[0]:,} | despues: {X_train_sm.shape[0]:,}")

# random_state makes the fit (and the reported scores) reproducible.
modelo_smote = LinearSVC(max_iter=3000, C=1.0, random_state=42)
modelo_smote.fit(X_train_sm, y_train_sm)
# The test split is never resampled.
y_pred_smote = modelo_smote.predict(X_test_ag)

f1_smote  = f1_score(y_test_ag, y_pred_smote, average="macro", zero_division=0)
acc_smote = accuracy_score(y_test_ag, y_pred_smote)
print(f"\n=== Agrupacion + SMOTE ===")
print(classification_report(y_test_ag, y_pred_smote, zero_division=0))
print(f"F1 Macro: {f1_smote:.4f}  |  Accuracy: {acc_smote:.4f}")


## 8. Comparación Final de Enfoques

In [None]:
# Scores of the tuned LinearSVC on the original label set act as the baseline.
f1_baseline  = f1_score(y_test, y_pred_best, average="macro", zero_division=0)
acc_baseline = accuracy_score(y_test, y_pred_best)

# The three lists are positionally aligned: index k of each one describes the
# same approach.
enfoques = ["Baseline\n(10 cl.)", "Agrup. +\nRandom OS", "Agrup. +\nSMOTE"]
f1_vals  = [f1_baseline, f1_ros, f1_smote]
acc_vals = [acc_baseline, acc_ros, acc_smote]

# Grouped bar chart: F1 Macro on the left of each tick, Accuracy on the right.
x = np.arange(len(enfoques))
width = 0.35
fig, ax = plt.subplots(figsize=(10, 6))
b1 = ax.bar(x - width/2, f1_vals,  width, label="F1 Macro", color="#5cb85c", alpha=0.85)
b2 = ax.bar(x + width/2, acc_vals, width, label="Accuracy", color="#5bc0de", alpha=0.85)
ax.set_ylabel("Score")
ax.set_title("Comparacion de Enfoques de Balanceo")
ax.set_xticks(x)
ax.set_xticklabels(enfoques)
ax.legend()
ax.set_ylim(0, 1.0)
ax.yaxis.grid(True, alpha=0.4)

# Write each bar's value slightly above it (F1 bars first, then accuracy bars).
for bar in (*b1, *b2):
    altura = bar.get_height()
    centro = bar.get_x() + bar.get_width()/2
    ax.text(centro, altura + 0.01, f"{altura:.4f}", ha="center", va="bottom", fontsize=9)

plt.tight_layout()
plt.savefig("../reports/comparacion_balanceo.png", dpi=150, bbox_inches="tight")
plt.show()

# Plain-text summary table with scores formatted to four decimals.
resumen = pd.DataFrame({
    "Enfoque":  ["Baseline", "Agrup.+ROS", "Agrup.+SMOTE"],
    "F1 Macro": [format(valor, ".4f") for valor in f1_vals],
    "Accuracy": [format(valor, ".4f") for valor in acc_vals],
})
print(resumen.to_string(index=False))

# Position of the highest F1 Macro (first one wins on ties).
mejor_f1 = max(f1_vals)
idx = f1_vals.index(mejor_f1)
print(f"\nMejor F1 Macro: {resumen['Enfoque'].iloc[idx]} ({mejor_f1:.4f})")


## 9. Visualizaciones del Mejor Modelo

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns

# Confusion matrix of the tuned LinearSVC on the test split.
# Passing the sorted label list fixes the row/column order of the matrix and
# keeps it in sync with the tick labels.
clases_best = sorted(y.unique())
cm = confusion_matrix(y_test, y_pred_best, labels=clases_best)

fig, ax = plt.subplots(figsize=(12, 9))
visor_cm = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clases_best)
visor_cm.plot(ax=ax, colorbar=True, cmap="Blues", xticks_rotation=45)
ax.set_title("Matriz de Confusion - LinearSVC Optimizado", fontsize=13)

plt.tight_layout()
plt.savefig("../reports/confusion_matrix.png", dpi=150, bbox_inches="tight")
plt.show()


## 10. Guardado del Mejor Modelo

In [None]:
import json, os
from sklearn.metrics import accuracy_score

# Persist the tuned model, the vectorizer and a JSON summary of the metrics.
import warnings

# Both output directories are created up front; metrics.json goes to
# ../reports, which nothing else in this cell guarantees to exist.
os.makedirs("../models", exist_ok=True)
os.makedirs("../reports", exist_ok=True)

# mejor_modelo was trained on the pre-computed matrix loaded from
# X_tfidf.pkl, while vectorizer_ag was fit separately in this notebook.
# If their feature counts differ, the saved pair cannot be used together at
# inference time, so warn loudly instead of silently writing both files.
# NOTE(review): equal counts do not prove the vocabularies are identical;
# ideally persist the vectorizer that actually produced X_tfidf.pkl.
n_feat_modelo = mejor_modelo.coef_.shape[1]
n_feat_vectorizer = len(vectorizer_ag.vocabulary_)
if n_feat_modelo != n_feat_vectorizer:
    warnings.warn(
        f"best_model.pkl espera {n_feat_modelo} features pero "
        f"tfidf_vectorizer_final.pkl produce {n_feat_vectorizer}; "
        "el par guardado no es compatible."
    )

joblib.dump(mejor_modelo,  "../models/best_model.pkl")
joblib.dump(vectorizer_ag, "../models/tfidf_vectorizer_final.pkl")

# Test-split metrics of the tuned model plus the macro F1 of both
# oversampling experiments, rounded to four decimals.
metricas = {
    "modelo": "LinearSVC optimizado",
    "mejor_C": grid_search.best_params_,
    "accuracy":    round(accuracy_score(y_test, y_pred_best), 4),
    "f1_macro":    round(f1_score(y_test, y_pred_best, average="macro", zero_division=0), 4),
    "f1_weighted": round(f1_score(y_test, y_pred_best, average="weighted", zero_division=0), 4),
    "f1_ros":   round(f1_ros, 4),
    "f1_smote": round(f1_smote, 4),
}
# Explicit encoding so the file is written identically on every platform.
with open("../reports/metrics.json", "w", encoding="utf-8") as f:
    json.dump(metricas, f, indent=2)

print("Modelo guardado en ../models/best_model.pkl")
print("Metricas guardadas en ../reports/metrics.json")
print(json.dumps(metricas, indent=2))
