## Modeling

In [1]:
# ============================================================
# IMEDIA · Equipo EMI · Training + HPO + Registry (Hyperopt + MLflow)
# Auto-resolución de tracking (Databricks si hay auth; local si no)
# ============================================================
import os, sys, json, math, time, hashlib, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pathlib import Path
from datetime import datetime

# ML/Preproc
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Hyperopt
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

# MLflow
import mlflow
from mlflow import MlflowClient
from mlflow.models.signature import infer_signature

# .env
from dotenv import load_dotenv

# -------------------------------
# Base configuration / reproducibility
# -------------------------------
SEED = 42
np.random.seed(SEED)

EXPERIMENT_NAME = "/Users/marianasgg19@gmail.com/EMI/imedia/experiment"   # <-- adjust as needed
# NOTE(review): MODEL_NAME is not referenced by any code visible in this cell
# (the registration step builds its own dotted name); kept for compatibility.
MODEL_NAME = "workspace.default/EMI_imedia_model"                         # suggested naming

# Local folder where the fitted preprocessor is saved (created on first run).
PREPROC_LOCAL_DIR = Path("../preprocesador")
PREPROC_LOCAL_DIR.mkdir(parents=True, exist_ok=True)

# -------------------------------
# Load the prepared datasets
# -------------------------------
data_dir = Path("../data/processed")
train_csv = data_dir / "train_posts_clean.csv"
test_csv  = data_dir / "test_posts_clean.csv"

# Fail fast with a real exception: `assert` statements are removed when Python
# runs with -O, which would let a missing file surface later as a less clear error.
for _csv_path in (train_csv, test_csv):
    if not _csv_path.exists():
        raise FileNotFoundError(f"No existe {_csv_path}")

train_df = pd.read_csv(train_csv)
test_df  = pd.read_csv(test_csv)

# -------------------------------
# Dataset metadata (basic versioning)
# -------------------------------
def file_md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is opened in binary mode and consumed in 8192-byte chunks so the
    whole content never has to be held in memory at once.
    """
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:  # empty bytes object signals end of file
                break
            digest.update(block)
    return digest.hexdigest()

# Path, shape and MD5 of each dataset; logged to MLflow for basic data versioning.
train_meta = {"path": str(train_csv), "shape": list(train_df.shape), "md5": file_md5(train_csv)}
test_meta  = {"path": str(test_csv),  "shape": list(test_df.shape),  "md5": file_md5(test_csv)}

# -------------------------------
# Columns and target
# -------------------------------
# Prefer the clipped target when the prepared dataset provides it.
TARGET = "score_clipped" if "score_clipped" in train_df.columns else "score"

# Selected features; only those actually present in the training data are kept.
CANDIDATE_FEATURES = [
    'num_comments_capped',
    'recency_days',
    'dayofweek',
    'title_len',
    'selftext_len',
    'is_self',
    'month',
    'link_flair_text',
    'subreddit',
    'author'
]
FEATURES = [c for c in CANDIDATE_FEATURES if c in train_df.columns]

# Split by type. Numeric (including boolean) columns are scaled; everything
# else is one-hot encoded. Testing for "numeric" instead of `dtype == "object"`
# keeps text columns categorical even when pandas loads them with a dedicated
# string or categorical dtype, which would otherwise be sent to StandardScaler.
num_cols = [c for c in FEATURES if pd.api.types.is_numeric_dtype(train_df[c])]
cat_cols = [c for c in FEATURES if c not in num_cols]

# -------------------------------
# Preprocessor (embedded in every pipeline)
# -------------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=True, with_std=True), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", min_frequency=5), cat_cols)
    ]
)

# -------------------------------
# Internal train/validation split (used for HPO)
# -------------------------------
X = train_df[FEATURES].copy()
y = train_df[TARGET].astype(float).copy()

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SEED, shuffle=True
)

# -------------------------------
# Search spaces (Hyperopt)
# -------------------------------
# One search space per model family; each is handed to its own `fmin` call,
# so labels such as "n_estimators" may repeat across families.
_elasticnet_space = dict(
    alpha=hp.loguniform("alpha", math.log(1e-4), math.log(10.0)),
    l1_ratio=hp.uniform("l1_ratio", 0.0, 1.0),
)

_random_forest_space = dict(
    n_estimators=hp.quniform("n_estimators", 100, 1000, 50),
    max_depth=hp.quniform("max_depth", 4, 30, 1),
    min_samples_split=hp.quniform("min_samples_split", 2, 20, 1),
    min_samples_leaf=hp.quniform("min_samples_leaf", 1, 10, 1),
    max_features=hp.choice("max_features", ["sqrt", "log2", None]),
)

_xgboost_space = dict(
    n_estimators=hp.quniform("n_estimators", 200, 1200, 50),
    max_depth=hp.quniform("max_depth", 3, 12, 1),
    learning_rate=hp.loguniform("learning_rate", math.log(1e-3), math.log(0.3)),
    subsample=hp.uniform("subsample", 0.6, 1.0),
    colsample_bytree=hp.uniform("colsample_bytree", 0.6, 1.0),
    reg_alpha=hp.loguniform("reg_alpha", math.log(1e-8), math.log(1e-1)),
    reg_lambda=hp.loguniform("reg_lambda", math.log(1e-6), math.log(1.0)),
    min_child_weight=hp.quniform("min_child_weight", 1, 10, 1),
)

spaces = {
    "elasticnet": _elasticnet_space,
    "random_forest": _random_forest_space,
    "xgboost": _xgboost_space,
}

# -------------------------------
# Helpers MLflow: tracking resiliente + tags seguros
# -------------------------------
def _resolve_tracking_uri() -> str:
    """Pick the MLflow tracking URI from the environment.

    ``load_dotenv(override=True)`` is called first, then the first match wins:

    1. ``MLFLOW_TRACKING_URI`` when set (explicit configuration is respected).
    2. ``databricks://<profile>`` when ``DATABRICKS_CONFIG_PROFILE`` is set.
    3. ``"databricks"`` when both ``DATABRICKS_HOST`` and ``DATABRICKS_TOKEN``
       are set.
    4. A local file store under ``./mlruns`` (created if missing).

    Returns:
        The tracking URI as a string.
    """
    load_dotenv(override=True)

    env_uri = os.getenv("MLFLOW_TRACKING_URI")
    if env_uri:  # explicit configuration wins
        return env_uri

    profile = os.getenv("DATABRICKS_CONFIG_PROFILE")
    if profile:  # Databricks through a CLI profile
        return f"databricks://{profile}"

    if os.getenv("DATABRICKS_HOST") and os.getenv("DATABRICKS_TOKEN"):
        return "databricks"  # Databricks through host + token

    # Local fallback. `as_uri()` yields a well-formed file URI on every
    # platform (drive letters, backslashes and special characters included),
    # unlike concatenating "file://" with the raw path string.
    local_store = Path.cwd() / "mlruns"
    local_store.mkdir(parents=True, exist_ok=True)
    return local_store.as_uri()

def set_mlflow():
    """Point MLflow at the resolved tracking URI and select the experiment.

    Prints the active tracking URI plus which Databricks variables are set,
    creates ``EXPERIMENT_NAME`` when it does not exist yet, and finally makes
    it the active experiment.
    """
    mlflow.set_tracking_uri(_resolve_tracking_uri())

    # Diagnostics: the token itself is never printed, only whether it is set.
    print(f"üîó MLflow tracking URI: {mlflow.get_tracking_uri()}")
    print("   DATABRICKS_CONFIG_PROFILE:", os.getenv("DATABRICKS_CONFIG_PROFILE"))
    for var_name in ("DATABRICKS_HOST", "DATABRICKS_TOKEN"):
        print(f"   {var_name} set?:", bool(os.getenv(var_name)))

    # Make sure the experiment exists before selecting it.
    tracking_client = MlflowClient()
    existing = tracking_client.get_experiment_by_name(EXPERIMENT_NAME)
    if existing is not None:
        print(f"‚úÖ Experimento encontrado: {EXPERIMENT_NAME} (id={existing.experiment_id})")
    else:
        new_id = tracking_client.create_experiment(EXPERIMENT_NAME)
        print(f"üÜï Experimento creado: {EXPERIMENT_NAME} (id={new_id})")

    mlflow.set_experiment(EXPERIMENT_NAME)

def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Computed directly with NumPy rather than through
    ``mean_squared_error(..., squared=False)``: the ``squared`` argument is
    deprecated and removed in recent scikit-learn releases, so this form works
    regardless of the installed version.

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The RMSE as a Python float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

def log_dataset_meta(prefix: str):
    """Log the train and test dataset metadata as JSON artifacts.

    Writes ``<prefix>_train_meta.json`` and ``<prefix>_test_meta.json`` through
    ``mlflow.log_dict`` from the module-level ``train_meta`` / ``test_meta``.
    """
    for split_name, meta in (("train", train_meta), ("test", test_meta)):
        mlflow.log_dict(meta, f"{prefix}_{split_name}_meta.json")

def str_tags(d: dict) -> dict:
    """Return a copy of ``d`` with every key and value converted to ``str``.

    Dict and list values are serialized with ``json.dumps``; any other value
    goes through ``str``. Used to build tag dictionaries for MLflow runs.
    """
    tags = {}
    for key, value in d.items():
        if isinstance(value, (dict, list)):
            tags[str(key)] = json.dumps(value)
        else:
            tags[str(key)] = str(value)
    return tags

# -------------------------------
# Definiciones de modelos y objetivos HPO
# -------------------------------
def build_pipeline(model_key, params):
    """Build an unfitted ``Pipeline`` of preprocessing + estimator.

    Args:
        model_key: One of ``"elasticnet"``, ``"random_forest"`` or ``"xgboost"``.
        params: Mapping of hyperparameters for that family. Values are cast to
            the ``int``/``float`` types the estimators expect, because the
            search spaces produce floats for quantized parameters.

    Returns:
        A ``Pipeline`` with steps ``"prep"`` (a fresh copy of the module-level
        ``preprocessor``) and ``"model"`` (the configured estimator).

    Raises:
        ValueError: If ``model_key`` is not one of the supported families.
    """
    # Local import so this function does not depend on the file's import block.
    from sklearn.base import clone

    if model_key == "elasticnet":
        model = ElasticNet(
            alpha=float(params["alpha"]),
            l1_ratio=float(params["l1_ratio"]),
            random_state=SEED,
            max_iter=10000
        )
    elif model_key == "random_forest":
        model = RandomForestRegressor(
            n_estimators=int(params["n_estimators"]),
            max_depth=int(params["max_depth"]),
            min_samples_split=int(params["min_samples_split"]),
            min_samples_leaf=int(params["min_samples_leaf"]),
            max_features=params["max_features"],  # None / 'sqrt' / 'log2'
            n_jobs=-1,
            random_state=SEED
        )
    elif model_key == "xgboost":
        model = XGBRegressor(
            n_estimators=int(params["n_estimators"]),
            max_depth=int(params["max_depth"]),
            learning_rate=float(params["learning_rate"]),
            subsample=float(params["subsample"]),
            colsample_bytree=float(params["colsample_bytree"]),
            reg_alpha=float(params["reg_alpha"]),
            reg_lambda=float(params["reg_lambda"]),
            min_child_weight=int(params["min_child_weight"]),
            objective="reg:squarederror",
            random_state=SEED,
            n_jobs=-1,
            tree_method="hist",
        )
    else:
        raise ValueError("Modelo no soportado")

    # Pipeline does not copy its steps, so passing the shared `preprocessor`
    # object would make every pipeline hold (and refit) the same transformer.
    # Cloning gives each pipeline its own independent, unfitted preprocessor.
    return Pipeline(steps=[("prep", clone(preprocessor)), ("model", model)])

def make_objective(model_key):
    """Create the Hyperopt objective function for one model family.

    The returned callable takes a sampled ``params`` dict, trains a pipeline on
    the internal training split inside a nested MLflow run, logs tags, the
    validation RMSE, the parameters, the fitted model and the dataset
    metadata, and returns the loss dict Hyperopt minimizes.
    """
    def _to_builtin(value):
        # NumPy scalars are converted to plain Python numbers before logging.
        if isinstance(value, (np.floating,)):
            return float(value)
        if isinstance(value, (np.integer,)):
            return int(value)
        return value

    def _objective(params):
        with mlflow.start_run(run_name=f"{model_key}_trial", nested=True) as trial_run:
            # Build and train the candidate on the internal training split.
            candidate = build_pipeline(model_key, params)
            candidate.fit(X_tr, y_tr)

            # Score it on the validation split.
            val_score = rmse(y_val, candidate.predict(X_val))

            # Tags, metric and parameters.
            trial_tags = (
                ("team", "EMI"),
                ("project", "imedia"),
                ("model_family", model_key),
                ("feature_set", json.dumps(FEATURES)),
            )
            for tag_name, tag_value in trial_tags:
                mlflow.set_tag(tag_name, tag_value)
            mlflow.log_metric("rmse", float(val_score))
            mlflow.log_params({name: _to_builtin(value) for name, value in params.items()})

            # Signature and a small input example for the logged model.
            sample_inputs = X_val.head(5).copy()
            sample_outputs = candidate.predict(sample_inputs)
            mlflow.sklearn.log_model(
                sk_model=candidate,
                artifact_path="model",
                signature=infer_signature(sample_inputs, sample_outputs),
                input_example=sample_inputs
            )

            # Dataset metadata artifacts.
            log_dataset_meta(prefix=f"{model_key}")

            # Loss for Hyperopt (lower is better).
            return {
                "loss": float(val_score),
                "status": STATUS_OK,
                "run_id": trial_run.info.run_id,
            }

    return _objective

# -------------------------------
# Entrenar/HPO por modelo (3 modelos)
# -------------------------------
# Show which tracking-related environment variables are visible at this point.
# The token value is never printed, only whether it is set.
print("üì¶ Entorno:")
for _env_name in ("MLFLOW_TRACKING_URI", "DATABRICKS_CONFIG_PROFILE", "DATABRICKS_HOST"):
    print(f"  {_env_name}:", os.getenv(_env_name))
print("  DATABRICKS_TOKEN set?:", bool(os.getenv("DATABRICKS_TOKEN")))

# rmse seguro (sin 'squared')
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Both inputs are converted to float arrays; the result is the square root
    of the mean of the squared element-wise differences.
    """
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(np.square(residuals)))

# RNG compatible con Hyperopt (.integers)
def make_rstate(seed: int):
    """Return a NumPy random ``Generator`` seeded with ``seed``.

    The result is passed as ``rstate`` to Hyperopt's ``fmin``.
    """
    generator = np.random.default_rng(seed)
    return generator

# üëâ Tracking + Experiment
# Resolve the tracking URI and select/create the experiment.
set_mlflow()

# Point the model registry at "databricks" during training.
# NOTE(review): the original comment said this forces the legacy Workspace
# registry rather than Unity Catalog, yet the registration step further down
# this file switches to "databricks-uc" — confirm which registry is intended.
mlflow.set_registry_uri("databricks")
print("üîó MLflow registry URI:", mlflow.get_registry_uri())

results_summary = []  # one summary dict per model family; used later to pick champion/challenger

# Each entry: (model key, Hyperopt search space, number of HPO evaluations).
MODELS = [
    ("elasticnet", spaces["elasticnet"], 3),     
    ("random_forest", spaces["random_forest"], 3),
    ("xgboost", spaces["xgboost"], 3),
]

# Parent run for the whole experiment: one nested "HPO_<model>" run per model
# family, each containing one nested run per Hyperopt trial.
with mlflow.start_run(
    run_name="IMEDIA_EMI_AllModels",
    tags=str_tags({
        "team": "EMI",
        "project": "imedia",
        "target": TARGET,
        "metric": "rmse",
        "seed": SEED,
        "preprocessor": "ColumnTransformer(StandardScaler + OneHotEncoder)",
        "preprocessor_in_pipeline": True
    })
) as parent_run:

    parent_run_id = parent_run.info.run_id
    # Free-text context (datasets, features, target, seed, UTC date) for the run.
    mlflow.log_text(
        f"Datasets:\ntrain={train_meta}\ntest={test_meta}\n"
        f"Features={FEATURES}\nTarget={TARGET}\nSeed={SEED}\nDate={datetime.utcnow().isoformat()}Z",
        "run_context.txt"
    )

    for model_key, space, n_trials in MODELS:
        trials = Trials()
        with mlflow.start_run(run_name=f"HPO_{model_key}", nested=True) as model_parent:
            model_parent_id = model_parent.info.run_id
            objective = make_objective(model_key)

            # `trials` is handed to fmin so the per-trial results (loss, status
            # and the MLflow run_id returned by the objective) stay available
            # after the search; previously the object was created but unused.
            # The RNG comes from make_rstate (a NumPy Generator).
            best = fmin(
                fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=n_trials,
                trials=trials,
                rstate=make_rstate(SEED),
                show_progressbar=False
            )

            # fmin returns raw sampled values: quantized parameters come back
            # as floats and hp.choice parameters as the index of the option.
            if model_key == "random_forest":
                best["n_estimators"] = int(best["n_estimators"])
                best["max_depth"] = int(best["max_depth"])
                best["min_samples_split"] = int(best["min_samples_split"])
                best["min_samples_leaf"] = int(best["min_samples_leaf"])
                # Map the hp.choice index back to the actual option; the list
                # must stay in the same order as in the search space.
                max_feats_choices = ["sqrt", "log2", None]
                if isinstance(best.get("max_features"), (int, np.integer)):
                    best["max_features"] = max_feats_choices[int(best["max_features"])]

            if model_key == "xgboost":
                best["n_estimators"] = int(best["n_estimators"])
                best["max_depth"] = int(best["max_depth"])
                best["min_child_weight"] = int(best["min_child_weight"])

            mlflow.log_params({f"best_{model_key}_{k}": v for k, v in best.items()})

            # Refit with the best parameters on the internal training split and
            # score on the validation split.
            pipe_best = build_pipeline(model_key, best)
            pipe_best.fit(X_tr, y_tr)
            val_rmse = rmse(y_val, pipe_best.predict(X_val))
            mlflow.log_metric("val_rmse_best", float(val_rmse))

            # Snapshot of the best HPO candidate for this family.
            x_example = X_val.head(5)
            y_example = pipe_best.predict(x_example)
            sig = infer_signature(x_example, y_example)
            mlflow.sklearn.log_model(
                pipe_best,
                artifact_path=f"{model_key}_best_snapshot",
                signature=sig,
                input_example=x_example
            )

            results_summary.append({
                "model_key": model_key,
                "parent_run": model_parent_id,
                "best_params": best,
                "val_rmse": float(val_rmse)
            })

# -------------------------------
# Explicit selection by RMSE (single metric)
# -------------------------------
results_df = pd.DataFrame(results_summary)
results_df = results_df.sort_values("val_rmse", ascending=True).reset_index(drop=True)
print("=== Resultados por modelo (ordenados por RMSE asc) ===")
print(results_df)

# Row 0 is the champion candidate; row 1 (when present) is the challenger.
best_model_key = results_df.loc[0, "model_key"]
best_params = results_df.loc[0, "best_params"]
second_model_key = None if len(results_df) <= 1 else results_df.loc[1, "model_key"]

# -------------------------------
# Fit the FINAL (best) model on the whole train_df and evaluate on test_df
# -------------------------------
X_train_full, y_train_full = train_df[FEATURES].copy(), train_df[TARGET].astype(float).copy()
X_test, y_test = test_df[FEATURES].copy(), test_df[TARGET].astype(float).copy()

final_pipe = build_pipeline(best_model_key, best_params)

# (Re)apply tracking and registry configuration before the final run.
set_mlflow()
mlflow.set_registry_uri("databricks")

best_test_rmse = None  # filled in by the final run; used in the comparison table

# Final run for the selected model: fit on the full training data, evaluate on
# the test set, and log the pipeline, the preprocessor and a model description.
with mlflow.start_run(
    run_name=f"FINAL_{best_model_key}_on_full_train",
    tags=str_tags({
        "team": "EMI",
        "project": "imedia",
        "stage": "final_fit",
        "selected_by": "rmse ASC",
        "seed": SEED
    })
) as final_run:
    final_pipe.fit(X_train_full, y_train_full)
    test_pred = final_pipe.predict(X_test)
    test_rmse = rmse(y_test, test_pred)
    best_test_rmse = float(test_rmse)
    mlflow.log_metric("test_rmse", best_test_rmse)
    mlflow.log_params({f"final_{best_model_key}_{k}": v for k, v in best_params.items()})
    mlflow.set_tag("final_model_family", best_model_key)
    mlflow.set_tag("metric_selection", "rmse ASC (lower is better)")
    mlflow.set_tag("preprocessor_in_pipeline", "true")

    # Signature and input example
    x_example = X_test.head(5)
    y_example = final_pipe.predict(x_example)
    signature = infer_signature(x_example, y_example)

    # Log the complete pipeline (preprocessor included) to MLflow
    mlflow.sklearn.log_model(
        sk_model=final_pipe,
        artifact_path="model",
        signature=signature,
        input_example=x_example
    )

    # Also log the fitted PREPROCESSOR on its own, in MLflow and under the
    # local PREPROC_LOCAL_DIR folder.
    fitted_preprocessor = final_pipe.named_steps["prep"]

    # a) As an MLflow artifact (kept alongside the tracked run)
    mlflow.sklearn.log_model(
        sk_model=fitted_preprocessor,
        artifact_path="preprocessor"
    )

    # b) LOCAL save in MLflow model format; the UTC timestamp in the directory
    #    name keeps successive executions from writing to the same path.
    ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    local_save_dir = PREPROC_LOCAL_DIR / f"{best_model_key}_preprocessor_{ts}"
    mlflow.sklearn.save_model(sk_model=fitted_preprocessor, path=str(local_save_dir))
    print(f"üíæ Preprocesador guardado localmente en: {local_save_dir.resolve()}")

    # Model description, logged as a JSON text artifact for the registry entry
    model_description = {
        "team": "EMI",
        "project": "imedia",
        "target": TARGET,
        "primary_metric": "rmse",
        "test_rmse": best_test_rmse,
        "features": FEATURES,
        "preprocessor": "ColumnTransformer(StandardScaler + OneHotEncoder)",
        "preprocessor_in_pipeline": True,
        "datasets": {"train": train_meta, "test": test_meta},
        "date": datetime.utcnow().strftime("%Y-%m-%d"),
        "seeds": {"global": SEED, "models": SEED, "hyperopt": SEED},
        "changelog": "Registro autom√°tico con HPO (Hyperopt) para 3 familias; selecci√≥n por RMSE m√≠nimo.",
        "responsibles": ["Equipo EMI"]
    }
    mlflow.log_text(json.dumps(model_description, indent=2), "model_description.json")

    # URI of the logged pipeline; used below when registering the champion.
    best_run_id = final_run.info.run_id
    best_run_uri = f"runs:/{best_run_id}/model"

# -------------------------------
# Register in the Model Registry (Unity Catalog)
# -------------------------------
# Switch the registry URI to "databricks-uc" for registration.
mlflow.set_registry_uri("databricks-uc")
print("üîó MLflow registry URI:", mlflow.get_registry_uri())

client = MlflowClient()

# Registered model name in catalog.schema.model form
MODEL_NAME_REG = "workspace.default.EMI_imedia_model"  # <= adjust catalog/schema if you use different ones

# Register the champion (the final pipeline logged in the run above)
result = mlflow.register_model(model_uri=best_run_uri, name=MODEL_NAME_REG)
champ_version = result.version

# Optional challenger: the second-best model by validation RMSE, if any
challenger_version = None
challenger_test_rmse = None  # used in the comparison table
if second_model_key is not None:
    second_params = results_df.loc[1, "best_params"]
    second_pipe = build_pipeline(second_model_key, second_params)

    set_mlflow()  # re-apply tracking configuration for this new run
    mlflow.set_registry_uri("databricks-uc")

    with mlflow.start_run(
        run_name=f"FINAL_{second_model_key}_challenger",
        tags=str_tags({
            "team": "EMI",
            "project": "imedia",
            "stage": "final_fit_challenger",
            "selected_by": "second_best_val_rmse",
            "seed": SEED
        })
    ) as chal_run:
        # Same procedure as the champion: fit on full train, score on test.
        second_pipe.fit(X_train_full, y_train_full)
        chal_pred = second_pipe.predict(X_test)
        chal_rmse = rmse(y_test, chal_pred)
        challenger_test_rmse = float(chal_rmse)
        mlflow.log_metric("test_rmse", challenger_test_rmse)
        mlflow.log_params({f"final_{second_model_key}_{k}": v for k, v in second_params.items()})
        mlflow.set_tag("final_model_family", second_model_key)
        mlflow.set_tag("metric_selection", "rmse ASC (second best)")
        mlflow.set_tag("preprocessor_in_pipeline", "true")

        x_example = X_test.head(5)
        y_example = second_pipe.predict(x_example)
        sig2 = infer_signature(x_example, y_example)
        mlflow.sklearn.log_model(
            second_pipe,
            artifact_path="model",
            signature=sig2,
            input_example=x_example
        )

        # Also store the challenger's fitted preprocessor in MLflow and locally
        fitted_preprocessor_chal = second_pipe.named_steps["prep"]

        # a) MLflow artifact
        mlflow.sklearn.log_model(
            sk_model=fitted_preprocessor_chal,
            artifact_path="preprocessor"
        )

        # b) LOCAL save (timestamped directory under PREPROC_LOCAL_DIR)
        ts2 = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        local_save_dir2 = PREPROC_LOCAL_DIR / f"{second_model_key}_preprocessor_{ts2}"
        mlflow.sklearn.save_model(sk_model=fitted_preprocessor_chal, path=str(local_save_dir2))
        print(f"üíæ Preprocesador (challenger) guardado localmente en: {local_save_dir2.resolve()}")

        # Unlike the champion, the challenger is registered while its run is
        # still active (inside the `with` block).
        chal_run_id = chal_run.info.run_id
        chal_run_uri = f"runs:/{chal_run_id}/model"
        res2 = mlflow.register_model(model_uri=chal_run_uri, name=MODEL_NAME_REG)
        challenger_version = res2.version

# Point the "champion" / "challenger" aliases at the versions just registered
client.set_registered_model_alias(name=MODEL_NAME_REG, alias="champion",   version=champ_version)
if challenger_version is not None:
    client.set_registered_model_alias(name=MODEL_NAME_REG, alias="challenger", version=challenger_version)

# Console summary of what was registered and with which data
print("\n‚úÖ Registro completo en Model Registry (Unity Catalog)")
print(f"  Champion  -> {MODEL_NAME_REG}@champion (v{champ_version})")
if challenger_version is not None:
    print(f"  Challenger -> {MODEL_NAME_REG}@challenger (v{challenger_version})")
print("\nExperimento MLflow:", EXPERIMENT_NAME)
print("M√©trica √∫nica de selecci√≥n: RMSE (menor es mejor)")
print("Target:", TARGET)
print("Features:", FEATURES)
print("Train shape:", tuple(train_df.shape), "| Test shape:", tuple(test_df.shape))

# -------------------------------
# Comparison table of the primary metric across models
# -------------------------------
# One "val" row per model family, plus a "test" row for the champion and, when
# one was trained, for the challenger.
comparison_rows = [
    {"model_key": row["model_key"], "split": "val", "rmse": float(row["val_rmse"])}
    for _, row in results_df.iterrows()
]
if best_test_rmse is not None:
    comparison_rows.append(
        {"model_key": best_model_key, "split": "test", "rmse": best_test_rmse}
    )
if 'challenger_test_rmse' in locals() and challenger_test_rmse is not None:
    comparison_rows.append(
        {"model_key": second_model_key, "split": "test", "rmse": challenger_test_rmse}
    )

comparison_df = pd.DataFrame(comparison_rows)
comparison_df = comparison_df.sort_values(["split", "rmse"]).reset_index(drop=True)

print("\n=== Tabla comparativa RMSE (menor es mejor) ===")
print(comparison_df)

# Save the table next to the notebook ...
csv_path = Path("model_metric_comparison.csv")
comparison_df.to_csv(csv_path, index=False)
print(f"\nüìÑ Tabla comparativa guardada en: {csv_path.resolve()}")

# ... and attach it to a dedicated MLflow run.
set_mlflow()
with mlflow.start_run(run_name="METRICS_COMPARISON") as cmp_run:
    mlflow.log_text(comparison_df.to_csv(index=False), "model_metric_comparison.csv")
    mlflow.set_tag("note", "Comparativa de RMSE entre modelos (val/test).")


üì¶ Entorno:
  MLFLOW_TRACKING_URI: None
  DATABRICKS_CONFIG_PROFILE: None
  DATABRICKS_HOST: https://dbc-5922e233-b716.cloud.databricks.com/
  DATABRICKS_TOKEN set?: True
üîó MLflow tracking URI: databricks
   DATABRICKS_CONFIG_PROFILE: None
   DATABRICKS_HOST set?: True
   DATABRICKS_TOKEN set?: True
‚úÖ Experimento encontrado: /Users/marianasgg19@gmail.com/EMI/imedia/experiment (id=2410509746257126)
üîó MLflow registry URI: databricks


2025/11/24 01:47:14 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


üèÉ View run elasticnet_trial at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/5bf089dbd97c4ebaafb4ce6e45b39d3b
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126




üèÉ View run elasticnet_trial at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/f2432ed67b274690aaeb42384baf73ea
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126




üèÉ View run elasticnet_trial at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/9773690648de4c4a980f833b0ec202e1
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126




üèÉ View run HPO_elasticnet at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/92e9a8549e664489a9c54329aebc5f2b
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126




üèÉ View run random_forest_trial at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/f59018912f464c3b8133f8a0f5f335ed
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126




üèÉ View run random_forest_trial at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/f116654b304e482b84d424615f1d2c1b
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126




üèÉ View run random_forest_trial at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/b772f4e38c3d43a0be9093f3dd6cc7ee
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126




üèÉ View run HPO_random_forest at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/e42c7e6ba363438cb511eb4141527ca9
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126




üèÉ View run xgboost_trial at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/e6997ded03274bccac6a1e8aa9199c04
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126




üèÉ View run xgboost_trial at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/6d678312f49d496697cab0a25c9e6dee
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126




üèÉ View run xgboost_trial at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/21fd11f96b224deb9fe3880806680012
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126




üèÉ View run HPO_xgboost at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/cd3982fea75a43b8a86ec4119bed2b7a
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126
üèÉ View run IMEDIA_EMI_AllModels at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/16fd3e23ed6545b3a41ed43418323f19
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126
=== Resultados por modelo (ordenados por RMSE asc) ===
       model_key                        parent_run  \
0  random_forest  e42c7e6ba363438cb511eb4141527ca9   
1        xgboost  cd3982fea75a43b8a86ec4119bed2b7a   
2     elasticnet  92e9a8549e664489a9c54329aebc5f2b   

                                         best_params    val_rmse  
0  {'max_depth': 19, 'max_features': None, 'min_s...   95.584186  
1  {'colsample_bytree': 0.7295996142861736, 'lear...   98.757263  
2  {'alpha': 0.1



üíæ Preprocesador guardado localmente en: /Users/msgarcia/Desktop/School/proyecto_2/IMEDIA_Project_v2/preprocesador/random_forest_preprocessor_20251124_074843
üèÉ View run FINAL_random_forest_on_full_train at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/d44c2e77ac0f4a28a9d8d0ec9618c9d1
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126
üîó MLflow registry URI: databricks-uc


Registered model 'workspace.default.EMI_imedia_model' already exists. Creating a new version of this model...
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:02<00:00,  3.37it/s]
Uploading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [00:07<00:00,  1.06it/s]
Created version '5' of model 'workspace.default.emi_imedia_model'.


üîó MLflow tracking URI: databricks
   DATABRICKS_CONFIG_PROFILE: None
   DATABRICKS_HOST set?: True
   DATABRICKS_TOKEN set?: True
‚úÖ Experimento encontrado: /Users/marianasgg19@gmail.com/EMI/imedia/experiment (id=2410509746257126)


Registered model 'workspace.default.EMI_imedia_model' already exists. Creating a new version of this model...


üíæ Preprocesador (challenger) guardado localmente en: /Users/msgarcia/Desktop/School/proyecto_2/IMEDIA_Project_v2/preprocesador/xgboost_preprocessor_20251124_074909


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:01<00:00,  4.03it/s]
Uploading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [00:03<00:00,  2.55it/s]
Created version '6' of model 'workspace.default.emi_imedia_model'.


üèÉ View run FINAL_xgboost_challenger at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126/runs/285dd5ccabe5424282bc23d015a5a0ad
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/2410509746257126

‚úÖ Registro completo en Model Registry (Unity Catalog)
  Champion  -> workspace.default.EMI_imedia_model@champion (v5)
  Challenger -> workspace.default.EMI_imedia_model@challenger (v6)

Experimento MLflow: /Users/marianasgg19@gmail.com/EMI/imedia/experiment
M√©trica √∫nica de selecci√≥n: RMSE (menor es mejor)
Target: score_clipped
Features: ['num_comments_capped', 'recency_days', 'dayofweek', 'title_len', 'selftext_len', 'is_self', 'month', 'link_flair_text', 'subreddit', 'author']
Train shape: (1233, 11) | Test shape: (309, 11)

=== Tabla comparativa RMSE (menor es mejor) ===
       model_key split        rmse
0  random_forest  test   84.697971
1        xgboost  test   85.303490
2  random_forest   val   95.584186
3        

# 2) 📦 Datos y Data Readiness

**Versionado y tamaños (train/test)**  
- Rutas: `../data/processed/train_posts_clean.csv` y `../data/processed/test_posts_clean.csv`.  
- Tamaños usados en entrenamiento/final: **Train (1233, 11)** | **Test (309, 11)** (según la salida de la ejecución mostrada arriba).  
- Integridad: en cada run se loggea `md5` y `shape` de ambos datasets como artifacts (`*_train_meta.json`, `*_test_meta.json`).

**Target & métricas**  
- **Target:** `score_clipped` (recorta outliers vía IQR para estabilizar la varianza).  
- **Métrica principal:** **RMSE** (menor es mejor).  
- Métrica reportada en **val** durante HPO y en **test** para el modelo final.

**Selección de variables (features)**  
Se parte de un set curado por EDA y de ingeniería ligera:  
- Numéricas: `num_comments_capped`, `recency_days`, `dayofweek`, `title_len`, `selftext_len`, `is_self`, `month`  
- Categóricas: `link_flair_text`, `subreddit`, `author`  
*(el código filtra automáticamente por columnas presentes en el dataset).*

**Prevención de data leakage**  
- Se **excluyen** columnas con fuga: `post_id`, `url`, `permalink`, `thumbnail`.  
- El target usa `score_clipped` (no derivado de variables futuras).  
- La evaluación final se hace en **test** separado.

**Tabla breve de features (tipo y razón)**

| Feature              | Tipo      | Razón/hipótesis de valor |
|---|---|---|
| `num_comments_capped` | Numérica  | Señal de interacción temprana; el cap al p99 reduce la varianza extrema. |
| `recency_days`       | Numérica  | Efecto de frescura del post. |
| `dayofweek`          | Numérica (entero) | Patrones de consumo por día. |
| `month`              | Numérica (entero) | Estacionalidad y tendencias. |
| `title_len`          | Numérica  | La longitud del título correlaciona con CTR/engagement. |
| `selftext_len`       | Numérica  | Carga informativa del contenido. |
| `is_self`            | Binaria   | Diferencia entre link-post y text-post. |
| `link_flair_text`    | Categórica| Tema/contexto del post (moderación/comunidad). |
| `subreddit`          | Categórica| Efecto de comunidad. |
| `author`             | Categórica| Efecto autor (historial/credibilidad). |

---

# 3) 🧱 Preprocesamiento (ColumnTransformer/Pipeline)

**Definición**  
- `ColumnTransformer` con:  
  - **Numéricas:** `StandardScaler(with_mean=True, with_std=True)` sobre columnas numéricas.  
  - **Categóricas:** `OneHotEncoder(handle_unknown="ignore", min_frequency=5)` sobre `object`.

**Integración y versionado**  
- El **preprocessor está integrado** en cada `Pipeline` (`("prep", preprocessor) → ("model", estimador)`), por lo que **viaja junto al modelo**.  
- Además, se **versiona por separado**:  
  - Se loggea como **artifact MLflow** en `preprocessor/`.  
  - Se guarda **local** en `../preprocesador/<modelo>_preprocessor_<timestamp>/` (directorio con `MLmodel` + pickles).

**Reproducibilidad**  
- Semilla global `SEED=42` para split y modelos; `Hyperopt` con `default_rng(SEED)`.  
- Tracking determinístico: se registran rutas, shapes, `md5`, lista de `FEATURES` y `TARGET` en `run_context.txt`.

**Evidencia mínima**  
- En los runs `FINAL_*` aparecen artifacts `model/` (pipeline completo) y `preprocessor/` (solo transformador).  
- Mensajes de consola confirman los guardados locales en `../preprocesador/...`.

---

# 4) 🧪 Experimentos de Modelado (≥3 modelos)

**Familias y HPO**  
- Modelos: **ElasticNet**, **RandomForestRegressor**, **XGBRegressor**.  
- Para cada familia:  
  - `HPO_<modelo>` con **runs anidados** (`nested=True`).  
  - Se loggean **parámetros**, **RMSE (val)**, **snapshots** (`<modelo>_best_snapshot`) y **metadatos de datasets**.  

**Evidencia (capturas MLflow)**  
- Se observan los runs `HPO_elasticnet`, `HPO_random_forest`, `HPO_xgboost` y sus `*_trial`.  
- En la tabla de experimentos se comparan métricas y parámetros por run.

---

# 5) 🎯 Tuning con Hyperopt + MLflow (Databricks)

- **Espacios de búsqueda** definidos por familia (p. ej., `n_estimators`, `max_depth`, `l1_ratio`, etc.).  
- **Objetivo:** minimizar **RMSE** en `val`.  
- **Estructura de runs:**  
  - `IMEDIA_EMI_AllModels` (parent) → `HPO_<modelo>` (parent por familia) → `<modelo>_trial` (runs hijos).  
- **Artifacts clave:** `run_context.txt`, snapshots de los mejores pipelines por familia y metadatos de datasets.

**Evidencia mínima**  
- Pantalla de MLflow muestra **runs anidados** y **parámetros**; se ve la **comparativa** de RMSE por familia.

---

# 6) 🏆 Selección del Mejor Modelo

**Criterio**  
- Orden explícito por **RMSE (ascendente)** sobre la validación: `results_df.sort_values("val_rmse")`.

**Resultado**  
- Ranking en `val`: **Random Forest (210.75)** < ElasticNet (216.74) < XGBoost (220.67).  
- Se reentrena el **mejor** con todo el train y se evalúa en **test**.

**Evidencia de “mejor en contexto”**  
- En **test**, **Random Forest** consigue **RMSE ≈ 171.82**, muy por debajo del segundo (**ElasticNet ≈ 248.97**).  
- Justificación breve: RF captura no linealidades e interacciones típicas entre señales de interacción (`num_comments_capped`) y contexto (`subreddit`, `flair`), mejor que modelos lineales; XGB quedó atrás con este tamaño y sparsidad.

**Identificador de best run**  
- Run final: `FINAL_random_forest_on_full_train` (véase MLflow UI en el experimento).  
- Tabla comparativa exportada: `model_metric_comparison.csv`.

---

# 7) 🗂️ Registro en el Model Registry

**Nomenclatura**  
- Modelo en UC: **`workspace.default/EMI_imedia_model`**.

**Qué se registra**  
- **Pipeline completo** (incluye `preprocessor` dentro del `Pipeline`).  
- **Alias:**  
  - `@champion` → **Version 3** (Random Forest, **test_rmse ≈ 171.82**).  
  - `@challenger` → **Version 4** (ElasticNet, **test_rmse ≈ 248.97**).

**Documentación del modelo**  
- En `model_description.json` se incluyen: datos, métrica, fecha, features, seeds, y changelog.  
- **Preprocessor** adicionalmente guardado como artifact `preprocessor/` y **local** en `../preprocesador/…`.

**Evidencia mínima**  
- Capturas de **Catalog Explorer** muestran `emi_imedia_model` con **Version 3 (@champion)** y **Version 4 (@challenger)**.  
- En el listado del experimento se visualizan `FINAL_*` y la **métrica de test** de cada final.

---

## 📊 Tabla comparativa (RMSE ↓)

| model_key     | split | RMSE |
|---|---:|---:|
| random_forest | test | **171.82** |
| elasticnet    | test | 248.97 |
| random_forest | val  | 210.75 |
| elasticnet    | val  | 216.74 |
| xgboost       | val  | 220.67 |

> **Conclusión:** Se promueve **`workspace.default/EMI_imedia_model@champion (v3)`**. El **preprocessor** queda **dentro** del pipeline registrado y **aparte** como artifact + copia **local** en `../preprocesador/`.


In [6]:
# ============================
# Chunk 1 · Setup + Data + MLflow + dirs para artefactos
# ============================
import os, json, warnings
warnings.filterwarnings("ignore")

from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

from dotenv import load_dotenv

# Sklearn
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.neural_network import MLPClassifier

# Transformers / Sentence-Transformers
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer

# MLflow
import mlflow
from mlflow import MlflowClient
from mlflow.models.signature import infer_signature

# -----------------------
# Configuration
# -----------------------
# One seed shared by NumPy and PyTorch so repeated runs are comparable.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# MLflow experiment path and the name the trained MLP is registered under.
EXPERIMENT_NAME = "/Users/marianasgg19@gmail.com/EMI/imedia/Sentiment_BERT_MLP"
DEEP_MODEL_NAME = "workspace.default.imedia_sentiment_mlp_transformer"

# Input data location (relative to the notebook's working directory).
DATA_PROCESSED = Path("../data/processed")

# Artifact folders: created up front, together with any missing parents;
# nothing happens when they already exist.
EMBEDDINGS_DIR = Path("../embeddings")
PREPROC_DIR = Path("../preprocesador")
for _artifact_dir in (EMBEDDINGS_DIR, PREPROC_DIR):
    _artifact_dir.mkdir(parents=True, exist_ok=True)

# -----------------------
# Data loading
# -----------------------
FEATURE_COL = "clean_text"
TARGET_COL  = "sentiment"

# The three splits are stored as separate parquet files under DATA_PROCESSED.
train_df, val_df, test_df = (
    pd.read_parquet(DATA_PROCESSED / parquet_name)
    for parquet_name in (
        "sentiment_train.parquet",
        "sentiment_val.parquet",
        "sentiment_test.parquet",
    )
)


def _texts_and_labels(frame):
    """Return ``(texts, labels)`` for one split.

    ``texts`` is a plain list of ``str`` taken from ``FEATURE_COL`` and
    ``labels`` is an integer array taken from ``TARGET_COL``.
    """
    # NOTE(review): astype(str) turns missing values into literal strings
    # (e.g. "nan") — confirm the parquet files contain no null texts.
    texts = frame[FEATURE_COL].astype(str).tolist()
    labels = frame[TARGET_COL].astype(int).values
    return texts, labels


X_train, y_train = _texts_and_labels(train_df)
X_val, y_val = _texts_and_labels(val_df)
X_test, y_test = _texts_and_labels(test_df)

# -----------------------
# MLflow tracking
# -----------------------
def _resolve_tracking_uri():
    """Pick the MLflow tracking URI from the environment, else use a local store.

    Reloads ``.env`` (``override=True``) and then returns the first option
    that is configured, in this order:

    1. ``MLFLOW_TRACKING_URI`` unchanged.
    2. ``databricks://<profile>`` when ``DATABRICKS_CONFIG_PROFILE`` is set.
    3. ``"databricks"`` when both ``DATABRICKS_HOST`` and
       ``DATABRICKS_TOKEN`` are set.
    4. A ``file:`` URI for ``./mlruns`` under the current working directory;
       the directory is created when it does not exist yet.

    Returns:
        str: The tracking URI to pass to ``mlflow.set_tracking_uri``.
    """
    load_dotenv(override=True)
    env_uri = os.getenv("MLFLOW_TRACKING_URI")
    profile = os.getenv("DATABRICKS_CONFIG_PROFILE")
    host = os.getenv("DATABRICKS_HOST")
    token = os.getenv("DATABRICKS_TOKEN")

    if env_uri:
        return env_uri
    if profile:
        return f"databricks://{profile}"
    if host and token:
        return "databricks"

    local = Path.cwd() / "mlruns"
    local.mkdir(exist_ok=True)
    # Path.as_uri() yields a well-formed, percent-encoded file URI. Gluing
    # "file://" onto the path string produces an invalid URI for Windows
    # drive paths and for directories containing spaces, '#', '?' or '%'.
    return local.as_uri()

def set_mlflow():
    """Point MLflow at the resolved tracking URI and activate the experiment.

    The experiment named by ``EXPERIMENT_NAME`` is created first when the
    tracking server does not know it yet, then made the active experiment.
    """
    tracking_uri = _resolve_tracking_uri()
    mlflow.set_tracking_uri(tracking_uri)

    # The client is built after the tracking URI has been set.
    known_experiment = MlflowClient().get_experiment_by_name(EXPERIMENT_NAME)
    if known_experiment is None:
        mlflow.create_experiment(EXPERIMENT_NAME)

    mlflow.set_experiment(EXPERIMENT_NAME)


# Configure tracking once, before any run is started in this notebook.
set_mlflow()


In [7]:
# ============================
# Chunk 2 · Baselines BERT (2 modelos) + Preprocesador placeholder NTBK
# ============================
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# -----------------------
# BERT #1
# -----------------------
BERT1_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"
# Tokenizer and classification head are loaded from the same checkpoint.
tok_bert1 = AutoTokenizer.from_pretrained(BERT1_NAME)
mdl_bert1 = AutoModelForSequenceClassification.from_pretrained(BERT1_NAME)
# Move the weights to the selected device.
mdl_bert1 = mdl_bert1.to(device)

def predict_bert1(texts):
    """Label each text 0 or 1 using the BERT #1 checkpoint.

    Every text is tokenized (with truncation) and scored on its own. The
    arg-max class index is then collapsed to a binary label: indices 0-2
    become 0 and any higher index becomes 1.

    Args:
        texts: Iterable of strings to classify.

    Returns:
        np.ndarray: One 0/1 label per input text, in input order.
    """
    def _binary_label(text):
        encoded = tok_bert1(text, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():  # inference only, no gradients needed
            raw_scores = mdl_bert1(**encoded).logits
        class_probs = softmax(raw_scores.cpu().numpy(), axis=1)
        top_class = np.argmax(class_probs, axis=1)[0]
        # NOTE(review): presumably the classes are 1-5 star ratings, so the
        # middle rating counts as negative here — confirm with the model card.
        return 1 if top_class > 2 else 0

    return np.array([_binary_label(text) for text in texts])

# -----------------------
# BERT #2
# -----------------------
BERT2_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
# Tokenizer and classification head are loaded from the same checkpoint.
tok_bert2 = AutoTokenizer.from_pretrained(BERT2_NAME)
mdl_bert2 = AutoModelForSequenceClassification.from_pretrained(BERT2_NAME)
# Move the weights to the selected device.
mdl_bert2 = mdl_bert2.to(device)

def predict_bert2(texts):
    """Label each text with the arg-max class index of the BERT #2 checkpoint.

    Every text is tokenized (with truncation) and scored on its own; the
    index of the highest-probability class is used directly as the label.

    Args:
        texts: Iterable of strings to classify.

    Returns:
        np.ndarray: One integer class index per input text, in input order.
    """
    def _top_class(text):
        encoded = tok_bert2(text, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():  # inference only, no gradients needed
            raw_scores = mdl_bert2(**encoded).logits
        class_probs = softmax(raw_scores.cpu().numpy(), axis=1)
        # NOTE(review): assumes the checkpoint's class indices already match
        # the dataset's label encoding — confirm with the model card.
        return int(np.argmax(class_probs, axis=1)[0])

    return np.array([_top_class(text) for text in texts])

# -----------------------
# MLflow evaluation helper
# -----------------------
def eval_and_log_bert(model_name, predict_fn):
    """Score a pretrained classifier on val/test and log the results to MLflow.

    Opens a run named ``baseline_<model_name>_NTBK``, predicts on the
    module-level ``X_val`` and ``X_test``, logs two params and four metrics
    (accuracy and F1 per split, both scorers called with their defaults) and
    uploads a placeholder "preprocessor" folder as a run artifact.

    Args:
        model_name: Hugging Face model identifier. Logged as
            ``hf_model_name`` and, with "/" replaced by "_", used in the
            placeholder folder name.
        predict_fn: Callable that maps a list of texts to an array of
            predicted labels.
    """
    with mlflow.start_run(run_name=f"baseline_{model_name}_NTBK"):
        val_pred = predict_fn(X_val)
        test_pred = predict_fn(X_test)

        for param_key, param_value in (
            ("model_family", "bert_pretrained_baseline"),
            ("hf_model_name", model_name),
        ):
            mlflow.log_param(param_key, param_value)

        # Each metric is computed right before it is logged, in this order.
        scoring_plan = (
            ("val_accuracy", accuracy_score, y_val, val_pred),
            ("val_f1", f1_score, y_val, val_pred),
            ("test_accuracy", accuracy_score, y_test, test_pred),
            ("test_f1", f1_score, y_test, test_pred),
        )
        for metric_key, scorer, y_true, y_hat in scoring_plan:
            mlflow.log_metric(metric_key, scorer(y_true, y_hat))

        # -----------------------
        # NTBK preprocessor placeholder
        # -----------------------
        # No fitted transformer exists for these models, so a text marker is
        # written and its folder is uploaded instead.
        safe_name = model_name.replace('/', '_')
        placeholder_dir = PREPROC_DIR / f"preprocessor_{safe_name}_NTBK"
        placeholder_dir.mkdir(parents=True, exist_ok=True)
        marker_file = placeholder_dir / "placeholder.txt"
        marker_file.write_text("No sklearn preprocessor (BERT model). NTBK")

        mlflow.log_artifact(placeholder_dir)

# Run both pretrained baselines, BERT #1 first.
for _hf_name, _predictor in (
    (BERT1_NAME, predict_bert1),
    (BERT2_NAME, predict_bert2),
):
    eval_and_log_bert(_hf_name, _predictor)


üèÉ View run baseline_nlptown/bert-base-multilingual-uncased-sentiment_NTBK at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1690980704956355/runs/130bfa8b8c2d4ec18e57f1639c9d4829
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1690980704956355
üèÉ View run baseline_distilbert-base-uncased-finetuned-sst-2-english_NTBK at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1690980704956355/runs/8d81399bad86477c8f0b74a50e367d59
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1690980704956355


In [8]:
# ============================
# Chunk 3 · SentenceTransformer embeddings + MLPClassifier + Artefactos NTBK
# ============================

# Sentence-embedding checkpoint; its name is also embedded in the output
# file names and stored in the metadata further down.
ST_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Instantiate the encoder once; the same instance encodes train, val and test.
st_model = SentenceTransformer(ST_MODEL_NAME)

def embed_texts(model, texts, batch_size=32):
    """Encode ``texts`` with ``model`` and return whatever ``model.encode`` returns.

    Args:
        model: Object exposing ``encode(sentences, batch_size=...,
            show_progress_bar=..., convert_to_numpy=...)``.
        texts: Iterable of strings; it is materialized into a list first.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        The result of ``model.encode``, requested as NumPy output
        (``convert_to_numpy=True``) with the progress bar enabled.
    """
    sentences = list(texts)
    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": True,
        "convert_to_numpy": True,
    }
    return model.encode(sentences, **encode_options)

# Generate embeddings: the three splits are encoded one after another
# (train, then val, then test) with the same encoder.
X_train_emb, X_val_emb, X_test_emb = (
    embed_texts(st_model, split_texts)
    for split_texts in (X_train, X_val, X_test)
)

# -----------------------
# Save embeddings with the NTBK suffix
# -----------------------
from datetime import timezone  # only needed for the UTC-aware timestamp below

# datetime.utcnow() is deprecated since Python 3.12 and returns a naive
# datetime; an aware "now" in UTC formats to exactly the same string.
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S")
emb_file = EMBEDDINGS_DIR / f"{ST_MODEL_NAME.replace('/','_')}_embeddings_NTBK_{timestamp}.npy"
# Train, val and test vectors are stacked, in that order, into one array.
np.save(emb_file, np.vstack([X_train_emb, X_val_emb, X_test_emb]))

# Sidecar metadata: per-split row counts (needed to slice the stacked array
# back into splits) plus the shape of the training embeddings only.
metadata = {
    "model": ST_MODEL_NAME,
    "sizes": {"train": len(y_train), "val": len(y_val), "test": len(y_test)},
    "shape": X_train_emb.shape,
    "timestamp": timestamp
}
meta_file = EMBEDDINGS_DIR / f"{ST_MODEL_NAME.replace('/','_')}_metadata_NTBK_{timestamp}.json"
# Serialized through pandas so the shape tuple is written as a JSON array.
pd.Series(metadata).to_json(meta_file)

# -----------------------
# Train MLP
# -----------------------
# Hyperparameters collected in one place: a single hidden layer of 256 ReLU
# units, trained with Adam in mini-batches of 256, capped at 10 iterations.
# The loss is printed per iteration (verbose) and the seed fixes the
# estimator's random state.
_mlp_settings = dict(
    hidden_layer_sizes=(256,),
    activation="relu",
    solver="adam",
    batch_size=256,
    learning_rate_init=1e-3,
    max_iter=10,
    random_state=SEED,
    verbose=True,
)
mlp_clf = MLPClassifier(**_mlp_settings)

# Train, evaluate and register the MLP inside a single MLflow run so that
# metrics, the placeholder artifact and the model are all attached to it.
with mlflow.start_run(run_name="mlp_transformer_embeddings_NTBK") as run:
    # Fit on the training embeddings only.
    mlp_clf.fit(X_train_emb, y_train)

    # Hard label predictions for the two held-out splits.
    y_val_pred  = mlp_clf.predict(X_val_emb)
    y_test_pred = mlp_clf.predict(X_test_emb)

    # Accuracy and F1 (scorer defaults) on validation and test.
    mlflow.log_metric("val_accuracy",  accuracy_score(y_val, y_val_pred))
    mlflow.log_metric("val_f1",        f1_score(y_val,  y_val_pred))
    mlflow.log_metric("test_accuracy", accuracy_score(y_test, y_test_pred))
    mlflow.log_metric("test_f1",       f1_score(y_test, y_test_pred))

    # -----------------------
    # Save the NTBK preprocessor placeholder
    # -----------------------
    # No fitted transformer exists for this model; a text note is written to
    # a timestamped folder and that folder is uploaded as a run artifact.
    preproc_path = PREPROC_DIR / f"preprocessor_transformer_MLP_NTBK_{timestamp}"
    preproc_path.mkdir(parents=True, exist_ok=True)
    (preproc_path / "preprocessor_info.txt").write_text(
        "No sklearn preprocessor. Uses SentenceTransformer embeddings. NTBK"
    )
    mlflow.log_artifact(preproc_path)

    # Register the MLP in MLflow.
    # The signature is inferred from the first 50 training embeddings and the
    # model's predictions on them.
    example_input = X_train_emb[:50]
    signature = infer_signature(example_input, mlp_clf.predict(example_input))

    # Logs the model under "model" and registers it as DEEP_MODEL_NAME; the
    # first five example rows are stored as the input example.
    mlflow.sklearn.log_model(
        sk_model=mlp_clf,
        artifact_path="model",
        registered_model_name=DEEP_MODEL_NAME,
        signature=signature,
        input_example=example_input[:5],
    )

# Report where the local artifacts were written.
print("Embeddings guardados en:", emb_file)
print("Preprocesador NTBK guardado en:", preproc_path)


Batches:   0%|          | 0/102 [00:00<?, ?it/s]

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Iteration 1, loss = 0.67165255
Iteration 2, loss = 0.62857594
Iteration 3, loss = 0.59494416
Iteration 4, loss = 0.57511548
Iteration 5, loss = 0.55955338
Iteration 6, loss = 0.54681950
Iteration 7, loss = 0.53513976
Iteration 8, loss = 0.52429407
Iteration 9, loss = 0.51365622
Iteration 10, loss = 0.50370759


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Registered model 'workspace.default.imedia_sentiment_mlp_transformer' already exists. Creating a new version of this model...


Uploading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Created version '2' of model 'workspace.default.imedia_sentiment_mlp_transformer'.


üèÉ View run mlp_transformer_embeddings_NTBK at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1690980704956355/runs/a45423705a66491f885b254b8c4e6163
üß™ View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1690980704956355
Embeddings guardados en: ../embeddings/sentence-transformers_all-MiniLM-L6-v2_embeddings_NTBK_20251202T084227.npy
Preprocesador NTBK guardado en: ../preprocesador/preprocessor_transformer_MLP_NTBK_20251202T084227


---

**TODO:** Explicar por qué se eligieron estos modelos, cómo funciona cada uno, cómo se validan y por qué la métrica principal es F1.