In [None]:
# ----------------------------------------
import os
import json
import mlflow
from mlflow.tracking import MlflowClient

# --- Constants -------------------------------------------------------------
# Inside Docker, services talk to each other through the service hostname: mlflow
TRACKING_URI_DOCKER = "http://mlflow:5000"
# URL used to open the UI from the host machine (host:5002 -> container:5000)
MLFLOW_UI_HOST = "http://localhost:5002"
# Names of the experiment and of the registered model used throughout the notebook
EXPERIMENT_NAME = "penguins_gridsearch_v1"
REGISTERED_MODEL_NAME = "PenguinClassifier"

# --- Client setup ----------------------------------------------------------
# The tracking URI is set first so that the experiment is selected on that server.
mlflow.set_tracking_uri(TRACKING_URI_DOCKER)
mlflow.set_experiment(EXPERIMENT_NAME)

# Echo both endpoints: where runs are sent and where to browse them.
for label, value in (
    ("MLflow tracking   :", mlflow.get_tracking_uri()),
    ("MLflow UI (host)  :", MLFLOW_UI_HOST),
):
    print(label, value)

In [None]:
# ----------------------------------------
import pandas as pd
from sqlalchemy import create_engine, text

# Connection to MySQL inside Docker.
# The URI is read from the DATA_DB_URI environment variable when it is set, so
# credentials do not have to be edited into (or committed with) the notebook.
# When the variable is absent the original Docker value is used, which keeps
# the notebook working unchanged. `os` is imported in the first cell.
DATA_DB_URI = os.environ.get(
    "DATA_DB_URI",
    "mysql+pymysql://mlflow_user:mlflow_pass@mysql:3306/penguins_db",
)
engine = create_engine(DATA_DB_URI)

# Schema and table names for the raw and the processed penguin data.
RAW_SCHEMA, PROC_SCHEMA = "raw", "processed"
RAW_TABLE, PROC_TABLE = "penguins_raw", "penguins_processed"

def table_exists(engine, schema, table):
    """Tell whether `schema.table` is listed in `information_schema.tables`.

    Args:
        engine: SQLAlchemy engine used to open the connection.
        schema: Value compared against the `table_schema` column.
        table: Value compared against the `table_name` column.

    Returns:
        True when the COUNT(*) of matching rows is greater than zero.
    """
    # Both values travel as bind parameters; nothing is interpolated into the SQL.
    query = text("""
            SELECT COUNT(*)
            FROM information_schema.tables
            WHERE table_schema=:schema AND table_name=:table
        """)
    bind_params = {"schema": schema, "table": table}
    with engine.connect() as con:
        match_count = con.execute(query, bind_params).scalar()
    return match_count > 0

# Load the processed dataset into `df`: from MySQL when the processed table
# exists, otherwise rebuild it in memory from the palmerpenguins package.
if table_exists(engine, PROC_SCHEMA, PROC_TABLE):
    df = pd.read_sql_table(PROC_TABLE, con=engine, schema=PROC_SCHEMA)
    print(f"Cargado desde MySQL: {PROC_SCHEMA}.{PROC_TABLE} -> {df.shape}")
else:
    # Fallback: build the processed frame ourselves.
    print("No existe processed.penguins_processed; creando fallback en memoria…")
    # Imported here because they are only needed on this branch.
    from palmerpenguins import load_penguins
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    import numpy as np

    raw = load_penguins()
    LABEL_COL = "species"
    NUMERIC_COLS = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'year']
    CATEGORICAL_COLS = ['island', 'sex']

    # Separate the label from the features.
    y = raw[LABEL_COL]
    X = raw.drop(columns=[LABEL_COL], errors='ignore').copy()

    # NOTE(review): the imputer, scaler and encoder below are fit on the whole
    # dataset, i.e. before the train/test split that happens afterwards, so
    # test rows influence the fitted statistics. Consider fitting them on the
    # training split only (e.g. inside the model pipeline).

    # Numeric -> coerce to numbers, impute the median, standardize.
    for c in NUMERIC_COLS:
        X[c] = pd.to_numeric(X[c], errors='coerce')
    num_imputer = SimpleImputer(strategy="median")
    X_num = num_imputer.fit_transform(X[NUMERIC_COLS])
    num_scaler = StandardScaler()
    X_num = num_scaler.fit_transform(X_num)

    # Categorical -> missing values become the literal category 'missing', then one-hot.
    cats = X[CATEGORICAL_COLS].astype('string').fillna('missing')
    # Note: 'sparse_output' requires scikit-learn >= 1.2; on older versions use 'sparse=False'.
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    X_cat = ohe.fit_transform(cats)

    # Concatenate numeric and one-hot blocks under their real column names.
    # The previous broad `except Exception` fallback to generic "cat_<i>" names
    # was removed: any scikit-learn that accepts `sparse_output` also provides
    # `get_feature_names_out`, and silently renaming the columns would hide a
    # real error and break code that addresses these columns by name.
    cat_names = list(ohe.get_feature_names_out(CATEGORICAL_COLS))
    feature_names = NUMERIC_COLS + cat_names
    Xp = np.hstack([X_num, X_cat])
    df = pd.DataFrame(Xp, columns=feature_names)
    df[LABEL_COL] = y.values
    print("Procesado en memoria:", df.shape, "→ columnas:", df.columns.tolist())

# Train/test split
from sklearn.model_selection import train_test_split

LABEL_COL = "species"
# Target is the label column; features are every other column of `df`.
y = df[LABEL_COL]
X = df.drop(columns=[LABEL_COL])

# 20% of the rows go to the test set, stratified on the label, with a fixed seed.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print("Train/Test shapes:", X_train.shape, X_test.shape)


In [None]:
# ----------------------------------------
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

# Two model families, each wrapped in a pipeline, to guarantee >= 20 runs:
#   1) LogisticRegression (several C values and solvers)
#   2) RandomForest (several depths and numbers of estimators)
classifiers = {
    "logreg": LogisticRegression(max_iter=1000),
    "rf": RandomForestClassifier(random_state=42),
}

# Every pipeline gets its own scaler instance in front of the classifier.
# X already arrives scaled; the scaler is kept as a safety net.
pipelines = {
    family: Pipeline([
        ("scaler", StandardScaler(with_mean=False)),
        ("clf", clf),
    ])
    for family, clf in classifiers.items()
}

# Search spaces, keyed like `pipelines`; the `clf__` prefix targets the "clf" step.
# NOTE(review): newer scikit-learn releases appear to deprecate the 'liblinear'
# solver for multiclass targets — verify against the installed version.
param_grids = dict(
    logreg=dict(
        clf__C=[0.1, 0.3, 1.0, 3.0, 10.0],
        clf__penalty=["l2"],                # l1 would require a different solver
        clf__solver=["lbfgs", "liblinear"],  # 5*1*2 = 10 combinations
    ),
    rf=dict(
        clf__n_estimators=[50, 100, 150],   # 3
        clf__max_depth=[3, 5, 8, None],     # 4
        clf__min_samples_split=[2, 4],      # 2 -> 3*4*2 = 24 combinations
    ),
)

# Autolog
# `max_tuning_runs=None` makes autolog create one child run per parameter set
# evaluated by GridSearchCV. With the default (5) only the five best
# candidates of each search are logged as child runs, which falls short of the
# ">= 20 runs" this notebook is meant to produce.
mlflow.sklearn.autolog(
    log_model_signatures=True,
    log_input_examples=True,
    log_models=True,
    max_tuning_runs=None,
)

# Run one grid search per model family, each inside its own MLflow run, and
# collect the winner of every search in `results`.
results = []
for name, pipe in pipelines.items():
    grid = GridSearchCV(
        estimator=pipe,
        param_grid=param_grids[name],
        cv=3,
        n_jobs=2,
        verbose=0,
    )
    with mlflow.start_run(run_name=f"gridsearch_{name}") as run:
        grid.fit(X_train, y_train)
        best_est, best_score = grid.best_estimator_, grid.best_score_
        # Extra manual logging on top of what autolog records.
        mlflow.log_metric("cv_best_score", float(best_score))
        mlflow.set_tag("model_family", name)
        results.append(dict(
            name=name,
            best_score=best_score,
            best_estimator=best_est,
            run_id=run.info.run_id,
        ))

len(results), results[:2]


In [None]:
# ----------------------------------------
# Pick the winner by its cross-validated best score.
def _cv_score(entry):
    """Return the `best_score` stored in one grid-search result entry."""
    return entry["best_score"]

best_entry = max(results, key=_cv_score)
best_name, best_est = best_entry["name"], best_entry["best_estimator"]
best_entry


In [None]:
# ----------------------------------------
from sklearn.metrics import accuracy_score

# Train the selected estimator, log its test accuracy, register it in the
# MLflow Model Registry and promote the new version to Production.
with mlflow.start_run(run_name=f"register_best_{best_name}") as run:
    # Fit on the full training split so this is "the" model that gets registered.
    best_est.fit(X_train, y_train)
    y_pred = best_est.predict(X_test)
    acc_test = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy_test", float(acc_test))

    # Log the model as a standard artifact and register it under the model name.
    mlflow.sklearn.log_model(
        sk_model=best_est,
        artifact_path="model",
        registered_model_name=REGISTERED_MODEL_NAME
    )

    client = MlflowClient()
    # Find the version created by THIS run. Previously the highest version
    # number returned by `get_latest_versions` was assumed to be ours, which is
    # not tied to the run and relies on a deprecated call. Versions are now
    # matched on run_id; if none matches, the highest version is used as before.
    all_versions = list(client.search_model_versions(f"name='{REGISTERED_MODEL_NAME}'"))
    run_versions = [m for m in all_versions if m.run_id == run.info.run_id]
    mv = max(run_versions or all_versions, key=lambda m: int(m.version))
    print("Registered:", mv.name, "version:", mv.version)

    # Promote to Production, archiving whatever version held that stage before.
    client.transition_model_version_stage(
        name=mv.name, version=mv.version, stage="Production", archive_existing_versions=True
    )
    print("Promoted to Production:", mv.name, "version:", mv.version, "→ ver en UI")


In [None]:
# ----------------------------------------
from urllib.parse import quote
from mlflow.tracking import MlflowClient

# Print the experiments known to the tracking server and host-side UI links.
client = MlflowClient()
exps = client.search_experiments()
print("Experimentos:", [e.name for e in exps])

# Link to the UI (host)
print("MLflow UI (host):", MLFLOW_UI_HOST)

# Direct link to the experiment.
# Looked up by name instead of indexing a filtered copy of `exps`: the search
# result is a paginated listing, and `[...][0]` fails with a bare IndexError
# when the experiment is not in it.
exp = client.get_experiment_by_name(EXPERIMENT_NAME)
if exp is None:
    raise LookupError(
        f"Experiment {EXPERIMENT_NAME!r} not found at {mlflow.get_tracking_uri()}"
    )
print("Experimento en host:", f"{MLFLOW_UI_HOST}/#/experiments/{exp.experiment_id}")

# Link to the registered models (the model name is URL-quoted).
print("Model Registry:", f"{MLFLOW_UI_HOST}/#/models/{quote(REGISTERED_MODEL_NAME)}")


In [None]:
# ----------------------------------------
import mlflow.pyfunc
import pandas as pd

# Load whichever version of the registered model is currently in Production.
production_uri = f"models:/{REGISTERED_MODEL_NAME}/Production"
model = mlflow.pyfunc.load_model(production_uri)

# Build a single example row following the API schema:
# numeric measurements first, then the one-hot island / sex indicators.
# NOTE(review): these are raw measurements, while the training features appear
# to be standardized upstream — confirm whether the example must be scaled the
# same way before trusting the prediction.
numeric_features = {
    "bill_length_mm": 39.5,
    "bill_depth_mm": 17.4,
    "flipper_length_mm": 186.0,
    "body_mass_g": 3800.0,
    "year": 2007.0,
}
one_hot_features = {
    "island_Biscoe": 0.0,
    "island_Dream": 1.0,
    "island_Torgersen": 0.0,
    "sex_female": 1.0,
    "sex_male": 0.0,
    "sex_missing": 0.0,
}
example = pd.DataFrame([{**numeric_features, **one_hot_features}])

pred = model.predict(example)
print(pred)
