# Fase 05 — Modeling

Notebook ejecutable y **equivalente** a `scripts/05_modeling.py`.

**Importante:** este notebook **no** gestiona MLflow. El registro en MLflow se realiza desde el Makefile (`publish5`/`remove5`).

## 1) Código de la fase (idéntico al script)

La celda siguiente contiene el código completo de la fase; la única diferencia respecto al script es que se omite el bloque `argparse` final.

In [1]:
#!/usr/bin/env python3
"""
Fase 05 — Modeling

Entrena modelos para una única familia por variante.

Produce:
- experiments/              → auditoría de trials
- model_final.h5            → modelo único seleccionado
- splits.parquet            → índices train/val/test
- 05_modeling_metadata.json → metadata enriquecida
"""

import sys
from pathlib import Path
import argparse
import json
from datetime import datetime, timezone
from time import perf_counter
import random
import os

import numpy as np
import pandas as pd
import yaml

# ============================================================
# TensorFlow runtime stabilization
# ============================================================
# These variables are written before the `import tensorflow` statement that
# follows, so they are already in the environment when TensorFlow loads.
# NOTE(review): pinning every pool to one thread is presumably meant to make
# runs more reproducible / avoid CPU oversubscription — confirm the intent.
os.environ["OMP_NUM_THREADS"] = "1"  # OpenMP thread count
os.environ["TF_NUM_INTRAOP_THREADS"] = "1"  # threads inside a single TF op
os.environ["TF_NUM_INTEROP_THREADS"] = "1"  # threads across independent TF ops
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # silence TF C++ INFO and WARNING logs

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import legacy as legacy_optimizers


# ============================================================
# BOOTSTRAP
# ============================================================
SCRIPT_PATH = Path.cwd().resolve()

# Look for the directory that contains the `mlops4ofp` package among the
# working directory and its ancestors (at most 10 candidates in total).
ROOT = next(
    (
        candidate
        for candidate in (SCRIPT_PATH, *SCRIPT_PATH.parents)[:10]
        if (candidate / "mlops4ofp").exists()
    ),
    None,
)
if ROOT is None:
    raise RuntimeError("No se pudo localizar project root")

# Make the project package importable for the imports that follow.
sys.path.insert(0, str(ROOT))

from mlops4ofp.tools.run_context import (
    detect_execution_dir,
    detect_project_root,
    assemble_run_context,
    print_run_context,
)
from mlops4ofp.tools.params_manager import ParamsManager
from mlops4ofp.tools.traceability import write_metadata
from mlops4ofp.tools.artifacts import get_git_hash


# ============================================================
# UTILIDADES
# ============================================================

def compute_class_weights(y):
    """Return Keras-style class weights for a binary label array.

    The negative class keeps weight 1.0 and the positive class is weighted by
    the negative/positive count ratio.

    Args:
        y: Array-like of binary labels (0 = negative, 1 = positive).

    Returns:
        ``{0: 1.0, 1: neg / pos}`` with plain Python floats, or ``None`` when
        either class is absent from ``y``.
    """
    labels = np.asarray(y)
    pos = int(np.sum(labels == 1))
    neg = int(np.sum(labels == 0))
    # With no positives the ratio is undefined. With no negatives it would be
    # 0.0, which would zero the loss contribution of every available sample.
    if pos == 0 or neg == 0:
        return None
    return {0: 1.0, 1: neg / pos}


def convert_to_native_types(obj):
    """Recursively replace numpy values with native Python ones for JSON.

    Handles dicts (keys and values), lists, tuples, numpy arrays and numpy
    integer / floating / boolean scalars. Any other object is returned as is.

    Args:
        obj: Arbitrary, possibly nested, structure.

    Returns:
        A structure of the same shape in which numpy scalars are ``int``,
        ``float`` or ``bool`` and numpy arrays are (nested) lists. Tuples stay
        tuples and lists stay lists.
    """
    if isinstance(obj, dict):
        return {
            convert_to_native_types(k): convert_to_native_types(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_native_types(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(convert_to_native_types(item) for item in obj)
    if isinstance(obj, np.ndarray):
        # tolist() yields nested lists; recurse to cover object arrays too.
        return convert_to_native_types(obj.tolist())
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj


def apply_rare_events(df, imbalance_cfg, seed):
    """Optionally cap the number of majority (label 0) rows.

    Args:
        df: DataFrame with a ``label`` column.
        imbalance_cfg: Mapping read for ``strategy`` and
            ``max_majority_samples``.
        seed: Random state used for both the negative sampling and the final
            shuffle.

    Returns:
        ``(df_out, info)``. ``df`` is returned untouched unless the strategy
        is ``"rare_events"`` and a cap is configured; otherwise ``df_out``
        holds every label-1 row plus at most ``max_majority_samples`` label-0
        rows, shuffled. ``info`` summarises what was done.
    """
    if imbalance_cfg.get("strategy", "none") != "rare_events":
        return df, {"strategy": "none"}

    majority_cap = imbalance_cfg.get("max_majority_samples")
    if majority_cap is None:
        return df, {
            "strategy": "rare_events",
            "note": "max_majority_samples=None → no reducción aplicada"
        }

    positives = df[df["label"] == 1]
    negatives = df[df["label"] == 0]

    # Never ask for more negatives than are available.
    kept_negatives = min(majority_cap, len(negatives))
    sampled_negatives = negatives.sample(n=kept_negatives, random_state=seed)

    # Concatenate and shuffle so the classes are interleaved.
    reduced = pd.concat([positives, sampled_negatives]).sample(
        frac=1.0, random_state=seed
    )

    return reduced, {
        "strategy": "rare_events",
        "n_pos_before": int(len(positives)),
        "n_neg_before": int(len(negatives)),
        "n_pos_after": int(len(positives)),
        "n_neg_after": int(kept_negatives),
    }


def pad_sequences(seqs, max_len, pad_value=0):
    """Left-pad (and left-truncate) integer sequences to a fixed width.

    Args:
        seqs: Sized collection of integer sequences.
        max_len: Width of the output matrix.
        pad_value: Filler placed in front of sequences shorter than
            ``max_len``.

    Returns:
        ``int32`` array of shape ``(len(seqs), max_len)``. Each row keeps the
        last ``max_len`` items of its sequence, right-aligned.
    """
    padded = np.full((len(seqs), max_len), pad_value, dtype=np.int32)
    for row, seq in zip(padded, seqs):
        tail = seq[-max_len:]
        n_tail = len(tail)
        if n_tail:
            # `row` is a view into `padded`, so this writes in place.
            row[max_len - n_tail:] = tail
    return padded


# ============================================================
# FAMILIAS
# ============================================================

def vectorize_dense_bow(df):
    """Encode each row's event list as a dense bag-of-words count vector.

    Args:
        df: DataFrame with an ``OW_events`` column (iterables of hashable,
            sortable events) and a ``label`` column.

    Returns:
        ``(X, y, aux)`` where ``X`` is a ``float32`` count matrix with one
        column per distinct event (sorted order), ``y`` the ``int32`` labels
        and ``aux`` the vectorization description (``input_dim``, ``vocab``,
        ``vectorization``).
    """
    event_lists = df["OW_events"].tolist()
    labels = df["label"].values.astype(np.int32)

    vocab = sorted({ev for events in event_lists for ev in events})
    column_of = dict(zip(vocab, range(len(vocab))))

    counts = np.zeros((len(event_lists), len(vocab)), dtype=np.float32)
    for row, events in enumerate(event_lists):
        for ev in events:
            counts[row, column_of[ev]] += 1.0

    aux = {
        "input_dim": counts.shape[1],
        "vocab": vocab,
        "vectorization": "dense_bow"
    }
    return counts, labels, aux


def build_dense_bow_model(aux, hp):
    """Build an (uncompiled) MLP binary classifier over bag-of-words input.

    Args:
        aux: Vectorization info; ``aux["input_dim"]`` sets the input width.
        hp: Hyperparameters; reads ``n_layers``, ``units`` and ``dropout``.

    Returns:
        A ``keras.Sequential`` model ending in a single sigmoid unit.
    """
    stack = [layers.Input(shape=(aux["input_dim"],))]

    # Hidden blocks: Dense(relu), followed by Dropout only when it is > 0.
    for _ in range(hp["n_layers"]):
        stack.append(layers.Dense(hp["units"], activation="relu"))
        if hp["dropout"] > 0:
            stack.append(layers.Dropout(hp["dropout"]))

    stack.append(layers.Dense(1, activation="sigmoid"))
    return keras.Sequential(stack)


def vectorize_sequence(df):
    """Encode each row's event list as a fixed-width sequence of token ids.

    Token ids start at 1 (sorted vocabulary order); 0 is left for padding.
    The width is the 95th percentile of the sequence lengths, with a minimum
    of 1.

    Args:
        df: DataFrame with an ``OW_events`` column and a ``label`` column.

    Returns:
        ``(X, y, aux)`` where ``X`` is the padded id matrix produced by
        ``pad_sequences``, ``y`` the ``int32`` labels and ``aux`` the
        vectorization description (``vocab``, ``vocab_size``, ``max_len``,
        ``vectorization``).
    """
    event_lists = df["OW_events"].tolist()
    labels = df["label"].values.astype(np.int32)

    vocab = sorted({ev for events in event_lists for ev in events})
    token_id = {ev: pos for pos, ev in enumerate(vocab, start=1)}

    encoded = [[token_id[ev] for ev in events] for events in event_lists]
    seq_lengths = [len(ids) for ids in encoded]
    if seq_lengths:
        max_len = max(1, int(np.percentile(seq_lengths, 95)))
    else:
        max_len = 1

    padded = pad_sequences(encoded, max_len)
    aux = {
        "vocab": vocab,
        "vocab_size": len(vocab),
        "max_len": max_len,
        "vectorization": "sequence"
    }
    return padded, labels, aux


def build_sequence_embedding_model(aux, hp):
    """Build an (uncompiled) embedding + average-pooling binary classifier.

    Args:
        aux: Vectorization info; reads ``max_len`` and ``vocab_size``.
        hp: Hyperparameters; reads ``embed_dim``, ``n_layers``, ``units`` and
            ``dropout``.

    Returns:
        A ``keras.Sequential`` model ending in a single sigmoid unit.
    """
    stack = [
        layers.Input(shape=(aux["max_len"],)),
        # One extra embedding row for id 0, which is masked (mask_zero=True).
        layers.Embedding(
            input_dim=aux["vocab_size"] + 1,
            output_dim=hp["embed_dim"],
            mask_zero=True,
        ),
        layers.GlobalAveragePooling1D(),
    ]

    # Hidden blocks: Dense(relu), followed by Dropout only when it is > 0.
    for _ in range(hp["n_layers"]):
        stack.append(layers.Dense(hp["units"], activation="relu"))
        if hp["dropout"] > 0:
            stack.append(layers.Dropout(hp["dropout"]))

    stack.append(layers.Dense(1, activation="sigmoid"))
    return keras.Sequential(stack)


def build_cnn1d_model(aux, hp):
    """Build an (uncompiled) embedding + Conv1D + max-pooling classifier.

    Args:
        aux: Vectorization info; reads ``max_len`` and ``vocab_size``.
        hp: Hyperparameters; reads ``embed_dim``, ``filters``,
            ``kernel_size``, ``n_layers``, ``units`` and ``dropout``.

    Returns:
        A ``keras.Sequential`` model ending in a single sigmoid unit.
    """
    stack = [
        layers.Input(shape=(aux["max_len"],)),
        # One extra embedding row for id 0; no masking is requested here.
        layers.Embedding(
            input_dim=aux["vocab_size"] + 1,
            output_dim=hp["embed_dim"],
        ),
        layers.Conv1D(
            filters=hp["filters"],
            kernel_size=hp["kernel_size"],
            activation="relu",
            padding="same",
        ),
        layers.GlobalMaxPooling1D(),
    ]

    # Hidden blocks: Dense(relu), followed by Dropout only when it is > 0.
    for _ in range(hp["n_layers"]):
        stack.append(layers.Dense(hp["units"], activation="relu"))
        if hp["dropout"] > 0:
            stack.append(layers.Dropout(hp["dropout"]))

    stack.append(layers.Dense(1, activation="sigmoid"))
    return keras.Sequential(stack)


# Registry of supported model families: family name -> the function that
# vectorizes the dataset and the function that builds the Keras model.
FAMILIES = {
    family_name: {"vectorize": vectorize_fn, "build": build_fn}
    for family_name, vectorize_fn, build_fn in (
        ("dense_bow", vectorize_dense_bow, build_dense_bow_model),
        ("sequence_embedding", vectorize_sequence, build_sequence_embedding_model),
        ("cnn1d", vectorize_sequence, build_cnn1d_model),
    )
}


# ============================================================
# MÉTRICAS / UMBRAL / LOOP
# ============================================================

def binary_metrics(y_true, y_prob, threshold=0.5):
    """Compute confusion counts and derived metrics at a decision threshold.

    Args:
        y_true: Array of ground-truth labels (0/1).
        y_prob: Array of predicted probabilities.
        threshold: Probabilities ``>= threshold`` are predicted as class 1.

    Returns:
        Dict with ``threshold``, ``accuracy``, ``precision``, ``recall``,
        ``f1`` (floats) and the ``TP``/``TN``/``FP``/``FN`` counts (ints).
    """
    y_pred = (y_prob >= threshold).astype(np.int32)

    def _count(truth, pred):
        # Number of samples with the given (true label, predicted label) pair.
        return int(np.sum((y_true == truth) & (y_pred == pred)))

    tp = _count(1, 1)
    tn = _count(0, 0)
    fp = _count(0, 1)
    fn = _count(1, 0)

    eps = 1e-9  # keeps the ratios defined when a denominator would be zero
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)
    accuracy = (tp + tn) / max(tp + tn + fp + fn, 1)

    return {
        "threshold": float(threshold),
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "TP": tp,
        "TN": tn,
        "FP": fp,
        "FN": fn,
    }


def best_threshold_by_f1(y_true, y_prob):
    """Return the metrics of the threshold with the highest F1.

    Scans 19 evenly spaced thresholds from 0.05 to 0.95. On ties the lowest
    threshold wins.

    Args:
        y_true: Array of ground-truth labels (0/1).
        y_prob: Array of predicted probabilities.

    Returns:
        The ``binary_metrics`` dict of the winning threshold.
    """
    candidates = (
        binary_metrics(y_true, y_prob, threshold=t)
        for t in np.linspace(0.05, 0.95, 19)
    )
    # max() keeps the first maximal element, i.e. the lowest tied threshold.
    return max(candidates, key=lambda metrics: metrics["f1"])


def sample_hparams(search_space):
    """Draw one hyperparameter configuration from a search space.

    Args:
        search_space: Mapping of hyperparameter name to either a list of
            candidate values or a fixed value.

    Returns:
        Dict with one value per name: a ``random.choice`` among the
        candidates for list entries, the value itself otherwise.
    """
    return {
        name: random.choice(spec) if isinstance(spec, list) else spec
        for name, spec in search_space.items()
    }


def train_one_trial(
    family_name,
    family_cfg,
    hp,
    X_train,
    y_train,
    X_val,
    y_val,
    training_cfg,
    class_weight,
    trial_id,
    model_dir,
    seed,
    verbose=0,
    should_save_model=False,
    callbacks=None,
):
    """Build, train and evaluate one model for a single trial.

    Args:
        family_name: Name of the model family (not read by this function).
        family_cfg: Mapping with a ``build`` callable and the ``aux``
            vectorization info passed to it.
        hp: Hyperparameters; ``lr`` (default 1e-3) and ``batch_size``
            (default 64) are read here, the rest by the builder.
        X_train, y_train: Training data.
        X_val, y_val: Validation data, also used to pick the threshold.
        training_cfg: Training settings; ``epochs`` (default 5) is read.
        class_weight: Optional class-weight dict forwarded to ``fit``.
        trial_id: Trial number; added to ``seed`` to seed this trial.
        model_dir: Not read by this function.
        seed: Base random seed.
        verbose: Keras ``fit`` verbosity.
        should_save_model: Not read by this function.
        callbacks: Optional Keras callbacks forwarded to ``fit``.

    Returns:
        Dict with ``trial_id``, ``hparams``, ``val_metrics`` (metrics at the
        best-F1 validation threshold), ``history`` and the trained ``model``.
    """
    # Each trial gets its own seed, derived from the base seed.
    tf.keras.utils.set_random_seed(seed + trial_id)

    builder = family_cfg["build"]
    model = builder(family_cfg["aux"], hp)
    model.compile(
        optimizer=legacy_optimizers.Adam(learning_rate=hp.get("lr", 1e-3)),
        loss="binary_crossentropy",
        metrics=[keras.metrics.AUC(name="auc")],
    )

    fit_options = {
        "validation_data": (X_val, y_val),
        "batch_size": int(hp.get("batch_size", 64)),
        "epochs": int(training_cfg.get("epochs", 5)),
        "class_weight": class_weight,
        "verbose": verbose,
        "callbacks": callbacks,
    }
    history = model.fit(X_train, y_train, **fit_options)

    # Pick the decision threshold on the validation probabilities.
    val_scores = model.predict(X_val, verbose=0).reshape(-1)
    val_metrics = best_threshold_by_f1(y_val, val_scores)

    return {
        "trial_id": trial_id,
        "hparams": hp,
        "val_metrics": val_metrics,
        "history": history.history,
        "model": model,
    }


# ============================================================
# MAIN
# ============================================================

def main(variant: str):
    """Run phase 05 (modeling) for one variant.

    Loads the variant's ``params.yaml`` and the parent F04 dataset, optionally
    subsamples it, vectorizes it for the configured model family, splits it
    roughly 70/15/15 into train/val/test, runs ``automl.n_trials`` random
    search trials, keeps the trial with the best validation F1 and saves its
    model as ``model_final.h5``. It also writes ``experiments/splits.parquet``,
    ``experiments/trials.parquet`` and the phase metadata file.

    Args:
        variant: Name of the phase-05 variant to execute.

    Raises:
        FileNotFoundError: The parent F04 dataset does not exist.
        RuntimeError: The dataset lacks the ``label`` or ``OW_events`` column.
        ValueError: ``model_family`` is not supported or ``automl.n_trials``
            is smaller than 1.
    """
    execution_dir = detect_execution_dir()
    project_root = detect_project_root(execution_dir)
    phase = "05_modeling"

    pm = ParamsManager(phase, project_root)
    pm.set_current(variant)
    variant_root = pm.current_variant_dir()

    ctx = assemble_run_context(
        execution_dir=execution_dir,
        project_root=project_root,
        phase=phase,
        variant=variant,
        variant_root=variant_root,
    )
    print_run_context(ctx)

    params_path = ctx["variant_root"] / "params.yaml"
    with open(params_path, "r", encoding="utf-8") as f:
        params = yaml.safe_load(f) or {}
    ctx["variant_params"] = params

    parent_variant = params["parent_variant"]
    parent_dataset = project_root / "executions" / "04_targetengineering" / parent_variant / "04_targetengineering_dataset.parquet"
    if not parent_dataset.exists():
        raise FileNotFoundError(f"No existe dataset padre: {parent_dataset}")

    df = pd.read_parquet(parent_dataset)

    if "label" not in df.columns:
        raise RuntimeError("El dataset de F04 debe contener columna 'label'.")
    if "OW_events" not in df.columns:
        raise RuntimeError("El dataset de F04 debe contener columna 'OW_events'.")

    # Seed every RNG used below (Python, numpy, TensorFlow).
    seed = int(params.get("training", {}).get("seed", 42))
    random.seed(seed)
    np.random.seed(seed)
    tf.keras.utils.set_random_seed(seed)

    imbalance_cfg = params.get("imbalance", {})
    df, imbalance_info = apply_rare_events(df, imbalance_cfg, seed)

    max_samples = params.get("training", {}).get("max_samples")
    if max_samples is not None and len(df) > max_samples:
        df = df.sample(n=max_samples, random_state=seed)

    family = params["model_family"]
    if family not in FAMILIES:
        raise ValueError(f"model_family '{family}' no soportada")

    family_cfg = FAMILIES[family]
    vectorize_fn = family_cfg["vectorize"]

    X, y, aux = vectorize_fn(df)

    # Shuffle once; `idx[j]` is the position in `df` of shuffled row `j`.
    idx = np.arange(len(X))
    np.random.shuffle(idx)
    X = X[idx]
    y = y[idx]

    n = len(X)
    n_train = int(0.7 * n)
    n_val = int(0.15 * n)

    X_train, y_train = X[:n_train], y[:n_train]
    X_val, y_val = X[n_train:n_train+n_val], y[n_train:n_train+n_val]
    X_test, y_test = X[n_train+n_val:], y[n_train+n_val:]

    experiments_dir = ctx["variant_root"] / "experiments"
    models_dir = experiments_dir / "models"
    experiments_dir.mkdir(parents=True, exist_ok=True)
    models_dir.mkdir(parents=True, exist_ok=True)

    # "idx" is the row position inside the shuffled arrays (kept for
    # backward compatibility). "source_index" is the dataset index label of
    # that row, which is what makes the split traceable after the shuffle.
    pd.DataFrame({
        "split": ["train"] * len(X_train) + ["val"] * len(X_val) + ["test"] * len(X_test),
        "idx": list(range(n)),
        "source_index": df.index.to_numpy()[idx],
    }).to_parquet(experiments_dir / "splits.parquet", index=False)

    class_weight = None
    if params.get("training", {}).get("class_weight_mode") == "balanced":
        class_weight = compute_class_weights(y_train)

    automl_cfg = params.get("automl", {})
    n_trials = int(automl_cfg.get("n_trials", 1))
    if n_trials < 1:
        # Without at least one trial there is no model to select or save.
        raise ValueError(f"automl.n_trials debe ser >= 1 (recibido: {n_trials})")

    # NOTE: setdefault mutates params["search_space"] in place when the key
    # exists, so the defaults below also reach the params handed to
    # write_metadata.
    search_space = params.get("search_space", {})
    search_space.setdefault("batch_size", [params.get("training", {}).get("batch_size", 64)])
    search_space.setdefault("lr", [params.get("training", {}).get("lr", 1e-3)])
    search_space.setdefault("n_layers", [1])
    search_space.setdefault("units", [64])
    search_space.setdefault("dropout", [0.0])

    if family in {"sequence_embedding", "cnn1d"}:
        search_space.setdefault("embed_dim", [32])
    if family == "cnn1d":
        search_space.setdefault("filters", [64])
        search_space.setdefault("kernel_size", [3])

    family_cfg_runtime = dict(family_cfg)
    family_cfg_runtime["aux"] = aux

    trials_rows = []
    best_trial = None

    start_all = perf_counter()

    # Configurable early-stopping callback.
    callbacks = []
    es_cfg = params.get("training", {}).get("early_stopping", {})
    if es_cfg.get("enabled", False):
        callbacks.append(
            keras.callbacks.EarlyStopping(
                monitor=es_cfg.get("monitor", "val_loss"),
                patience=int(es_cfg.get("patience", 3)),
                restore_best_weights=bool(es_cfg.get("restore_best_weights", True)),
            )
        )

    for trial_id in range(1, n_trials + 1):
        hp = sample_hparams(search_space)

        t0 = perf_counter()
        result = train_one_trial(
            family,
            family_cfg_runtime,
            hp,
            X_train, y_train,
            X_val, y_val,
            params.get("training", {}),
            class_weight,
            trial_id,
            models_dir,
            seed,
            verbose=0,
            should_save_model=False,
            callbacks=callbacks,
        )
        dt = perf_counter() - t0

        valm = result["val_metrics"]

        row = {
            "trial_id": trial_id,
            "family": family,
            "seconds": float(dt),
            "val_f1": valm["f1"],
            "val_precision": valm["precision"],
            "val_recall": valm["recall"],
            "val_acc": valm["accuracy"],
            "val_threshold": valm["threshold"],
            "hparams": json.dumps(result["hparams"], ensure_ascii=False),
        }
        trials_rows.append(row)

        # Strict ">" keeps the earliest trial on ties.
        if best_trial is None or valm["f1"] > best_trial["val_metrics"]["f1"]:
            best_trial = result

    trials_df = pd.DataFrame(trials_rows).sort_values("val_f1", ascending=False)
    trials_df.to_parquet(experiments_dir / "trials.parquet", index=False)

    best_model = best_trial["model"]
    best_hp = best_trial["hparams"]
    best_val = best_trial["val_metrics"]

    # Evaluate on the test split with the threshold chosen on validation.
    y_test_prob = best_model.predict(X_test, verbose=0).reshape(-1)
    test_metrics = binary_metrics(y_test, y_test_prob, threshold=best_val["threshold"])

    model_final_path = ctx["variant_root"] / "model_final.h5"
    best_model.save(model_final_path)

    # JSON-safe copy of the vectorization info (the vocab may hold numpy
    # scalars), used in the metadata dict below.
    aux_native = convert_to_native_types(aux)

    # NOTE(review): this enriched dict is assembled but never handed to
    # write_metadata below, which only receives the raw params — confirm
    # whether the metrics / vectorization info are meant to be persisted.
    metadata = {
        "stage": phase,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "variant": variant,
        "parent_variant": parent_variant,
        "model_family": family,
        "inputs": [str(parent_dataset)],
        "outputs": [str(model_final_path), str(experiments_dir / "trials.parquet")],
        "params": {
            "search_space": search_space,
            "n_trials": n_trials,
            "training": params.get("training", {}),
            "imbalance": imbalance_cfg,
            "vectorization": aux_native,   # includes vocab / max_len / input_dim
            "best_hparams": best_hp,
        },
        "metrics": {
            "best_val": best_val,
            "test": test_metrics,
            "n_rows": int(len(df)),
            "n_train": int(len(X_train)),
            "n_val": int(len(X_val)),
            "n_test": int(len(X_test)),
            "class_balance": {
                "pos": int(np.sum(y == 1)),
                "neg": int(np.sum(y == 0)),
            },
            "runtime_seconds_total": float(perf_counter() - start_all),
            "imbalance_info": imbalance_info,
        },
        "git": {"commit": get_git_hash()},
    }

    metadata_path = ctx.get("outputs", {}).get("metadata")
    if metadata_path is None:
        metadata_path = ctx["variant_root"] / f"{phase}_metadata.json"
    write_metadata(
        stage=phase,
        variant=variant,
        parent_variant=parent_variant,
        inputs=[str(parent_dataset)],
        outputs=[str(model_final_path), str(experiments_dir / "trials.parquet")],
        params=params,
        metadata_path=metadata_path,
    )

    print("[OK] Entrenamiento completado")
    print(f"[OK] Modelo final: {model_final_path}")
    print(f"[OK] Trials: {experiments_dir / 'trials.parquet'}")




## 2) Ejecutar

Define la variante (se toma de la variable de entorno `ACTIVE_VARIANT`; si no está definida, se usa `v501`) y ejecuta `main(VARIANT)`.

In [2]:
# ============================================================
# EXECUTION
# ============================================================

# Variant to run.
# - The value is read from the ACTIVE_VARIANT environment variable, so export
#   that variable when launching through the Makefile (nb5-run).
# - When running by hand in VSCode/Jupyter without it, edit the "v501" fallback.
VARIANT = os.environ.get("ACTIVE_VARIANT", "v501")

main(VARIANT)

[CTX] execution_dir: /Users/juancarlosduenaslopez/Documents/mlops/mlops4ofp/notebooks
[CTX] project_root: /Users/juancarlosduenaslopez/Documents/mlops/mlops4ofp
[CTX] phase: 05_modeling
[CTX] variant: v504
[CTX] variant_root: /Users/juancarlosduenaslopez/Documents/mlops/mlops4ofp/executions/05_modeling/v504
[CTX] figures_dir: /Users/juancarlosduenaslopez/Documents/mlops/mlops4ofp/executions/05_modeling/v504/figures


[OK] Entrenamiento completado
[OK] Modelo final: /Users/juancarlosduenaslopez/Documents/mlops/mlops4ofp/executions/05_modeling/v504/model_final.h5
[OK] Trials: /Users/juancarlosduenaslopez/Documents/mlops/mlops4ofp/executions/05_modeling/v504/experiments/trials.parquet


  saving_api.save_model(
