# Fase 05 — Modeling (Full Version)


# In [None]:

# ============================================================
# Fase 05 — Modeling (Full Notebook Version)
# ============================================================

import os
import sys
import json
import random
from pathlib import Path
from datetime import datetime, timezone
from time import perf_counter

import numpy as np
import pandas as pd
import yaml

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from mlops4ofp.tools.run_context import (
    detect_execution_dir,
    detect_project_root,
    assemble_run_context,
    print_run_context,
)
from mlops4ofp.tools.params_manager import ParamsManager
from mlops4ofp.tools.traceability import write_metadata
from mlops4ofp.tools.artifacts import get_git_hash

# ============================================================
# RUN CONTEXT
# ============================================================

# Identifier of this phase; passed to ParamsManager and the run context.
PHASE = "05_modeling"

# The variant to run is taken from the environment; the error message
# quotes the make invocation that sets it.
if "ACTIVE_VARIANT" not in os.environ:
    raise RuntimeError("ACTIVE_VARIANT no definido (usar make nb5-run VARIANT=vNNN)")
VARIANT = os.environ["ACTIVE_VARIANT"]

# Reference point for the elapsed time printed at the end of the script.
t_start = perf_counter()

# Resolve the execution directory first, then the project root from it.
execution_dir = detect_execution_dir()
project_root = detect_project_root(execution_dir)

# Select the variant and obtain the directory holding its files.
pm = ParamsManager(PHASE, project_root)
pm.set_current(VARIANT)
variant_root = pm.current_variant_dir()

ctx = assemble_run_context(execution_dir, project_root, PHASE, VARIANT, variant_root)
print_run_context(ctx)

# Parameters of the variant are read from its params.yaml.
params_file = variant_root / "params.yaml"
with open(params_file) as params_fh:
    params = yaml.safe_load(params_fh)

parent_variant, model_family = params["parent_variant"], params["model_family"]

# ============================================================
# LOAD DATASET
# ============================================================

# The dataset is read from the 04_targetengineering execution directory
# of the parent variant.
upstream_dir = project_root / "executions" / "04_targetengineering" / parent_variant
dataset_path = upstream_dir / "04_targetengineering_dataset.parquet"

df = pd.read_parquet(dataset_path)

# One collection of events per row (column "OW_events") ...
sequences = df["OW_events"].tolist()

# ... and the matching labels, cast to int32.
label_values = df["label"].values
y = label_values.astype(np.int32)

# ============================================================
# VECTORIZE (simple bag-of-words)
# ============================================================

# Vocabulary: every distinct event seen in any sequence, in sorted order,
# mapped to its column position.
vocab = sorted({event for seq in sequences for event in seq})
index = dict(zip(vocab, range(len(vocab))))

# X[r, c] counts how many times event `vocab[c]` occurs in sequence r.
X = np.zeros((len(sequences), len(vocab)), dtype=np.float32)
for row, seq in zip(X, sequences):
    # `row` is a view into X, so the in-place increment updates X itself.
    for event in seq:
        row[index[event]] += 1.0

# ============================================================
# SPLIT
# ============================================================

# Shuffle the row indices with a dedicated, seeded generator so that the
# train/val/test partition is identical on every run. The indices are not
# persisted anywhere in this file, so an unseeded shuffle would make the
# test partition impossible to recover after the run.
# NOTE(review): "seed" is an optional top-level params.yaml key introduced
# here (default 42) -- confirm the key name against the params schema.
# Setting it to null yields a fresh random shuffle on every run.
split_seed = params.get("seed", 42)
rng = np.random.default_rng(split_seed)

idx = np.arange(len(X))
rng.shuffle(idx)

# Fractions for the train and validation partitions; whatever remains
# after both becomes the test partition.
split = params["evaluation"]["split"]
n = len(idx)
n_train = int(split["train"] * n)
n_val = int(split["val"] * n)

# Fail early with a clear message instead of an opaque error during
# training when the dataset is too small for the requested fractions.
if n_train == 0 or n_val == 0:
    raise ValueError(
        f"Split inválido: train={n_train}, val={n_val} filas para n={n}"
    )

train_idx = idx[:n_train]
val_idx = idx[n_train:n_train + n_val]
# Within this file the test indices are only counted (for the metadata).
test_idx = idx[n_train + n_val:]

X_train, y_train = X[train_idx], y[train_idx]
X_val, y_val = X[val_idx], y[val_idx]

# ============================================================
# EXPERIMENTS (max_trials)
# ============================================================


class _KeepBestWeights(keras.callbacks.Callback):
    """Leave the model with the weights of its best epoch.

    After every epoch the monitored metric is compared with the best value
    seen so far (higher is better); on a strict improvement a copy of the
    weights is kept. When training ends those weights are put back, so the
    model object matches the maximum of the metric history rather than the
    last epoch.
    """

    def __init__(self, monitor):
        super().__init__()
        self.monitor = monitor
        self.best_value = None
        self.best_weights = None

    def on_epoch_end(self, epoch, logs=None):
        value = (logs or {}).get(self.monitor)
        if value is None:
            return
        if self.best_value is None or value > self.best_value:
            self.best_value = value
            self.best_weights = self.model.get_weights()

    def on_train_end(self, logs=None):
        if self.best_weights is not None:
            self.model.set_weights(self.best_weights)


max_trials = params["automl"]["max_trials"]
# With no trials there is no model to keep; report it here instead of
# failing later when the (missing) best model is used.
if max_trials < 1:
    raise ValueError(f"automl.max_trials debe ser >= 1 (recibido: {max_trials!r})")

# Seed the hyperparameter sampler and TensorFlow so repeated runs of the
# same variant explore the same trials.
# NOTE(review): "seed" is an optional top-level params.yaml key introduced
# here (default 42) -- confirm the key name against the params schema.
# Setting it to null keeps the sampling and initialisation unseeded.
search_seed = params.get("seed", 42)
sampler = random.Random(search_seed)
if search_seed is not None:
    tf.random.set_seed(search_seed)

# Loop-invariant: number of epochs for every trial.
epochs = params["training"]["epochs"]

trials_summary = []

best_recall = -1.0
best_model = None
best_hparams = None

for trial in range(max_trials):

    # Random search over a small discrete grid.
    units = sampler.choice([32, 64])
    lr = sampler.choice([0.001, 0.0005])
    batch_size = sampler.choice([32, 64])
    hparams = {
        "units": units,
        "learning_rate": lr,
        "batch_size": batch_size,
    }

    # One hidden layer on top of the bag-of-words vector, sigmoid output
    # for the binary label.
    model = keras.Sequential([
        layers.Input(shape=(X.shape[1],)),
        layers.Dense(units, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(lr),
        loss="binary_crossentropy",
        metrics=[keras.metrics.Recall(name="recall")],
    )

    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        # Keep the weights of the best validation-recall epoch, so the
        # model agrees with the score recorded below.
        callbacks=[_KeepBestWeights("val_recall")],
    )

    # Score of the trial: best validation recall over all epochs.
    val_recall = float(max(history.history["val_recall"]))

    trials_summary.append({
        "trial_id": trial,
        "hyperparameters": dict(hparams),
        "val_recall": val_recall,
    })

    # Strict comparison: on ties the earliest trial is kept.
    if val_recall > best_recall:
        best_recall = val_recall
        best_model = model
        best_hparams = dict(hparams)

# ============================================================
# SAVE FINAL MODEL
# ============================================================

# best_model is still None when the trial loop did not run, or when no
# trial scored above the initial best_recall of -1.0. Stop with an
# explicit message instead of an AttributeError on `None.save`.
if best_model is None:
    raise RuntimeError(
        "No hay modelo entrenado para guardar (ningún trial produjo un modelo)"
    )

final_model_path = variant_root / "model_final.h5"
best_model.save(final_model_path)

# ============================================================
# FULL METADATA
# ============================================================

metadata_path = variant_root / f"{PHASE}_metadata.json"

# Number of rows in each partition.
split_sizes = {
    name: len(part)
    for name, part in (("train", train_idx), ("val", val_idx), ("test", test_idx))
}

# Nothing is published to MLflow in this script; the entry is written
# with empty values.
mlflow_info = {"run_id": None, "published": False}

git_info = {"commit": get_git_hash()}
generated_at = datetime.now(timezone.utc).isoformat()

# Key order is kept as-is because it determines the layout of the JSON.
metadata = {
    "phase": PHASE,
    "variant": VARIANT,
    "parent_variant": parent_variant,
    "model_family": model_family,
    "num_experiments": max_trials,
    "best_val_recall": best_recall,
    "best_hyperparameters": best_hparams,
    "model_path": str(final_model_path),
    "dataset_path": str(dataset_path),
    "split_sizes": split_sizes,
    "trials_summary": trials_summary,
    "mlflow": mlflow_info,
    "git": git_info,
    "generated_at": generated_at,
}

with open(metadata_path, "w") as f:
    f.write(json.dumps(metadata, indent=2))

# NOTE(review): only the metadata file is listed as an output; the saved
# model file is not -- confirm whether it should be traced as well.
# NOTE(review): write_metadata receives the same path that was just
# written above -- confirm it does not discard the JSON content.
write_metadata(
    stage=PHASE,
    variant=VARIANT,
    parent_variant=parent_variant,
    inputs=[str(dataset_path)],
    outputs=[str(metadata_path)],
    params=params,
    metadata_path=metadata_path,
)

elapsed = perf_counter() - t_start
print(f"[DONE] Fase 05 completada en {elapsed:.1f}s")
