# F07 - Deploy & Runtime Validation (autocontenido)

Este notebook implementa completamente la Fase 07 sin depender de `scripts/07_deployrun.py`.

Flujo:
1) prepare: genera `manifest.json` a partir de la variante F06 padre
2) run: arranca servidor Flask (minimo), ejecuta cliente batch, guarda logs crudos (parquet+csv)
3) postprocess: calcula metricas por modelo (`prediction_name`) incluyendo recuentos `no_ref_*`
4) report: genera `report.html` y figuras en `report/figures/`
5) traceability: escribe `07_deployrun_metadata.json`

Propiedades:
- Idempotente: cada vez que se ejecuta run se borran y se regeneran `runtime/`, `logs/`, `metrics/` y `report/`.
- Caja negra: servidor+cliente se ejecutan dentro del notebook como sistema de prueba.


In [None]:
# ===============================
# Configuration
# ===============================
# F07 variant to execute (vNNN); change this to the variant you want to run.
VARIANT = "v001"

# Fallback server address, used when params.yaml does not provide one.
DEFAULT_HOST, DEFAULT_PORT = "127.0.0.1", 5005

print(f"[F07] VARIANT = {VARIANT}")


In [None]:
# ===============================
# Bootstrap: locate the project root
# ===============================
from pathlib import Path
import sys

# Walk upwards from the working directory, probing at most 10 directory
# levels for one that contains an `mlops4ofp` entry.
ROOT = Path().resolve()
_probes_left = 10
while _probes_left and not (ROOT / "mlops4ofp").exists():
    ROOT = ROOT.parent
    _probes_left -= 1
if not _probes_left:
    raise RuntimeError("No se pudo localizar project root (carpeta mlops4ofp)")

# Put the project root first on sys.path so the project imports resolve.
sys.path.insert(0, str(ROOT))
print("[F07] project_root =", ROOT)


In [None]:
# ===============================
# Imports (proyecto + deps)
# ===============================
import json
import shutil
import time
from datetime import datetime, timezone
from pathlib import Path
from threading import Thread

import numpy as np
import pandas as pd
import requests
import yaml

import tensorflow as tf
from flask import Flask, request, jsonify
from werkzeug.serving import make_server

import matplotlib.pyplot as plt

from mlops4ofp.tools.params_manager import ParamsManager
from mlops4ofp.tools.run_context import detect_execution_dir, detect_project_root
from mlops4ofp.tools.traceability import write_metadata


In [None]:
# ===============================
# Paths and utilities
# ===============================
# Phase identifier: used for the ParamsManager lookup and the metadata file name.
PHASE = "07_deployrun"


def ensure_clean_dir(path: Path) -> None:
    """Leave ``path`` as an empty directory.

    An existing tree at ``path`` is removed first; the directory is then
    created again together with any missing parents.
    """
    stale = path.exists()
    if stale:
        shutil.rmtree(path)
    path.mkdir(parents=True, exist_ok=True)

def variant_root_from_pm(variant: str) -> Path:
    """Return the directory that ParamsManager reports for ``variant``.

    The project root is derived from the detected execution directory, a
    ParamsManager for ``PHASE`` is pointed at ``variant``, and its current
    variant directory is returned.
    """
    manager = ParamsManager(PHASE, detect_project_root(detect_execution_dir()))
    manager.set_current(variant)
    return manager.current_variant_dir()

variant_root = variant_root_from_pm(VARIANT)
print("[F07] variant_root =", variant_root)

params_path = variant_root / "params.yaml"
if not params_path.exists():
    raise FileNotFoundError(f"No existe params.yaml para {PHASE}:{VARIANT} en {params_path}")

with open(params_path, "r", encoding="utf-8") as f:
    # safe_load returns None for an empty document; normalise to a dict so the
    # missing-parent check below raises its ValueError instead of AttributeError.
    f07_params = yaml.safe_load(f) or {}

parent_f06 = f07_params.get("parent_variant_f06")
if not parent_f06:
    raise ValueError("parent_variant_f06 debe estar definido en params.yaml de F07")

runtime_cfg = f07_params.get("runtime") or {}
# `or` (instead of a .get default) also covers keys that are present but null.
HOST = runtime_cfg.get("host") or DEFAULT_HOST
PORT = int(runtime_cfg.get("port") or DEFAULT_PORT)

print("[F07] parent_f06 =", parent_f06)
print("[F07] runtime =", HOST, PORT)

# Output directories of this variant; the run step wipes and recreates them.
runtime_dir = variant_root / "runtime"
logs_dir = variant_root / "logs"
metrics_dir = variant_root / "metrics"
report_dir = variant_root / "report"
figures_dir = report_dir / "figures"

# Files written directly into the variant directory.
manifest_path = variant_root / "manifest.json"
metadata_path = variant_root / f"{PHASE}_metadata.json"


## 1) Prepare - generar manifest.json

`manifest.json` es el contrato sellado de F07:
- lista de modelos (por `prediction_name`)
- directorio del modelo en el paquete F06 (con `model.h5` y `model_summary.json`)
- dataset asociado (parquet F04 copiado en F06)
- columnas (`OW_events`, `label`)


In [None]:
# ===============================
# Prepare: build manifest.json
# ===============================
project_root = ROOT
f06_root = project_root / "executions" / "06_packaging" / parent_f06
if not f06_root.exists():
    raise FileNotFoundError(f"No existe paquete F06: {f06_root}")

f06_metadata_path = f06_root / "06_packaging_metadata.json"
if not f06_metadata_path.exists():
    # Fall back to any *_metadata.json. glob() yields entries in arbitrary
    # order, so sort to make the pick reproducible across runs and machines.
    candidates = sorted(f06_root.glob("*_metadata.json"))
    if not candidates:
        raise FileNotFoundError(f"No se encontro metadata F06 en {f06_root}")
    f06_metadata_path = candidates[0]

f06_metadata = json.loads(f06_metadata_path.read_text(encoding="utf-8"))

models_manifest = []
datasets_manifest = []

datasets_dir_f06 = f06_root / "datasets"
models_dir_f06 = f06_root / "models"

# Dataset paths already listed, so a dataset shared by several models
# appears only once in the manifest.
seen_datasets = set()

for m in f06_metadata.get("models", []):
    pred_name = m["prediction_name"]
    v05 = m["source_f05"]

    # Exactly one model directory named "<prediction_name>__*" is expected.
    # Plain files that happen to match the pattern are not model directories.
    model_candidates = sorted(
        p for p in models_dir_f06.glob(f"{pred_name}__*") if p.is_dir()
    )
    if len(model_candidates) != 1:
        raise RuntimeError(
            f"Esperaba 1 directorio para modelo '{pred_name}' en {models_dir_f06}, encontrado: {model_candidates}"
        )
    model_dir = model_candidates[0]
    model_h5 = model_dir / "model.h5"
    model_summary = model_dir / "model_summary.json"
    if not model_h5.exists():
        raise FileNotFoundError(f"No existe model.h5 en {model_dir}")
    if not model_summary.exists():
        raise FileNotFoundError(f"No existe model_summary.json en {model_dir} (necesario para vectorizacion runtime)")

    # The F04 variant is resolved through the F05 params of the model.
    f05_params_path = project_root / "executions" / "05_modeling" / v05 / "params.yaml"
    if not f05_params_path.exists():
        raise FileNotFoundError(f"No existe params.yaml de F05 {v05}: {f05_params_path}")
    f05_params = yaml.safe_load(f05_params_path.read_text(encoding="utf-8"))
    v04 = f05_params["parent_variant"]

    dataset_path = datasets_dir_f06 / f"{v04}__dataset.parquet"
    if not dataset_path.exists():
        raise FileNotFoundError(f"No existe dataset F04 copiado en F06: {dataset_path}")
    dataset_key = str(dataset_path)

    models_manifest.append({
        "prediction_name": pred_name,
        "source_f05": v05,
        "source_f04": v04,
        "model_dir": str(model_dir),
        "model_h5": "model.h5",
        "model_summary": "model_summary.json",
        "dataset_path": dataset_key,
        "x_column": "OW_events",
        "y_column": "label",
    })

    if dataset_key not in seen_datasets:
        datasets_manifest.append({
            "dataset_path": dataset_key,
            "x_column": "OW_events",
            "y_column": "label",
            "source_f04": v04,
        })
        seen_datasets.add(dataset_key)

manifest = {
    "phase": PHASE,
    "variant": VARIANT,
    "f06_variant": parent_f06,
    "f06_path": str(f06_root),
    "created_at": datetime.now(timezone.utc).isoformat(),
    "runtime": {"host": HOST, "port": PORT},
    "models": models_manifest,
    "datasets": datasets_manifest,
}

manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
print("[OK] manifest.json generado en:", manifest_path)
print("[OK] modelos:", len(models_manifest), "| datasets:", len(datasets_manifest))


## 2) Run - servidor + cliente (batch) + logs crudos


In [None]:
# ===============================
# Flask server in a thread (for notebook use)
# ===============================
class ServerThread(Thread):
    """Daemon thread that runs a werkzeug server for the given app."""

    def __init__(self, app, host, port):
        super().__init__(daemon=True)
        # The server object is created here, in the constructor, not in run().
        self.server = make_server(host, port, app)
        # An application context is pushed from the constructing thread and
        # kept on the instance; nothing in this class pops it.
        # NOTE(review): confirm whether this push is still needed.
        self.ctx = app.app_context()
        self.ctx.push()

    def run(self):
        # Thread body: hand control to the server loop.
        self.server.serve_forever()

    def shutdown(self):
        # Ask the server to stop. The thread itself is not joined here.
        self.server.shutdown()


def build_runtime_server(manifest: dict) -> Flask:
    """Build the Flask app that serves every model listed in ``manifest``.

    For each entry of ``manifest["models"]`` the Keras model
    (``model_dir / model_h5``) and its summary JSON
    (``model_dir / model_summary``) are loaded. The summary supplies
    ``prediction_name``, the ``vectorization`` config and an optional
    ``threshold`` (default 0.5).

    The per-model vectorizer, including its vocabulary lookup table, is built
    once here instead of on every request.

    Routes:
        POST /infer   -- body ``{"window": [...]}``; replies with the window
                         and one ``{"prediction_name", "y_pred"}`` per model.
                         Replies 400 when the body has no ``window`` key.
        POST /control -- acknowledges ``{"cmd": "shutdown"}``. It only
                         replies; stopping the server is left to the caller.

    Raises:
        ValueError: if a model declares a vectorization type other than
            ``dense_bow`` or ``sequence``. This is raised while building,
            not on the first request.
    """
    app = Flask(__name__)

    def make_dense_bow(cfg):
        # Bag-of-words: one count per vocabulary position; events that are
        # not in the vocabulary are ignored.
        index = {int(ev): i for i, ev in enumerate(cfg["vocab"])}
        input_dim = int(cfg["input_dim"])

        def vectorize_dense_bow(window):
            X = np.zeros((1, input_dim), dtype=np.float32)
            for ev in window:
                i = index.get(int(ev))
                if i is not None:
                    X[0, i] += 1.0
            return X

        return vectorize_dense_bow

    def make_sequence(cfg):
        # Sequence of 1-based vocabulary positions; 0 is the padding value.
        index = {int(ev): i + 1 for i, ev in enumerate(cfg["vocab"])}
        max_len = int(cfg["max_len"])

        def vectorize_sequence(window):
            positions = (index.get(int(e)) for e in window)
            # Keep only known events, then the last max_len of them.
            seq = [p for p in positions if p is not None][-max_len:]
            X = np.zeros((1, max_len), dtype=np.int32)
            if seq:
                # Right-align the sequence, leaving zeros on the left.
                X[0, -len(seq):] = np.array(seq, dtype=np.int32)
            return X

        return vectorize_sequence

    def make_vectorizer(cfg):
        # The type is read from the "vectorization" key of the summary's
        # "vectorization" section.
        vtype = cfg.get("vectorization")
        if vtype == "dense_bow":
            return make_dense_bow(cfg)
        if vtype == "sequence":
            return make_sequence(cfg)
        raise ValueError(f"Vectorization no soportada: {vtype}")

    loaded_models = []
    for m in manifest["models"]:
        model_dir = Path(m["model_dir"])
        summary = json.loads((model_dir / m["model_summary"]).read_text(encoding="utf-8"))
        model = tf.keras.models.load_model(model_dir / m["model_h5"])

        loaded_models.append({
            "prediction_name": summary["prediction_name"],
            "model": model,
            "vectorize": make_vectorizer(summary["vectorization"]),
            "threshold": float(summary.get("threshold", 0.5)),
        })

    @app.route("/infer", methods=["POST"])
    def infer():
        payload = request.get_json(force=True)
        if not isinstance(payload, dict) or "window" not in payload:
            # Malformed request: answer 400 instead of failing with a 500.
            return jsonify({"error": "missing 'window'"}), 400
        window = payload["window"]
        results = []
        for m in loaded_models:
            X = m["vectorize"](window)
            y_prob = float(m["model"].predict(X, verbose=0).ravel()[0])
            y_pred = int(y_prob >= m["threshold"])
            results.append({"prediction_name": m["prediction_name"], "y_pred": y_pred})
        return jsonify({"window": window, "results": results})

    @app.route("/control", methods=["POST"])
    def control():
        payload = request.get_json(force=True)
        if isinstance(payload, dict) and payload.get("cmd") == "shutdown":
            return jsonify({"status": "shutting_down"})
        return jsonify({"status": "unknown_command"})

    return app


In [None]:
# ===============================
# Run: idempotent orchestration
# ===============================
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))

# Every run starts from empty output directories.
ensure_clean_dir(runtime_dir)
ensure_clean_dir(logs_dir)
ensure_clean_dir(metrics_dir)
ensure_clean_dir(report_dir)
figures_dir.mkdir(parents=True, exist_ok=True)

app = build_runtime_server(manifest)
server_thread = ServerThread(app, HOST, PORT)
server_thread.start()

base_url = f"http://{HOST}:{PORT}"
raw_parquet_path = logs_dir / "raw_predictions.parquet"
raw_csv_path = logs_dir / "raw_predictions.csv"

# From here on the server is running. Whatever happens (readiness failure,
# HTTP error, write error) it must be stopped, otherwise the port stays bound
# and re-running this cell fails.
try:
    # Readiness probe: up to 50 attempts, pausing after every failed one.
    for _ in range(50):
        try:
            probe = requests.post(f"{base_url}/infer", json={"window": []}, timeout=2)
        except requests.RequestException:
            probe = None  # not accepting connections yet
        if probe is not None and probe.status_code == 200:
            break
        time.sleep(0.1)
    else:
        raise RuntimeError("El servidor no ha arrancado correctamente")

    raw_rows = []
    for ds in manifest["datasets"]:
        df = pd.read_parquet(ds["dataset_path"])
        xcol = ds["x_column"]
        for window in df[xcol]:
            # List cells read from parquet may arrive as numpy arrays, which
            # the JSON encoder used by requests rejects; send plain lists.
            if hasattr(window, "tolist"):
                window = window.tolist()
            resp = requests.post(f"{base_url}/infer", json={"window": window}, timeout=30)
            resp.raise_for_status()
            data = resp.json()
            # Compact JSON of the echoed window is the join key used later.
            window_str = json.dumps(data["window"], separators=(",", ":"), ensure_ascii=False)
            for rr in data["results"]:
                raw_rows.append({
                    "window": window_str,
                    "prediction_name": rr["prediction_name"],
                    "y_pred": int(rr["y_pred"]),
                })

    # Explicit columns keep the log schema stable even when no rows were produced.
    raw_df = pd.DataFrame(raw_rows, columns=["window", "prediction_name", "y_pred"])
    raw_df.to_parquet(raw_parquet_path, index=False)
    raw_df.to_csv(raw_csv_path, index=False)

    requests.post(f"{base_url}/control", json={"cmd": "shutdown"}, timeout=5)
finally:
    server_thread.shutdown()
    server_thread.join(timeout=5)

print("[OK] logs crudos guardados:")
print(" -", raw_parquet_path)
print(" -", raw_csv_path)
print("[OK] filas:", len(raw_df))


## 3) Postprocess - metricas por modelo (`prediction_name`) + `no_ref_*`


In [None]:
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
raw_df = pd.read_parquet(raw_parquet_path)

# Fast lookup: (prediction_name, window_str) -> y_pred (first occurrence wins)
pred_map = {}
for row in raw_df.itertuples(index=False):
    pred_map.setdefault((row.prediction_name, row.window), int(row.y_pred))

# Figures are written below; make sure their directory exists.
figures_dir.mkdir(parents=True, exist_ok=True)

metrics_rows = []

for m in manifest["models"]:
    pred_name = m["prediction_name"]
    dataset_path = Path(m["dataset_path"])
    xcol = m["x_column"]
    ycol = m["y_column"]

    df = pd.read_parquet(dataset_path)

    ref_windows = set()
    tp = tn = fp = fn = 0

    # Read the two columns by name, so that any column name works (not only
    # names that are valid Python identifiers).
    for window, y_true in zip(df[xcol], df[ycol]):
        # List cells read from parquet may arrive as numpy arrays, which
        # json.dumps rejects; convert to a plain list first.
        if hasattr(window, "tolist"):
            window = window.tolist()
        y_true = int(y_true)
        # Same compact JSON form as the key written to the raw log.
        window_str = json.dumps(window, separators=(",", ":"), ensure_ascii=False)
        ref_windows.add(window_str)

        y_pred = pred_map.get((pred_name, window_str))
        if y_pred is None:
            # No logged prediction for this reference window.
            continue

        if y_true == 1 and y_pred == 1:
            tp += 1
        elif y_true == 0 and y_pred == 0:
            tn += 1
        elif y_true == 0 and y_pred == 1:
            fp += 1
        elif y_true == 1 and y_pred == 0:
            fn += 1

    # Predictions of this model whose window is absent from its reference dataset.
    model_preds = raw_df[raw_df["prediction_name"] == pred_name]
    without_ref = ~model_preds["window"].isin(ref_windows)
    no_ref_pred_1 = int((without_ref & (model_preds["y_pred"] == 1)).sum())
    no_ref_pred_0 = int((without_ref & (model_preds["y_pred"] == 0)).sum())
    no_ref_total = int(no_ref_pred_0 + no_ref_pred_1)

    # Ratios fall back to 0.0 when their denominator is zero.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0

    metrics_rows.append({
        "prediction_name": pred_name,
        "source_f05": m["source_f05"],
        "source_f04": m["source_f04"],
        "tp": tp, "tn": tn, "fp": fp, "fn": fn,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "no_ref_pred_1": no_ref_pred_1,
        "no_ref_pred_0": no_ref_pred_0,
        "no_ref_total": no_ref_total,
    })

    # Confusion matrix figure: rows are the true label, columns the prediction.
    cm = np.array([[tn, fp], [fn, tp]], dtype=int)
    plt.figure(figsize=(4, 4))
    plt.imshow(cm)
    plt.title(f"Confusion - {pred_name}")
    plt.xticks([0, 1], ["Pred 0", "Pred 1"])
    plt.yticks([0, 1], ["True 0", "True 1"])
    for (i, j), v in np.ndenumerate(cm):
        plt.text(j, i, str(v), ha="center", va="center")
    plt.tight_layout()
    plt.savefig(figures_dir / f"confusion_{pred_name}.png")
    plt.close()

metrics_df = pd.DataFrame(metrics_rows)
metrics_csv_path = metrics_dir / "metrics_per_model.csv"
metrics_df.to_csv(metrics_csv_path, index=False)

print("[OK] metricas guardadas:", metrics_csv_path)
metrics_df


## 4) Report - HTML + figuras (`report/figures/`)


In [None]:
def html_escape(s: str) -> str:
    """Return ``s`` with ``&``, ``<``, ``>``, ``"`` and ``'`` replaced by HTML entities."""
    entities = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&#39;",
    }
    # One pass over the text, so the "&" of a produced entity is never re-escaped.
    return s.translate(str.maketrans(entities))

from urllib.parse import quote

# Metrics table. DataFrame.to_html is called with its defaults apart from index=False.
rows_html = metrics_df.to_html(index=False)

# One heading + image per model whose confusion figure exists on disk.
imgs = []
for m in manifest["models"]:
    pred_name = m["prediction_name"]
    img_path = figures_dir / f"confusion_{pred_name}.png"
    if not img_path.exists():
        continue
    # The name goes into an attribute: percent-encode it as a URL path and
    # escape it for HTML, so quotes or reserved characters in a prediction
    # name cannot break the tag (the heading was already escaped).
    img_src = html_escape(quote(f"figures/confusion_{pred_name}.png"))
    imgs.append(f"<h3>{html_escape(pred_name)}</h3><img src='{img_src}' style='max-width:420px;'/>")

report_html = f"""<!doctype html>
<html>
<head>
  <meta charset="utf-8"/>
  <title>F07 Report - {html_escape(VARIANT)}</title>
  <style>
    body {{ font-family: Arial, sans-serif; margin: 24px; }}
    table {{ border-collapse: collapse; }}
    th, td {{ border: 1px solid #ddd; padding: 6px 10px; }}
    th {{ background: #f3f3f3; }}
    code {{ background:#f7f7f7; padding:2px 4px; }}
  </style>
</head>
<body>
  <h1>F07 - Deploy & Runtime Validation</h1>
  <p><b>Variant:</b> <code>{html_escape(VARIANT)}</code></p>
  <p><b>Parent F06:</b> <code>{html_escape(parent_f06)}</code></p>
  <p><b>Generated:</b> {datetime.now(timezone.utc).isoformat()}</p>

  <h2>Metrics per model</h2>
  {rows_html}

  <h2>Confusion matrices</h2>
  {''.join(imgs)}
</body>
</html>"""

report_path = report_dir / "report.html"
report_path.write_text(report_html, encoding="utf-8")
print("[OK] report:", report_path)


## 5) Trazabilidad - `07_deployrun_metadata.json`


In [None]:
# Record traceability metadata for this run: the manifest is the only input,
# and the three output directories are passed as strings.
_f07_outputs = [str(out_dir) for out_dir in (logs_dir, metrics_dir, report_dir)]

write_metadata(
    stage=PHASE,
    variant=VARIANT,
    parent_variant=parent_f06,
    inputs=[str(manifest_path)],
    outputs=_f07_outputs,
    params=f07_params,
    metadata_path=metadata_path,
)

print("[OK] metadata:", metadata_path)


## 6) Resumen de artefactos generados


In [None]:
# Label followed by the path(s) printed on that line.
_artifact_lines = (
    ("manifest :", manifest_path),
    ("logs     :", raw_parquet_path, raw_csv_path),
    ("metrics  :", metrics_csv_path),
    ("report   :", report_path),
)

print("== Artefactos F07 ==")
for _label, *_paths in _artifact_lines:
    print(_label, *_paths)
