CELDA 0 ‚Äî T√≠tulo (Markdown)

# Fase 03 ‚Äî PrepareWindowsDS

Generaci√≥n del **dataset final de ventanas temporales materializadas**.

Cada fila del dataset contiene:
- `OW_events`: eventos observados (ventana de observaci√≥n)
- `PW_events`: eventos a predecir (ventana de predicci√≥n)

Este notebook reproduce **exactamente** la l√≥gica de `03_preparewindowsds.py`.


## 1. Imports y bootstrap

In [1]:
import os
import sys
from pathlib import Path
from bisect import bisect_left
from datetime import datetime, timezone
from time import perf_counter
import json
import yaml

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Bootstrap para localizar el proyecto
NOTEBOOK_PATH = Path.cwd().resolve()
ROOT = NOTEBOOK_PATH
for _ in range(10):
    if (ROOT / "mlops4ofp").exists():
        break
    ROOT = ROOT.parent
else:
    raise RuntimeError("No se pudo localizar el project root")

sys.path.insert(0, str(ROOT))
print("Project root:", ROOT)


Project root: /Users/juancarlosduenaslopez/Documents/mlops/mlops4ofp


## 2. Contexto de ejecuci√≥n

In [3]:
# --- BOOTSTRAP M√çNIMO (antes de cualquier import mlops4ofp) ---
execution_dir = Path.cwd().resolve()
current = execution_dir

for _ in range(10):
    if (current / "mlops4ofp").exists():
        project_root = current
        break
    current = current.parent
else:
    raise RuntimeError("‚ùå No se pudo localizar project_root")

sys.path.insert(0, str(project_root))

print(f"üìÅ Project root a√±adido a PYTHONPATH: {project_root}")
from mlops4ofp.tools.run_context import (
    detect_execution_dir,
    detect_project_root,
    assemble_run_context,
    build_phase_outputs,

)

PHASE = "03_preparewindowsds"

execution_dir = detect_execution_dir()
project_root = detect_project_root(execution_dir)

ACTIVE_VARIANT = None  # se detecta autom√°ticamente


üìÅ Project root a√±adido a PYTHONPATH: /Users/juancarlosduenaslopez/Documents/mlops/mlops4ofp


In [4]:
# Selecci√≥n de variante (requiere VARIANT)
variants_dir = project_root / "executions" / PHASE
variants = sorted(p.name for p in variants_dir.iterdir() if p.is_dir())

env_variant = os.getenv("VARIANT") or os.getenv("ACTIVE_VARIANT")

#env_variant = "v001"  # Para forzar una variante concreta (descomentar y asignar la variante deseada)

if not env_variant:
    raise RuntimeError(
        "‚ùå VARIANT no definido. Ejecuta el notebook con: make nb3-run VARIANT=v001"
    )
ACTIVE_VARIANT = env_variant

In [5]:
# Para forzar una variante concreta, descomentar la siguiente l√≠nea e indicar la variante deseada (ej: "v903")
#ACTIVE_VARIANT  = "v904"

variant_root = variants_dir / ACTIVE_VARIANT

ctx = assemble_run_context(
    project_root=project_root,
    phase=PHASE,
    variant=ACTIVE_VARIANT,
    variant_root=variant_root,
    execution_dir=execution_dir,
)

print("Variante activa:", ACTIVE_VARIANT)

OUTPUTS = build_phase_outputs(
    variant_root=variant_root,
    phase=ctx["phase"],
)

ctx["outputs"] = OUTPUTS  # para que generate_figures_and_report use ctx["outputs"]["report"]





Variante activa: v100


## 3. Carga de par√°metros F03

In [6]:
with open(variant_root / "params.yaml", "r", encoding="utf-8") as f:
    params = yaml.safe_load(f)

OW = int(params["OW"])
LT = int(params["LT"])
PW = int(params["PW"])
nan_strategy = params.get("nan_strategy", "discard")
window_strategy = params.get("window_strategy", "synchro")
parent_phase = params.get("parent_phase", "02_prepareeventsds")
parent_variant = params["parent_variant"]

print("OW, LT, PW =", OW, LT, PW)
print("window_strategy =", window_strategy)
print("nan_strategy =", nan_strategy)
print("parent_phase =", parent_phase)
print("parent_variant =", parent_variant)



OW, LT, PW = 600 100 100
window_strategy = synchro
nan_strategy = discard
parent_phase = 02_prepareeventsds
parent_variant = v010


## 4. Resoluci√≥n de Tu desde F02

In [7]:
with open(
    project_root
    / "executions"
    / parent_phase
    / parent_variant
    / f"{parent_phase}_metadata.json",
    "r",
    encoding="utf-8",
) as f:
    meta_f02 = json.load(f)

Tu_raw = params.get("Tu", None)
if Tu_raw is not None:
    Tu = float(Tu_raw)
else:
    Tu_f02 = meta_f02.get("Tu", None)
    if Tu_f02 is None:
        raise RuntimeError(
            "No se pudo determinar Tu: es None en F03 params y en F02 metadata"
        )
    Tu = float(Tu_f02)

print(f"[F03] Tu resuelto = {Tu} (origen: {'params' if Tu_raw is not None else 'F02_metadata'})", flush=True)
print("Tu =", Tu)

ctx['variant_params'] = params
ctx['variant_params']['Tu'] = Tu

[F03] Tu resuelto = 10.0 (origen: F02_metadata)


Tu = 10.0


## 5. Carga del dataset F02

In [8]:
input_dataset = (
    project_root
    / "executions"
    / parent_phase
    / parent_variant
    / f"{parent_phase}_dataset.parquet"
 )

print(f"[F03] leyendo dataset F02: {input_dataset}", flush=True)
t_read_start = perf_counter()
df = pq.read_table(input_dataset).to_pandas(
    split_blocks=True,
    self_destruct=True,
 )
t_read_elapsed = perf_counter() - t_read_start
print(f"[F03] dataset F02 cargado en {t_read_elapsed:,.1f}s", flush=True)

if not df["segs"].is_monotonic_increasing:
    df = df.sort_values("segs", kind="mergesort").reset_index(drop=True)

print("[F03] preparando arrays times/events...", flush=True)
t_arr_start = perf_counter()
times = df["segs"].to_numpy(dtype=np.int64, copy=False)
events = df["events"].to_numpy()
lengths = np.fromiter((len(evs) for evs in events), dtype=np.int64, count=len(events))
offsets = np.empty(len(events) + 1, dtype=np.int64)
offsets[0] = 0
np.cumsum(lengths, out=offsets[1:])
events_flat = [ev for evs in events for ev in evs]
times_flat = [t for t, evs in zip(times, events) for _ in evs]
has_event = lengths > 0
t_arr_elapsed = perf_counter() - t_arr_start
print(f"[F03] arrays listos en {t_arr_elapsed:,.1f}s | eventos totales: {len(events_flat):,}", flush=True)

print("F02 cargado:", len(df), "filas")

[F03] leyendo dataset F02: /Users/juancarlosduenaslopez/Documents/mlops/mlops4ofp/executions/02_prepareeventsds/v010/02_prepareeventsds_dataset.parquet


[F03] dataset F02 cargado en 0.7s


[F03] preparando arrays times/events...


[F03] arrays listos en 2.7s | eventos totales: 304,389


F02 cargado: 3887242 filas


CELDA 6 ‚Äî Cat√°logo de eventos NaN

In [9]:
with open(
    project_root
    / "executions"
    / parent_phase
    / parent_variant
    / f"{parent_phase}_event_catalog.json",
    "r",
    encoding="utf-8",
) as f:
    catalog = json.load(f)

nan_codes = {
    code for name, code in catalog.items()
    if name.endswith("_NaN_NaN")
}

t_nan_start = perf_counter()
has_nan = np.array(
    [any(ev in nan_codes for ev in evs) for evs in events],
    dtype=bool,
 )
nan_prefix = np.cumsum(has_nan, dtype=np.int64)
t_nan_elapsed = perf_counter() - t_nan_start
print(f"[F03] has_nan + prefix en {t_nan_elapsed:,.1f}s", flush=True)

[F03] has_nan + prefix en 1.8s


CELDA 7 ‚Äî Definici√≥n de ventanas

In [10]:
OW_end = OW
PW_start = OW + LT
PW_end = OW + LT + PW


In [11]:
def window_start_iterator():
    """
    Iterador de inicios de ventana (t0) seg√∫n la estrategia.

    Reglas:
    - synchro  : todos los Tu
    - withinPW : todos los Tu
    - asynPW   : todos los Tu
    - asynOW   : solo bins con ‚â•1 evento en OW
    """
    t_start = times[0]

    # geometr√≠a local (evita dependencias de orden de celdas)
    OW_span_local = OW * Tu
    PW_start_local = (OW + LT) * Tu
    PW_span_local = PW * Tu
    total_span_local = PW_start_local + PW_span_local

    t_end = times[-1] - total_span_local

    if window_strategy in ("synchro", "withinPW", "asynPW"):
        t = t_start
        while t <= t_end:
            yield t
            t += Tu

    elif window_strategy == "asynOW":
        active_bins = np.unique(
            ((times[lengths > 0] - times[0]) // Tu).astype(np.int64)
        )
        for b in active_bins:
            t = t_start + b * Tu
            if t <= t_end:
                yield t

    else:
        raise ValueError(f"window_strategy no soportada: {window_strategy}")


In [12]:
def idx_range(t0, t1):
    return bisect_left(times, t0), bisect_left(times, t1)

def has_nan_in_range(i0, i1):
    if i0 >= i1:
        return False
    return (nan_prefix[i1 - 1] - (nan_prefix[i0 - 1] if i0 > 0 else 0)) > 0

CELDA 8 ‚Äî Generaci√≥n de ventanas materializadas

In [13]:
output_path = variant_root / f"{PHASE}_dataset.parquet"

schema = pa.schema([
    ("OW_events", pa.list_(pa.int32())),
    ("PW_events", pa.list_(pa.int32())),
])
writer = pq.ParquetWriter(output_path, schema, compression="snappy")
BATCH = 100
rows = []

total = 0
kept = 0
LOG_EVERY = 100_000
t_start = perf_counter()
t_loop_start = t_start

for t0 in window_start_iterator():
    total += 1

    i_ow_0, i_ow_1 = idx_range(t0, t0 + OW * Tu)
    i_pw_0, i_pw_1 = idx_range(
        t0 + PW_start * Tu,
        t0 + PW_end * Tu,
    )
    i_pw_start0, i_pw_start1 = idx_range(
        t0 + PW_start * Tu,
        t0 + (PW_start + 1) * Tu,
    )

    ow_len = i_ow_1 - i_ow_0
    pw_len = i_pw_1 - i_pw_0

    if ow_len == 0 and pw_len == 0:
        continue

    if nan_strategy == "discard":
        if has_nan_in_range(i_ow_0, i_ow_1):
            continue
        if has_nan_in_range(i_pw_0, i_pw_1):
            continue

    ow_start = offsets[i_ow_0]
    ow_end = offsets[i_ow_1]
    pw_start = offsets[i_pw_0]
    pw_end = offsets[i_pw_1]

    ow_events = events_flat[ow_start:ow_end]
    pw_events = events_flat[pw_start:pw_end]

    if len(ow_events) == 0 and len(pw_events) == 0:
        continue

    if window_strategy == "asynOW" and len(ow_events) == 0:
        continue
    
    if window_strategy == "withinPW" and len(pw_events) == 0:
        continue

    if window_strategy == "asynPW" and i_pw_start0 == i_pw_start1:
        continue

    rows.append({
        "OW_events": ow_events,
        "PW_events": pw_events,
    })
    kept += 1

    if total % LOG_EVERY == 0:
        elapsed = perf_counter() - t_start
        print(
            f"[F03] ventanas: {total:,} | "
            f"escritas: {kept:,} | "
            f"tiempo: {elapsed:,.1f}s",
            flush=True,
        )

    if len(rows) >= BATCH:
        writer.write_table(pa.Table.from_pylist(rows, schema))
        rows.clear()

if rows:
    writer.write_table(pa.Table.from_pylist(rows, schema))

writer.close()

elapsed_total = perf_counter() - t_start
loop_elapsed = perf_counter() - t_loop_start
print("Ventanas totales:", total)
print("Ventanas v√°lidas :", kept)
print(f"Tiempo total     : {elapsed_total:,.1f}s")
print(f"Tiempo loop      : {loop_elapsed:,.1f}s")

[F03] ventanas: 100,000 | escritas: 58,538 | tiempo: 1.8s


[F03] ventanas: 400,000 | escritas: 189,836 | tiempo: 7.3s


[F03] ventanas: 700,000 | escritas: 329,531 | tiempo: 12.8s


[F03] ventanas: 800,000 | escritas: 382,652 | tiempo: 14.6s


[F03] ventanas: 900,000 | escritas: 453,379 | tiempo: 16.6s


[F03] ventanas: 1,000,000 | escritas: 553,379 | tiempo: 18.6s


[F03] ventanas: 1,100,000 | escritas: 653,379 | tiempo: 20.6s


[F03] ventanas: 1,300,000 | escritas: 813,254 | tiempo: 24.6s


[F03] ventanas: 1,400,000 | escritas: 910,362 | tiempo: 27.0s


[F03] ventanas: 1,500,000 | escritas: 1,007,266 | tiempo: 29.6s


[F03] ventanas: 1,600,000 | escritas: 1,101,021 | tiempo: 31.7s


[F03] ventanas: 1,700,000 | escritas: 1,192,552 | tiempo: 34.0s


[F03] ventanas: 1,800,000 | escritas: 1,292,500 | tiempo: 36.1s


[F03] ventanas: 1,900,000 | escritas: 1,391,100 | tiempo: 38.3s


[F03] ventanas: 2,000,000 | escritas: 1,458,949 | tiempo: 40.2s


[F03] ventanas: 2,400,000 | escritas: 1,631,930 | tiempo: 47.4s


[F03] ventanas: 2,500,000 | escritas: 1,684,984 | tiempo: 49.3s


[F03] ventanas: 2,700,000 | escritas: 1,820,708 | tiempo: 53.1s


[F03] ventanas: 2,800,000 | escritas: 1,920,624 | tiempo: 55.7s


[F03] ventanas: 2,900,000 | escritas: 2,020,313 | tiempo: 58.2s


[F03] ventanas: 3,000,000 | escritas: 2,120,313 | tiempo: 60.6s


[F03] ventanas: 3,100,000 | escritas: 2,219,564 | tiempo: 62.8s


[F03] ventanas: 3,200,000 | escritas: 2,315,546 | tiempo: 65.1s


[F03] ventanas: 3,400,000 | escritas: 2,480,895 | tiempo: 69.2s


[F03] ventanas: 3,600,000 | escritas: 2,644,014 | tiempo: 73.0s


[F03] ventanas: 3,700,000 | escritas: 2,734,072 | tiempo: 75.0s


[F03] ventanas: 3,800,000 | escritas: 2,823,598 | tiempo: 76.9s


[F03] ventanas: 3,900,000 | escritas: 2,905,681 | tiempo: 78.8s


Ventanas totales: 3947464
Ventanas v√°lidas : 2943521
Tiempo total     : 79.8s
Tiempo loop      : 79.8s


CELDA 9 ‚Äî Escritura del dataset FINAL

In [14]:
print("Dataset F03 generado:", output_path)

Dataset F03 generado: /Users/juancarlosduenaslopez/Documents/mlops/mlops4ofp/executions/03_preparewindowsds/v100/03_preparewindowsds_dataset.parquet


CELDA 10 ‚Äî Validaci√≥n r√°pida

In [15]:
df_check = pd.read_parquet(output_path)

assert df_check.shape[1] == 2
def is_sequence(x):
    return isinstance(x, (list, tuple)) or hasattr(x, "__iter__")

assert all(is_sequence(x) for x in df_check["OW_events"][:10])
assert all(is_sequence(x) for x in df_check["PW_events"][:10])
print("Comprobaci√≥n de lectura exitosa")

Comprobaci√≥n de lectura exitosa


In [16]:
print("[NB-F03] Ejecutando checks formales F03 sobre el dataset final...")

df_check = pd.read_parquet(output_path)

# Invariante global F03
bad = (
    (df_check["OW_events"].apply(len) == 0)
    & (df_check["PW_events"].apply(len) == 0)
).sum()
assert bad == 0, f"‚ùå ERROR: hay {bad} pares con OW y PW vac√≠as"
print("‚úî CHECK: no hay pares OW+PW vac√≠os")

# Checks espec√≠ficos por estrategia
if window_strategy == "withinPW":
    bad = (df_check["PW_events"].apply(len) == 0).sum()
    assert bad == 0, f"‚ùå ERROR: withinPW tiene {bad} PW vac√≠as"
    print("‚úî CHECK: withinPW cumple PW no vac√≠a")

if window_strategy == "asynPW":
    print("‚ÑπÔ∏è CHECK: asynPW usa evento al inicio de PW")

print("‚úî TODOS LOS CHECKS F03 SUPERADOS")


[NB-F03] Ejecutando checks formales F03 sobre el dataset final...


‚úî CHECK: no hay pares OW+PW vac√≠os
‚úî TODOS LOS CHECKS F03 SUPERADOS


In [17]:
# Intentar importar el m√≥dulo de reporte si existe

import mlops4ofp.tools.html_reports.html03 as preparewindows_report03
# import importlib
# importlib.reload(preparewindows_report03)


df_windows = pd.read_parquet(ctx["outputs"]["dataset"])
print(df_windows.head())

preparewindows_report03.generate_html_report(
    ctx=ctx,
    df_windows=df_windows,
    catalog=catalog,
)


  OW_events     PW_events
0        []          [27]
1        []      [27, 30]
2        []  [27, 30, 27]
3        []  [27, 30, 27]
4        []  [27, 30, 27]
[preparewindowsds] Generando informe HTML final...


[OK] Informe HTML generado en /Users/juancarlosduenaslopez/Documents/mlops/mlops4ofp/executions/03_preparewindowsds/v100/03_preparewindowsds_report.html
