# Capture Pipeline: End-to-End Exploration

This notebook demonstrates a minimal end-to-end run of the data capture pipeline, including:

- Running the capture via the CLI entrypoint using a Hydra config (`conf/data_capture`).
- Validating emitted artifacts with `validate_run_artifacts`.
- Inspecting provenance (`manifest.json`) including git SHA and config hash.
- Loading JSONL/Parquet for quick exploratory analysis and a small plot.

> Runtime profile: small grid (8x8), 2 episodes of at most 20 steps each, and a fixed seed (123) for fast, repeatable execution.


In [None]:
from pathlib import Path
import json, gzip, io

# Resolve the repo root heuristically for local runs.
# Probe the working directory first, then its parent (the common case when the
# notebook is opened directly); the first one containing `src` wins. When
# neither does, stay on the working directory.
repo = next(
    (
        candidate
        for candidate in (Path.cwd(), Path.cwd().parent)
        if (candidate / "src").exists()
    ),
    Path.cwd(),
)

# All capture output for this notebook is written below <repo>/notebooks/_data.
out_root = repo.joinpath("notebooks", "_data")
out_root.mkdir(parents=True, exist_ok=True)
out_root

In [None]:
# Run capture via CLI main() using Hydra config
from plume_nav_sim.cli.capture import main as capture_main

# Named CLI options; flattened below into consecutive "--flag", "value" tokens.
cli_options = {
    "--output": str(out_root),
    "--experiment": "nb_demo",
    "--seed": "123",
    "--config-name": "data_capture/config",
}
# key=value overrides that keep the run small (2 episodes, 8x8 grid, 20 steps).
config_overrides = ["episodes=2", "env.grid_size=[8,8]", "env.max_steps=20"]

argv = [token for option in cli_options.items() for token in option]
argv.extend(config_overrides)
argv.append("--parquet")  # write parquet if pyarrow available

# Stop the notebook here if the CLI does not report success (return code 0).
rc = capture_main(argv)
assert rc == 0, f"capture CLI failed with code {rc}"
rc

In [None]:
# Discover latest run directory
exp_dir = out_root / "nb_demo"
# Guard against a missing experiment directory so that the assertion below
# reports the problem, instead of iterdir() raising a bare FileNotFoundError.
candidates = (
    [p for p in exp_dir.iterdir() if p.is_dir() and p.name.startswith("run-")]
    if exp_dir.is_dir()
    else []
)
assert candidates, f"No run directory found in {exp_dir}"
# Newest by modification time; the directory name breaks ties so the choice is
# deterministic even when two runs share the same mtime.
run_dir = max(candidates, key=lambda p: (p.stat().st_mtime, p.name))
run_dir

In [None]:
# Validate artifacts with Pandera
from plume_nav_sim.data_capture.validate import validate_run_artifacts

# Run the project's validator against the run directory and display whatever
# it returns as the cell output.
# NOTE(review): the structure of `report` is not visible from this notebook, so
# nothing here asserts on it -- consider failing the notebook on a bad report
# once its shape is confirmed.
report = validate_run_artifacts(run_dir)
report

In [None]:
# Inspect provenance manifest: config hash, env config, git SHA
manifest_path = run_dir / "manifest.json"
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))

# Show only the provenance fields of interest; a key absent from the manifest
# is displayed as None rather than raising.
provenance_fields = ("experiment", "run_id", "git_sha", "config_hash", "schema_version")
{field: manifest.get(field) for field in provenance_fields}

In [None]:
# Load episodes using pandas if available; fallback to JSONL parsing
episodes_path = run_dir / "episodes.parquet"
steps_path = run_dir / "steps.parquet"

# pandas is optional: record whether it imported instead of failing the cell.
try:
    import pandas as pd  # type: ignore
except Exception:
    use_pandas = False
else:
    use_pandas = True

if not use_pandas:
    # Lightweight fallback: parse the gzipped JSONL into a list of dicts,
    # skipping blank lines.
    with gzip.open(run_dir / "episodes.jsonl.gz", "rt", encoding="utf-8") as fh:
        episodes_df = [json.loads(line) for line in fh if line.strip()]
elif episodes_path.exists():
    # Prefer the Parquet export when the capture produced one.
    episodes_df = pd.read_parquet(episodes_path)
else:
    # No Parquet file: read the gzipped JSONL through pandas instead.
    episodes_df = pd.read_json(
        run_dir / "episodes.jsonl.gz", lines=True, compression="gzip"
    )

type(episodes_df)

In [None]:
# Simple exploratory plot: total steps per episode (if pandas/matplotlib available)
#
# `episodes_df` is a plain list only when pandas could not be imported, so the
# text fallback must be chosen BEFORE importing any optional dependency.
# Otherwise the failing import would skip the cell and the fallback would
# never run.
try:
    if isinstance(episodes_df, list):
        # Fallback: basic text output
        print("Episodes:", len(episodes_df))
        print("Keys:", list(episodes_df[0].keys()) if episodes_df else [])
    else:
        # Imported here so that a missing matplotlib is reported as a skipped
        # plot rather than breaking the notebook.
        import matplotlib.pyplot as plt

        # DataFrame.plot returns the Axes it drew on; label that Axes
        # explicitly instead of relying on pyplot's implicit "current axes".
        ax = (
            episodes_df[["episode_id", "total_steps"]]
            .set_index("episode_id")
            .plot(kind="bar", legend=False)
        )
        ax.set_title("Total Steps per Episode")
        ax.set_ylabel("steps")
        ax.figure.tight_layout()
except Exception as e:
    # Best-effort cell: plotting problems (missing library, missing columns)
    # are reported but do not stop the notebook.
    print("Plotting skipped:", e)

## Notes

- This notebook uses Hydra configs under `conf/data_capture`.
- Provenance is captured in `manifest.json` (config hash, git SHA, schema version).
- For faster loading, install the `data` extras to enable Parquet export/reads.

Render to HTML for docs:
```bash
make nb-render
```
