# Runner Demo (setup-only parity)

This notebook mirrors the deterministic temporal-gradient demo. The runner is used only to simplify setup (building the environment and policy); stepping is done manually and matches the non-runner demo: `policy.select_action` → `env.step`.

In [None]:
# Ensure the repo's 'src' directory is importable when running from notebooks/.
import sys
import pathlib

# Use the current directory if it contains 'src'; otherwise assume the notebook
# was launched one level down (e.g. from notebooks/) and use the parent instead.
repo_root = pathlib.Path.cwd()
if not (repo_root / "src").exists():
    repo_root = repo_root.parent

# Move-to-front rather than a blind insert: re-running this cell keeps exactly
# one entry for 'src' on sys.path while still giving it top import priority.
src_dir = str(repo_root / "src")
while src_dir in sys.path:
    sys.path.remove(src_dir)
sys.path.insert(0, src_dir)

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats

# Explicitly select the matplotlib_inline backend (presumably so figures are
# rendered into the notebook output rather than a GUI window).
mpl.use("module://matplotlib_inline.backend_inline")
# Request PNG as the inline figure format.
set_matplotlib_formats("png")
from IPython.display import display

import plume_nav_sim as pns
from plume_nav_sim.compose import SimulationSpec, PolicySpec, prepare
from plume_nav_sim.policies import TemporalDerivativeDeterministicPolicy

In [None]:
# Describe the run declaratively, then let compose.prepare build the env and the
# deterministic TD policy. Stepping is done by hand afterwards for parity with demos.
td_policy_spec = PolicySpec(
    builtin="deterministic_td",
    kwargs={"threshold": 1e-6, "alternate_cast": True},
)
spec = SimulationSpec(
    grid_size=(64, 64),
    source_location=(48, 48),
    start_location=(16, 16),
    max_steps=500,
    render=True,
    policy=td_policy_spec,
    seed=123,
)
env, policy = prepare(spec)

# Manual loop: policy.select_action → env.step (no runner.stream), matching non-runner demo
obs, info = env.reset(seed=spec.seed)
policy.reset(seed=spec.seed)

# Seed the trajectory with the reset position (when the env reports one) so the
# plotted "start" marker is the true starting cell, not the cell after step 1.
positions = []
start_pos = info.get("agent_xy") if isinstance(info, dict) else None
if start_pos is not None:
    positions.append(tuple(start_pos))

total_reward = 0.0
# Fall back to the spec's step budget if the env object does not expose max_steps.
step_budget = getattr(env, "max_steps", spec.max_steps)
for _ in range(step_budget):
    # explore=False keeps action selection on the non-exploratory path.
    a = policy.select_action(obs, explore=False)
    obs, reward, term, trunc, step_info = env.step(a)
    total_reward += float(reward)
    # Only record a position when the step info actually carries one.
    pos = step_info.get("agent_xy") if isinstance(step_info, dict) else None
    if pos is not None:
        positions.append(tuple(pos))
    # Stop as soon as the episode terminates or is truncated.
    if term or trunc:
        break

# Final frame and overlay. The env (and the figure) are released in `finally`
# blocks so a rendering or plotting error does not leave them open.
try:
    frame = env.render("rgb_array")
    # Prefer what the env reports; otherwise fall back to the values the env was
    # built from, so the source marker is never drawn at a guessed location.
    grid_w, grid_h = getattr(env, "grid_size", spec.grid_size)
    sx, sy = getattr(env, "source_location", spec.source_location)

    fig, ax = plt.subplots(figsize=(6, 6))
    try:
        ax.imshow(frame)
        ax.set_xlim(0, grid_w)
        ax.set_ylim(grid_h, 0)  # invert y so row 0 is at top
        if positions:
            xs = [p[0] for p in positions]
            ys = [p[1] for p in positions]
            ax.plot(xs, ys, "-o", color="yellow", markersize=2, linewidth=1)
            ax.scatter([xs[0]], [ys[0]], c="lime", s=36, marker="^", label="start")
            ax.scatter([xs[-1]], [ys[-1]], c="magenta", s=30, label="end")
        # Hollow red square marks the odor source.
        ax.scatter(
            [sx],
            [sy],
            marker="s",
            s=60,
            facecolors="none",
            edgecolors="red",
            linewidths=1.5,
            label="source",
        )
        ax.legend(loc="upper right")
        ax.set_title("Final frame with agent trajectory")
        display(fig)
    finally:
        # Close explicitly so the figure is not kept alive after it is displayed.
        plt.close(fig)
    print("Total reward:", total_reward)
finally:
    env.close()