# Temporal-Gradient Policy Demo (Deterministic vs Stochastic)

This notebook demonstrates the Policy abstractions with an oriented agent that follows the temporal gradient of odor concentration. Set `policy_kind` in the policy cell below to choose between:
- TemporalDerivativeDeterministicPolicy
- TemporalDerivativePolicy (epsilon-greedy stochastic)


In [None]:
import os

# Drop any MPLBACKEND setting from the process environment before selecting the
# inline backend — presumably so an externally configured backend cannot
# interfere with inline rendering (NOTE(review): confirm this is the intent).
os.environ.pop("MPLBACKEND", None)
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

import plume_nav_sim
from plume_nav_sim.policies import (
    TemporalDerivativeDeterministicPolicy,
    TemporalDerivativePolicy,
)

In [None]:
# Environment configuration: tunable episode parameters first, then construction.
seed = 123
max_steps = 500
goal_radius = 1.0
grid_size = (64, 64)
source_location = (48, 48)
start_location = (16, 16)

# Gather every make_env argument in one mapping so the full setup can be read
# (and tweaked) in a single place.
env_kwargs = dict(
    grid_size=grid_size,
    source_location=source_location,
    start_location=start_location,
    goal_radius=goal_radius,
    max_steps=max_steps,
    plume_sigma=20.0,
    action_type="oriented",
    observation_type="concentration",
    reward_type="step_penalty",
    render_mode=None,
)
env = plume_nav_sim.make_env(**env_kwargs)

# The plume field is read through private attributes of the environment; each
# lookup defaults to None so a missing attribute surfaces in the assert below
# rather than as an AttributeError.
core_env = getattr(env, "_core_env", None)
concentration_field = getattr(core_env, "_concentration_field", None)
field_array = getattr(concentration_field, "field_array", None)
assert field_array is not None, "Could not access plume field for visualization"

In [None]:
# Choose a policy
policy_kind = "stochastic"  # 'deterministic' or 'stochastic'

# These three values are only passed to the stochastic policy; they are ignored
# when policy_kind is 'deterministic'.
eps = 0.05
eps_after_turn = 0.05
forward_bias = 0.05

if policy_kind == "deterministic":
    policy = TemporalDerivativeDeterministicPolicy()
elif policy_kind == "stochastic":
    policy = TemporalDerivativePolicy(
        eps=eps, eps_after_turn=eps_after_turn, eps_greedy_forward_bias=forward_bias
    )
else:
    # Fail loudly on a typo instead of silently running the stochastic policy.
    raise ValueError(
        f"Unknown policy_kind {policy_kind!r}; expected 'deterministic' or 'stochastic'"
    )

In [None]:
# Run episode with selected policy
def _agent_xy(info_dict):
    """Return the agent position stored in an info dict as a tuple.

    Looks up ``agent_position`` first and falls back to ``agent_xy``. Presence
    is tested with ``is not None`` rather than truthiness, so array-valued
    positions do not raise on boolean evaluation.

    Raises:
        KeyError: if neither key holds a value.
    """
    for key in ("agent_position", "agent_xy"):
        value = info_dict.get(key)
        if value is not None:
            return tuple(value)
    raise KeyError("info contains neither 'agent_position' nor 'agent_xy'")


obs, info = env.reset(seed=seed)
policy.reset(seed=seed)

rewards = []
actions = []

# Entry 0 of each of these traces comes from reset(); one more entry is
# appended after every step, so they are one longer than `rewards`/`actions`.
positions = [_agent_xy(info)]
concentrations = [float(obs[0])]
totals = [float(info.get("total_reward", 0.0))]
distances = [float(info.get("distance_to_goal", np.nan))]
terminated = truncated = False
for _ in range(max_steps):
    action = policy.select_action(obs, explore=True)
    actions.append(action)
    obs, reward, terminated, truncated, step_info = env.step(action)
    rewards.append(float(reward))
    # Same lookup as for the reset info, so both paths accept either key.
    positions.append(_agent_xy(step_info))
    concentrations.append(float(obs[0]))
    # If the env does not report a running total, carry the previous one forward.
    totals.append(float(step_info.get("total_reward", totals[-1])))
    distances.append(float(step_info.get("distance_to_goal", np.nan)))
    if terminated or truncated:
        break

print(
    f"Finished: steps={len(rewards)}, terminated={terminated}, truncated={truncated}, total_reward={totals[-1]:.3f}"
)

In [None]:
# Plot plume field with path overlay and time series
fig, (ax_field, ax_signals) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: plume field with the visited positions drawn on top.
ax_field.imshow(field_array, cmap="gray", origin="lower")
xs = [p[0] for p in positions]
ys = [p[1] for p in positions]
ax_field.plot(xs, ys, color="cyan", linewidth=2, label="path")
ax_field.scatter(
    [source_location[0]], [source_location[1]], c="red", s=40, label="source"
)
ax_field.set_title("Plume Field + Agent Path")
ax_field.set_xlabel("x")
ax_field.set_ylabel("y")
ax_field.legend(loc="upper right")
ax_field.set_xlim(0, grid_size[0] - 1)
ax_field.set_ylim(0, grid_size[1] - 1)

# Right panel: per-step signals. Index 0 is the observation returned by reset().
ts = np.arange(len(concentrations))
# Prepending the first sample makes dc the same length as ts, with dc[0] == 0.
dc = np.diff(concentrations, prepend=concentrations[0])
ax_signals.plot(ts, concentrations, label="concentration")
ax_signals.plot(ts, dc, label="dC (temporal)")
# Rewards exist only from the first step onward, hence the +1 offset.
ax_signals.step(
    np.arange(len(rewards)) + 1, rewards, where="post", label="step reward"
)
ax_signals.plot(ts, totals, label="total reward")
ax_signals.set_title("Signals over time")
ax_signals.set_xlabel("step")
ax_signals.set_ylabel("value")
ax_signals.legend()
plt.tight_layout()

In [None]:
# Close the environment now that the episode has been run and plotted.
env.close()