<a href="https://colab.research.google.com/github/OneFineStarstuff/Cosmic-Brilliance/blob/main/e2e_grid_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
# e2e_grid.py
# End-to-end rollout with dynamic ethics, temporal memory decay, goal flip, and noise.

from __future__ import annotations
import argparse
import json
import math
import random
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Set, Tuple

# ---------------------------
# Utilities
# ---------------------------

# Type aliases shared by the environment, the agent and the rollout code.
Action = str  # one of 'UP', 'DOWN', 'LEFT', 'RIGHT'
Pos = Tuple[int, int]  # (x, y) grid coordinate

# Unit displacement (dx, dy) applied by each action; 'UP' decreases y, 'DOWN' increases it.
DELTA: Dict[Action, Tuple[int, int]] = dict(
    UP=(0, -1),
    DOWN=(0, 1),
    LEFT=(-1, 0),
    RIGHT=(1, 0),
)

# All legal actions, in the insertion order of DELTA so the two can never drift apart.
ACTIONS: Tuple[Action, ...] = tuple(DELTA)

def manhattan(a: Pos, b: Pos) -> int:
    """Return the Manhattan (L1) distance between positions *a* and *b*."""
    dx = a[0] - b[0]
    dy = a[1] - b[1]
    return abs(dx) + abs(dy)

def in_bounds(p: Pos, n: int) -> bool:
    """Return True when both coordinates of *p* lie in the half-open range [0, n)."""
    return all(0 <= coord < n for coord in (p[0], p[1]))

def parse_xy(s: str) -> Pos:
    """Parse a position written as "x,y" into an (x, y) tuple of ints.

    Whitespace around either number is ignored. A string that does not split
    into exactly two comma-separated fields, or whose fields are not integers,
    raises ValueError.
    """
    x_text, y_text = s.split(',')
    x = int(x_text.strip())
    y = int(y_text.strip())
    return (x, y)

def parse_dyn_spec(s: str) -> Dict[int, Set[Pos]]:
    """Parse a dynamic-forbidden schedule into a ``{timestep: {positions}}`` map.

    The spec is a ';'-separated list of ``t:(x,y)`` entries, for example
    ``"3:(1,1);4:(1,2);6:(2,2)"``: at t=3 add (1,1), at t=4 add (1,2) and at
    t=6 add (2,2). Several entries may name the same timestep; their
    positions are collected into one set. Blank entries (such as the one left
    by a trailing ';') are skipped, and an empty spec yields an empty dict.

    Raises ValueError when an entry is not of the ``t:(x,y)`` form.
    """
    schedule: Dict[int, Set[Pos]] = {}
    if not s:
        return schedule

    for entry in (chunk.strip() for chunk in s.split(';')):
        if not entry:
            continue
        tick_text, cell_text = entry.split(':')
        tick = int(tick_text.strip())
        # Drop surrounding whitespace and parentheses before splitting the pair.
        x_text, y_text = cell_text.strip().strip('()').split(',')
        cell = (int(x_text), int(y_text))
        if tick not in schedule:
            schedule[tick] = set()
        schedule[tick].add(cell)
    return schedule

def seeded_rng(seed: int) -> random.Random:
    """Return a fresh ``random.Random`` generator initialised with *seed*."""
    return random.Random(seed)

# ---------------------------
# Environment
# ---------------------------

@dataclass
class GridWorld:
    """N x N grid environment with forbidden cells, scheduled rule changes and action noise.

    Attributes:
        n: Side length of the grid; valid coordinates are 0..n-1 on both axes.
        start: Position the agent occupies before the first step.
        goal: Target position; may be reassigned by the caller between steps.
        forbidden: Cells that cannot be entered. Grows in place as scheduled
            events fire.
        dyn_events: Map from timestep to the cells that become forbidden at
            the start of that timestep.
        action_noise: Probability that the requested action is replaced by a
            uniformly drawn one (which may equal the requested action).
        rng: Random generator used for the noise draws.
        t: Number of steps taken so far.
        pos: Current position; initialised to ``start``.
    """

    n: int
    start: Pos
    goal: Pos
    forbidden: Set[Pos] = field(default_factory=set)
    dyn_events: Dict[int, Set[Pos]] = field(default_factory=dict)
    action_noise: float = 0.0
    rng: random.Random = field(default_factory=lambda: seeded_rng(0))

    t: int = 0
    pos: Pos = field(init=False)

    def __post_init__(self):
        self.pos = self.start

    def maybe_apply_dynamic_forbidden(self):
        """Add the cells scheduled for the current timestep to ``forbidden``."""
        if self.t in self.dyn_events:
            self.forbidden.update(self.dyn_events[self.t])

    def sample_action(self, a: Action) -> Action:
        """Return the action actually executed for a requested action *a*.

        With no noise configured the request is returned untouched and the
        RNG is not consumed.
        """
        if self.action_noise <= 0:
            return a
        replaced = self.rng.random() < self.action_noise
        return self.rng.choice(ACTIONS) if replaced else a

    def step(self, a: Action) -> Dict:
        """Advance one tick with requested action *a* and return the step record.

        The record holds the timestep, the position *before* the move, the
        requested and executed directions, the outcome label
        ('blocked' | 'closer' | 'farther' | 'same' | 'reached'), the reward,
        the goal in force and whether the goal was reached.
        """
        # Rule changes scheduled for this tick take effect before the move.
        self.maybe_apply_dynamic_forbidden()

        origin = self.pos
        executed = self.sample_action(a)
        step_x, step_y = DELTA[executed]
        target = (origin[0] + step_x, origin[1] + step_y)

        if in_bounds(target, self.n) and target not in self.forbidden:
            # Legal move: score it by the change in distance to the goal.
            before = manhattan(origin, self.goal)
            after = manhattan(target, self.goal)
            if after < before:
                outcome, reward = 'closer', 1.0
            elif after > before:
                outcome, reward = 'farther', -1.0
            else:
                outcome, reward = 'same', 0.0
            self.pos = target
        else:
            # Off-grid or forbidden: the agent stays where it was.
            outcome, reward = 'blocked', -1.0
            self.pos = origin

        reached = self.pos == self.goal
        if reached:
            outcome = 'reached'
            reward = 11.0  # shaped terminal bonus (+1 + 10)

        tick = self.t
        self.t = tick + 1
        return {
            't': tick,
            'pos': origin,
            'action': {'type': 'MOVE', 'dir': a, 'exec': executed},
            'outcome': outcome,
            'reward': reward,
            'goal': self.goal,
            'reached': reached,
        }

# ---------------------------
# Agent + Predictor + Memory
# ---------------------------

@dataclass
class MemoryItem:
    """One audited step: what the agent predicted versus what the environment reported."""

    # Timestep of the step record this item was built from.
    t: int
    # Position recorded for the step (the environment logs the position before the move).
    pos: Pos
    # Direction requested by the agent (not necessarily the one executed under noise).
    action: Action
    # Outcome label produced by the agent's predictor before the step.
    predicted_outcome: str
    # Outcome label reported by the environment after the step.
    observed_outcome: str

@dataclass
class MemoryBank:
    """Append-only store of audited steps with exponentially decayed match statistics.

    Attributes:
        decay_lambda: Decay rate; an item of age ``a`` timesteps is weighted
            ``exp(-decay_lambda * a)``. Zero weights every item equally.
        items: Stored items, in insertion order.
    """

    decay_lambda: float = 0.0
    items: List[MemoryItem] = field(default_factory=list)

    def add(self, it: MemoryItem):
        """Append *it* to the bank."""
        self.items.append(it)

    def contradiction_stats(self, t_now: int) -> Dict[str, float]:
        """Summarise how often predictions matched observations.

        Args:
            t_now: Reference timestep used to compute each item's age. Items
                stamped later than ``t_now`` are treated as age 0.

        Returns:
            A dict with:
              * ``processed``: number of stored items;
              * ``contradictory``: unweighted count of items whose predicted
                and observed outcomes differ;
              * ``avg_all``: decay-weighted fraction of matching items
                (0.0 when the total weight is not positive);
              * ``avg_contra``: always 0.0. The weighted match rate among
                contradictory items is zero by definition, and the key is
                kept only so the result shape stays stable for callers.
        """
        if not self.items:
            return {'processed': 0, 'contradictory': 0, 'avg_all': 0.0, 'avg_contra': 0.0}

        # One pass: pair every item's decay weight with its match flag.
        weighted = [
            (
                math.exp(-self.decay_lambda * max(0, t_now - it.t)),
                it.predicted_outcome == it.observed_outcome,
            )
            for it in self.items
        ]
        total_w = sum(w for w, _ in weighted)
        matched_w = sum(w for w, is_match in weighted if is_match)
        contradictory = sum(1 for _, is_match in weighted if not is_match)

        return {
            'processed': len(self.items),
            'contradictory': contradictory,
            'avg_all': (matched_w / total_w) if total_w > 0 else 0.0,
            'avg_contra': 0.0,
        }

class GreedyAgent:
    """Greedy one-step planner with a deterministic, noise-free outcome predictor."""

    # Fixed preference used to break distance ties so runs are reproducible.
    _TIE_RANK = {name: i for i, name in enumerate(('DOWN', 'RIGHT', 'UP', 'LEFT'))}

    def __init__(self, n: int, avoid_forbidden: bool = True, rng: Optional[random.Random] = None):
        """Create an agent for an ``n`` x ``n`` grid.

        Args:
            n: Grid side length used for bounds checks.
            avoid_forbidden: When True, ``plan`` never proposes a move into a
                cell of the forbidden view it is given.
            rng: Random generator stored on the agent; a generator seeded
                with 0 is created when omitted.
        """
        self.n = n
        self.avoid_forbidden = avoid_forbidden
        self.rng = rng or seeded_rng(0)

    def plan(self, pos: Pos, goal: Pos, forbidden_view: Set[Pos]) -> Action:
        """Pick the admissible move that ends closest (Manhattan) to *goal*.

        A move is admissible when it stays on the grid and, if
        ``avoid_forbidden`` is set, does not enter a cell in *forbidden_view*.
        Distance ties are broken by the fixed DOWN, RIGHT, UP, LEFT order.

        Returns 'RIGHT' when no move is admissible, so the caller always
        receives a well-formed action.
        """
        ranked = []
        unknown_rank = len(self._TIE_RANK)
        for a in ACTIONS:
            dx, dy = DELTA[a]
            cand = (pos[0] + dx, pos[1] + dy)
            if not in_bounds(cand, self.n):
                continue
            if self.avoid_forbidden and cand in forbidden_view:
                continue
            ranked.append((manhattan(cand, goal), self._TIE_RANK.get(a, unknown_rank), a))

        if not ranked:
            # Boxed in: nothing admissible. A second scan with the same filter
            # could not find anything either, so go straight to the default.
            return 'RIGHT'

        # Lowest distance first, then tie-break rank; min() keeps the first of
        # any fully tied entries, i.e. ACTIONS order.
        return min(ranked, key=lambda entry: entry[:2])[2]

    def predict_outcome(self, pos: Pos, a: Action, goal: Pos, forbidden_view: Set[Pos]) -> str:
        """Predict the outcome label of taking *a* from *pos*, assuming no action noise.

        The prediction relies on *forbidden_view*, which may be stale relative
        to the environment's current forbidden set.

        Returns one of 'blocked', 'reached', 'closer', 'farther' or 'same'.
        """
        dx, dy = DELTA[a]
        cand = (pos[0] + dx, pos[1] + dy)
        if not in_bounds(cand, self.n) or cand in forbidden_view:
            return 'blocked'
        if cand == goal:
            return 'reached'
        before = manhattan(pos, goal)
        after = manhattan(cand, goal)
        if after < before:
            return 'closer'
        if after > before:
            return 'farther'
        return 'same'

# ---------------------------
# Rollout + Auditing + Training
# ---------------------------

@dataclass
class RolloutConfig:
    """Settings for a single rollout, consumed by run_rollout."""

    # Grid side length N; used for both the environment and the agent.
    grid: int = 5
    # Agent's initial (x, y) position.
    start: Pos = (0, 1)
    # Initial goal (x, y); may be mirrored mid-rollout when t_flip is set.
    goal: Pos = (3, 3)
    # Maximum number of steps before the rollout stops.
    tmax: int = 50
    # Timestep at which the goal is mirrored to (grid-1-x, grid-1-y); None disables the flip.
    t_flip: Optional[int] = None
    # Map of timestep -> cells that become forbidden at that timestep.
    dynamic_forbidden: Dict[int, Set[Pos]] = field(default_factory=dict)
    # Probability that the environment replaces the requested action with a random one.
    action_noise: float = 0.0
    # Exponential decay rate applied to memory items when computing match statistics.
    decay_lambda: float = 0.0
    # Seed for the RNG shared by the environment and the agent.
    seed: int = 7
    # Scenario label; the gating of the settings above by scenario happens in make_config_from_args.
    scenario: str = 'baseline'  # baseline|dynamic-ethics|temporal-decay|flip-goal|inject-noise|all
    # Optional path; when set, each step record is written there as one JSON line.
    log_jsonl: Optional[str] = None

@dataclass
class RolloutResult:
    """Everything run_rollout returns to its caller."""

    # Step records in execution order, as returned by GridWorld.step.
    steps: List[Dict]
    # Memory bank holding one predicted-vs-observed item per executed step.
    memory: MemoryBank
    # Output of MemoryBank.contradiction_stats taken at the end of the rollout.
    contradictions: Dict[str, float]

def run_rollout(cfg: RolloutConfig) -> RolloutResult:
    """Run one rollout, print the trace and audit reports, and return the results.

    Each tick the agent plans and predicts against a snapshot of the forbidden
    set taken *before* the environment applies that tick's scheduled changes,
    so newly forbidden cells show up as prediction contradictions. The loop
    ends after ``cfg.tmax`` steps or as soon as the goal is reached.

    Args:
        cfg: Rollout settings. When ``cfg.log_jsonl`` is set, every step
            record is also written to that path as one JSON object per line.

    Returns:
        The step records, the populated memory bank and the contradiction
        statistics computed at the final timestep.
    """
    rng = seeded_rng(cfg.seed)
    env = GridWorld(
        n=cfg.grid,
        start=cfg.start,
        goal=cfg.goal,
        forbidden=set(),
        dyn_events=cfg.dynamic_forbidden,
        action_noise=cfg.action_noise,
        rng=rng
    )
    agent = GreedyAgent(n=cfg.grid, avoid_forbidden=True, rng=rng)
    memory = MemoryBank(decay_lambda=cfg.decay_lambda)

    steps: List[Dict] = []
    log_f = open(cfg.log_jsonl, 'w', encoding='utf-8') if cfg.log_jsonl else None

    # try/finally guarantees the log file is closed even if a step raises.
    try:
        # Print header
        print("=== Rollout ===")

        for _ in range(cfg.tmax):
            # Goal flip mid-rollout if configured
            if cfg.t_flip is not None and env.t == cfg.t_flip:
                # Mirror the goal through the centre of the grid.
                old_goal = env.goal
                env.goal = (cfg.grid - 1 - old_goal[0], cfg.grid - 1 - old_goal[1])
                print(f"[Info] Goal flipped at t={env.t} from {old_goal} to {env.goal}")

            # Predictor uses the current view before env applies dynamic changes (causes contradictions)
            current_forbidden_view = set(env.forbidden)
            pos = env.pos
            goal = env.goal
            a = agent.plan(pos, goal, current_forbidden_view)
            predicted = agent.predict_outcome(pos, a, goal, current_forbidden_view)

            rec = env.step(a)
            # Audit and memory
            memory.add(MemoryItem(
                t=rec['t'],
                pos=rec['pos'],
                action=rec['action']['dir'],
                predicted_outcome=predicted,
                observed_outcome=rec['outcome'],
            ))

            steps.append(rec)
            print(f"t={rec['t']:02d} pos={rec['pos']} action={{'type': 'MOVE', 'dir': '{rec['action']['dir']}'}} "
                  f"outcome={rec['outcome']} reward={rec['reward']} goal={rec['goal']} reached={rec['reached']}")

            if log_f:
                log_f.write(json.dumps(rec) + "\n")

            if rec['reached']:
                print("Goal reached. Stopping rollout.")
                break
    finally:
        if log_f:
            log_f.close()

    # Contradictions audit
    print("\n=== Audited contradictions ===")
    stats = memory.contradiction_stats(t_now=env.t)
    # We surface raw contradictions, density, and weighted avg match
    if stats['processed'] == 0:
        print("(none)")
    else:
        density = stats['contradictory'] / stats['processed']
        print(f"processed={stats['processed']} contradictory={stats['contradictory']} density={density:.3f} "
              f"avg_match_all={stats['avg_all']:.3f}")

    # Batched revise placeholder (no parameter updates in this simple demo)
    print("\n=== Batched revise ===")
    print(f"[Revision] Contradictions processed: {stats['contradictory']}")

    # Curriculum training demo
    print("\n=== Curriculum training ===")
    thresholds = [0.70, 0.78, 0.87]
    avg_all = stats['avg_all']
    avg_contra = stats['avg_contra']  # zero by definition here
    for i, th in enumerate(thresholds, start=1):
        # In a real loop, we'd adapt policy if density > (1 - th). Here we just report.
        print(f"[Training] step={i} processed={stats['processed']} contradictory={stats['contradictory']} "
              f"threshold={th:.2f} avg_all={avg_all:.3f} avg_contra={avg_contra:.3f}")

    return RolloutResult(steps=steps, memory=memory, contradictions=stats)

# ---------------------------
# Scenario presets
# ---------------------------

def make_config_from_args(args: argparse.Namespace) -> RolloutConfig:
    """Build a RolloutConfig from parsed CLI arguments.

    Each friction (goal flip, dynamic forbidden cells, action noise, memory
    decay) is switched on only when ``args.scenario`` names it or is 'all';
    otherwise the corresponding setting is reset to its neutral value
    regardless of what was passed on the command line.
    """
    start = parse_xy(args.start)
    goal = parse_xy(args.goal)
    schedule = parse_dyn_spec(args.dynamic)
    scenario = args.scenario

    def enabled(name: str) -> bool:
        # A friction is active under its own scenario and under 'all'.
        return scenario in (name, 'all')

    return RolloutConfig(
        grid=args.grid,
        start=start,
        goal=goal,
        tmax=args.tmax,
        t_flip=args.t_flip if enabled('flip-goal') else None,
        dynamic_forbidden=schedule if enabled('dynamic-ethics') else {},
        action_noise=args.noise if enabled('inject-noise') else 0.0,
        decay_lambda=args.decay if enabled('temporal-decay') else 0.0,
        seed=args.seed,
        scenario=scenario,
        log_jsonl=args.log,
    )

# ---------------------------
# CLI
# ---------------------------

def main(argv: Optional[List[str]] = None):
    """Parse command-line options and run a single rollout.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. Leave as
            None for normal command-line use; pass an explicit list (for
            example ``[]``) when calling from a notebook or another program,
            where ``sys.argv`` may hold options that do not belong to this
            script.
    """
    p = argparse.ArgumentParser(description="End-to-end grid rollout with audit and curriculum.")
    p.add_argument('--grid', type=int, default=5, help='Grid size N (NxN)')
    p.add_argument('--start', type=str, default='0,1', help='Start position "x,y"')
    p.add_argument('--goal', type=str, default='3,3', help='Goal position "x,y"')
    p.add_argument('--tmax', type=int, default=50, help='Max timesteps')
    p.add_argument('--t_flip', type=int, default=None, help='Timestep to flip the goal (only in flip-goal or all)')
    p.add_argument('--dynamic', type=str, default='', help='Dynamic forbidden spec, e.g. "3:(1,1);6:(2,2)"')
    p.add_argument('--noise', type=float, default=0.0, help='Action noise probability (only in inject-noise or all)')
    p.add_argument('--decay', type=float, default=0.0, help='Temporal decay lambda (only in temporal-decay or all)')
    p.add_argument('--seed', type=int, default=7, help='RNG seed')
    p.add_argument('--log', type=str, default=None, help='Path to JSONL step log')
    p.add_argument('--scenario', type=str, default='baseline',
                   choices=['baseline', 'dynamic-ethics', 'temporal-decay', 'flip-goal', 'inject-noise', 'all'],
                   help='Which friction(s) to enable')

    # parse_args(None) falls back to sys.argv[1:], so plain CLI use is unchanged.
    args = p.parse_args(argv)
    cfg = make_config_from_args(args)
    run_rollout(cfg)

# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()