<a href="https://colab.research.google.com/github/OneFineStarstuff/Pinn/blob/main/agi_agent_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %% [markdown]
# # AGI Agent: Multi-Task Experimental Harness
# Safe defaults, reproducibility, and hot-swappable modules.

# %% [markdown]
# ## 0. Setup and provenance

import os, sys, math, random, time, json, dataclasses, typing as T
from dataclasses import dataclass, field
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# PyTorch is optional: every torch-dependent code path in this file is gated
# on TORCH_AVAILABLE, so a failed import only disables those paths.
try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
except Exception:
    TORCH_AVAILABLE = False
else:
    TORCH_AVAILABLE = True

# --- Reproducibility ---
# One seed is pushed into Python's, NumPy's and (when present) torch's RNGs.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
if TORCH_AVAILABLE:
    torch.manual_seed(SEED)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Use the GPU only when torch imported successfully and reports a CUDA device.
if TORCH_AVAILABLE and torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Second-resolution timestamp used to tag records and name the log file.
RUN_ID = f"run_{int(time.time())}"
os.makedirs("runs", exist_ok=True)

def log_jsonl(path: str, record: dict):
    """Append ``record`` to ``path`` as a single JSON line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written as-is rather than escaped.

    Args:
        path: Destination file; created if it does not exist.
        record: JSON-serialisable mapping to write.
    """
    with open(path, mode="a", encoding="utf-8") as sink:
        serialized = json.dumps(record, ensure_ascii=False)
        print(serialized, file=sink)

# Echo this session's provenance: seed, compute device, torch availability, run id.
print({"seed": SEED, "device": DEVICE, "torch": TORCH_AVAILABLE, "run_id": RUN_ID})

# %% [markdown]
# ## 1. Configs and agent adapter

@dataclass
class EvalConfig:
    """Evaluation sizes for the regression, multimodal and grid tasks.

    NOTE: a second ``EvalConfig`` dataclass with a different field set is
    defined later in this file (section "3. Evaluation protocols and
    metrics"); once that definition executes it rebinds this name.
    """
    # Regression
    n_reg_train: int = 5  # how many generated training sin tasks are used
    n_reg_test: int = 2  # how many generated test sin tasks are used
    k_shot: int = 5  # support points sampled per task for the learn step
    q_points: int = 50  # query points sampled per task for MSE evaluation

    # Multimodal
    mm_trials: int = 3  # number of txt->img matching trials
    mm_candidates: int = 4  # NOTE(review): not read anywhere in this file

    # Grid navigation
    grid_trials: int = 5  # NOTE(review): not read anywhere in this file
    grid_max_steps: int = 20  # NOTE(review): not read anywhere in this file

@dataclass
class AgentConfig:
    """Settings handed to an ``AgentAdapter`` and stored on it as ``cfg``."""
    name: str = "SimpleAgent"  # label for the agent
    # The two flags below are not consulted by any adapter in this file;
    # presumably they are meant for user-supplied adapters -- TODO confirm.
    use_world_model: bool = False
    use_multimodal_encoder: bool = True
    mc_dropout_passes: int = 10  # forward passes used for the MC-dropout variance estimate
    reflection_steps: int = 1  # NOTE(review): not read anywhere in this file
    notes: str = ""  # free-form annotation

class AgentAdapter:
    """Interface between the evaluation harness and an agent implementation.

    Subclasses must override ``act`` and ``encode``; ``learn``, ``reflect``
    and ``imagine`` have no-op defaults.
    """

    def __init__(self, cfg: AgentConfig):
        """Store the agent configuration on ``self.cfg``."""
        self.cfg = cfg

    def act(self, obs: dict, task_id: str, step: int, state: dict | None):
        """Choose an action for ``obs``.

        The episode runners in this file unpack the result as a 3-tuple
        ``(action, state, info)``. Must be overridden.
        """
        raise NotImplementedError

    def learn(self, batch: dict, task_id: str) -> dict:
        """Update the agent from ``batch``; the default does nothing."""
        return {"learned": False}

    def reflect(self, logs: list[dict]) -> dict:
        """Inspect episode ``logs``; the default returns a no-op note and no patches."""
        return {"notes": "no-op reflection", "patches": {}}

    def encode(self, modality: str, data) -> np.ndarray:
        """Embed ``data`` of the given ``modality``. Must be overridden."""
        raise NotImplementedError

    def imagine(self, state: dict, n_steps: int = 5) -> list[dict]:
        """Roll out imagined future states; the default returns an empty list."""
        return []

# %% [markdown]
# ## 2. Instantiate configs and agent

# Module-level evaluation settings; read as a global by code further down
# (for example the regression example loop in section 3).
eval_cfg = EvalConfig()

class DummyAgentAdapter(AgentAdapter):
    """
    Trivial stand-in agent used to smoke-test the harness.

    Behaviour by observation:
      - ``mode`` is "txt2img" or "img2txt": always answer candidate index 0
        (a plain Python int, avoiding NumPy/PyTorch scalar issues)
      - observation contains "x": predict zeros with the same shape as ``x``
      - anything else: no action (``None``)
    """

    # Observation modes for which the agent simply picks the first candidate.
    _MATCHING_MODES = ("txt2img", "img2txt")

    def act(self, obs: dict, task_id: str, step: int, state: dict | None):
        """Return ``(action, state, {})``; ``state`` is passed through untouched."""
        if obs.get("mode") in self._MATCHING_MODES:
            return 0, state, {}

        if "x" in obs:
            zeros = np.zeros_like(obs["x"], dtype=float)
            return zeros, state, {}

        return None, state, {}

    def learn(self, batch: dict, task_id: str) -> dict:
        """Pretend to learn: nothing is updated, but success is reported."""
        return {"learned": True}

    def encode(self, modality: str, data):
        """Return a constant 8-dimensional zero embedding for any input."""
        return np.zeros(8)

# Baseline configuration; every argument is spelled out explicitly.
agent_cfg = AgentConfig(
    name="SimpleAgent",
    use_world_model=False,
    use_multimodal_encoder=True,
    mc_dropout_passes=10,
    reflection_steps=1,
    notes="baseline"
)

# Placeholder agent for the example loop below; ``agent`` is rebound to a
# SimpleAgentAdapter in the "Run" section further down.
agent = DummyAgentAdapter(agent_cfg)

# %% [markdown]
# ## 3. Example: generate tasks and run regression episodes

# The helpers used below are defined later in this file (section 2.3) and must
# already have been executed before this cell runs:
# generate_sin_tasks(seed) -> (train_tasks, test_tasks)
# run_regression_episode(agent, task, k_shot, q_points, split) -> EpisodeResult

sin_train, sin_test = generate_sin_tasks(SEED)

# Run the train split, then the (optional) test split, through the same loop.
records = []
for _split, _tasks in (
    ("train", sin_train[:eval_cfg.n_reg_train]),
    ("test", sin_test[:eval_cfg.n_reg_test]),
):
    for task in _tasks:
        r = run_regression_episode(
            agent,
            task,
            split=_split,
            k_shot=eval_cfg.k_shot,
            q_points=eval_cfg.q_points,
        )
        # A fresh timestamp is computed per episode, but the stored record is
        # tagged with the session-wide RUN_ID.
        run_id = f"run_{int(time.time())}"
        records.append({**dataclasses.asdict(r), "run_id": RUN_ID})

# %% [markdown]
# ### 1.1 Simple, runnable baseline agent (replace with yours)
# - Symbol/Grid: heuristic + epsilon-random
# - Regression: small MLP with optional MAML-like inner loop
# - Multimodal: tiny text/image encoders for synthetic data
# This keeps the harness runnable end-to-end until you plug in your modules.

# %%
class SimpleAgentAdapter(AgentAdapter):
    """Runnable baseline agent covering the four task families of the harness.

    Dispatch is by ``task_id`` prefix:
      - ``grid``: greedy Manhattan-distance heuristic with epsilon-random moves
      - ``symbol``: pick the applicable rewrite rule whose result is closest
        to the target string
      - ``regress``: small MLP (zeros when torch is unavailable)
      - ``multimodal``: cosine similarity between text and image embeddings

    When torch is unavailable the learned components are ``None`` and the
    encoders fall back to the raw normalised feature vectors.
    """

    # (d_first, d_second) coordinate offsets for the grid moves the heuristic knows.
    _GRID_MOVES = {"up": (-1, 0), "down": (1, 0), "left": (0, -1), "right": (0, 1)}

    def __init__(self, cfg: AgentConfig):
        super().__init__(cfg)
        self.global_step = 0
        self.eps = 0.1  # probability of a random grid move; raised by reflect()

        # --- Text encoder setup ---
        self.char_vocab = {c: i+1 for i, c in enumerate("abcdefghijklmnopqrstuvwxyz0123456789_- ")}
        self.txt_dim = 64  # bag-of-chars output size

        # --- Image encoder setup ---
        self.img_size = (32, 32)
        self.img_dim = self.img_size[0] * self.img_size[1]  # grayscale flatten

        # --- Shared multimodal embedding dimension ---
        self.shared_dim = 64

        if TORCH_AVAILABLE:
            # Trainable projections into the shared space. They must live on
            # DEVICE because the encoders move their inputs there.
            self.txt_proj = nn.Linear(self.txt_dim, self.shared_dim).to(DEVICE)
            self.img_proj = nn.Linear(self.img_dim, self.shared_dim).to(DEVICE)

            # Regression model (the dropout layer enables MC-dropout estimates)
            self.reg_model = nn.Sequential(
                nn.Linear(1, 64), nn.ReLU(),
                nn.Linear(64, 64), nn.ReLU(),
                nn.Dropout(p=0.1),
                nn.Linear(64, 1)
            ).to(DEVICE)

            # Optimizers
            self.reg_opt = torch.optim.Adam(self.reg_model.parameters(), lr=5e-3)
            self.mm_opt = torch.optim.Adam(
                list(self.txt_proj.parameters()) + list(self.img_proj.parameters()),
                lr=1e-3
            )
        else:
            self.reg_model = None
            self.txt_proj = None
            self.img_proj = None
            self.reg_opt = None
            self.mm_opt = None

    def act(self, obs: dict, task_id: str, step: int, state: dict | None):
        """Choose an action for ``obs`` according to the ``task_id`` prefix.

        Returns:
            ``(action, state, info)`` where ``state`` is the incoming state
            (or ``{}`` when it is ``None``/empty) and ``info`` is ``{}``.
            ``action`` is ``None`` for unknown task ids.
        """
        self.global_step += 1
        state = state or {}

        if task_id.startswith("grid"):
            action = self._act_grid(obs)
        elif task_id.startswith("symbol"):
            action = self._act_symbol(obs)
        elif task_id.startswith("regress"):
            action = self._act_regress(obs)
        elif task_id.startswith("multimodal"):
            action = self._act_multimodal(obs)
        else:
            action = None
        return action, state, {}

    def learn(self, batch: dict, task_id: str) -> dict:
        """Take one optimizer step on ``batch``.

        Returns:
            ``{"loss": float}`` after a regression or multimodal update, or
            ``{"loss": None}`` when nothing was trained (torch missing or an
            unsupported ``task_id``).
        """
        if not TORCH_AVAILABLE:
            return {"loss": None}

        if task_id.startswith("regress") and self.reg_model is not None:
            x = torch.from_numpy(batch["x"]).float().to(DEVICE)
            y = torch.from_numpy(batch["y"]).float().to(DEVICE)
            self.reg_opt.zero_grad()
            loss = F.mse_loss(self.reg_model(x), y)
            loss.backward()
            self.reg_opt.step()
            return {"loss": float(loss.item())}

        if task_id.startswith("multimodal") and self.txt_proj is not None:
            # Simple contrastive-like loss: cosine similarities between the
            # query and every candidate are treated as classification logits.
            if batch.get("mode") == "txt2img":
                query = self._encode_text(batch["text"], torch_out=True)
                cands = torch.stack([self._encode_img(im, torch_out=True) for im in batch["images"]])
            else:
                query = self._encode_img(batch["image"], torch_out=True)
                cands = torch.stack([self._encode_text(t, torch_out=True) for t in batch["texts"]])
            sims = F.cosine_similarity(query.unsqueeze(0), cands)
            target = torch.tensor([batch["target_idx"]], device=DEVICE)
            loss = F.cross_entropy(sims.unsqueeze(0), target)

            self.mm_opt.zero_grad()
            loss.backward()
            self.mm_opt.step()
            return {"loss": float(loss.item())}

        return {"loss": None}

    def reflect(self, logs: list[dict]) -> dict:
        """Raise exploration when most logged episodes failed.

        Only records carrying a ``"success"`` key are counted. If more than
        half of them failed, ``eps`` grows by 0.05 (capped at 0.3).
        """
        outcomes = [1.0 if r.get("success") == 0 else 0.0 for r in logs if "success" in r]
        # Guard the empty case: np.mean([]) would yield nan plus a warning.
        fail_rate = float(np.mean(outcomes)) if outcomes else 0.0
        if fail_rate > 0.5:
            self.eps = min(0.3, self.eps + 0.05)
            note = f"Increased epsilon to {self.eps:.2f} after fail_rate={fail_rate:.2f}"
        else:
            note = f"No change; fail_rate={fail_rate:.2f}"
        return {"notes": note, "patches": {"eps": self.eps}}

    def encode(self, modality: str, data) -> np.ndarray:
        """Embed ``data``; ``modality`` must be ``"text"`` or ``"image"``.

        Raises:
            ValueError: for any other modality.
        """
        if modality == "text":
            return self._encode_text(data)
        elif modality == "image":
            return self._encode_img(data)
        else:
            raise ValueError(f"Unknown modality: {modality}")

    # --- per-task action helpers ---
    def _act_grid(self, obs: dict):
        """Epsilon-greedy move: random with probability ``eps``, else toward the goal."""
        acts = obs["action_space"]
        if random.random() < self.eps:
            return random.choice(acts)

        goal = obs.get("goal")
        pos = obs["pos"]

        def score(act):
            dx, dy = self._GRID_MOVES.get(act, (0, 0))
            nxt = (pos[0] + dx, pos[1] + dy)
            if goal:
                # Negative Manhattan distance: higher is closer to the goal.
                return -(abs(nxt[0] - goal[0]) + abs(nxt[1] - goal[1]))
            return -random.random()

        # The score is "higher is better", so the best move is the maximum
        # (taking the minimum would walk away from the goal).
        return max(acts, key=score)

    def _act_symbol(self, obs: dict):
        """Pick the applicable rule whose rewrite lands closest to the target."""
        rules = obs["rules"]
        cur = obs["string"]
        target = obs["target"]

        candidates = []
        for (lhs, rhs) in rules:
            idx = cur.find(lhs)
            if idx >= 0:
                candidates.append((lhs, rhs, cur[:idx] + rhs + cur[idx+len(lhs):]))

        if candidates:
            def distance(c):
                s = c[2]
                # Length gap plus position-wise mismatches over the common prefix.
                return abs(len(s) - len(target)) + sum(1 for a, b in zip(s, target) if a != b)
            lhs, rhs, _ = min(candidates, key=distance)
            return (lhs, rhs)
        # No rule applies: fall back to a random rule, or give up if there are none.
        return random.choice(rules) if rules else None

    def _act_regress(self, obs: dict):
        """Predict ``y`` for ``obs["x"]`` with dropout disabled (zeros without torch)."""
        x = obs["x"]
        if not TORCH_AVAILABLE or self.reg_model is None:
            return np.zeros_like(x)

        was_training = self.reg_model.training
        self.reg_model.train(False)
        try:
            with torch.no_grad():
                xt = torch.from_numpy(x).float().to(DEVICE)
                return self.reg_model(xt).cpu().numpy()
        finally:
            # Restore the previous mode even if the forward pass raises.
            self.reg_model.train(was_training)

    def _act_multimodal(self, obs: dict) -> int:
        """Return the index of the candidate most similar to the query."""
        if obs["mode"] == "txt2img":
            q = self._encode_text(obs["text"])
            sims = [self._cos(q, self._encode_img(img)) for img in obs["images"]]
        else:
            q = self._encode_img(obs["image"])
            sims = [self._cos(q, self._encode_text(t)) for t in obs["texts"]]
        return int(np.argmax(sims))

    # --- helpers ---
    def _encode_text(self, s: str, torch_out=False):
        """Embed ``s`` as an L2-normalised bag of characters.

        With torch available the vector is projected through ``txt_proj``
        and returned as a tensor (``torch_out=True``) or a detached NumPy
        array. Without torch the raw normalised vector is returned.
        """
        idxs = [self.char_vocab.get(c.lower(), 0) for c in s]
        vec = np.zeros(self.txt_dim, dtype=np.float32)
        for i in idxs:
            vec[i % self.txt_dim] += 1.0  # unknown characters share bucket 0
        vec = vec / (np.linalg.norm(vec) + 1e-8)

        if TORCH_AVAILABLE:
            tvec = torch.from_numpy(vec).float().to(DEVICE)
            proj = self.txt_proj(tvec)
            return proj if torch_out else proj.detach().cpu().numpy()
        else:
            return vec

    def _encode_img(self, img: np.ndarray, torch_out=False):
        """Embed ``img`` as an L2-normalised, downsized grayscale vector.

        Accepts an image with a trailing channel axis (averaged over that
        axis) or a 2-D image that is already grayscale. Output handling
        mirrors ``_encode_text`` but uses ``img_proj``.
        """
        from skimage.transform import resize
        small = resize(img, self.img_size, anti_aliasing=True, preserve_range=True).astype(np.float32)
        # A 2-D input has no channel axis to average over.
        gray = small if small.ndim == 2 else small.mean(axis=2)
        vec = gray.reshape(-1)
        vec = vec / (np.linalg.norm(vec) + 1e-8)

        if TORCH_AVAILABLE:
            tvec = torch.from_numpy(vec).float().to(DEVICE)
            proj = self.img_proj(tvec)
            return proj if torch_out else proj.detach().cpu().numpy()
        else:
            return vec

    @staticmethod
    def _cos(a: np.ndarray, b: np.ndarray) -> float:
        """Cosine similarity with a small epsilon guarding zero vectors."""
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

# %% [markdown]
# ## 2. Tasks
# - Symbol puzzles (neurosymbolic)
# - Grid navigation (spatial + memory)
# - Few-shot regression (meta-learning baseline)
# - Multimodal matching (grounding)

# %%
@dataclass
class EpisodeResult:
    """Outcome of one episode, as returned by every ``run_*_episode`` function."""
    task_id: str  # e.g. "symbol/rewrite", "grid/nav", "regress/sin", "multimodal/match"
    split: str  # label supplied by the caller, e.g. "train", "test", "post-reflect"
    success: int  # 1 on success, 0 otherwise
    steps: int  # steps reported by the episode runner
    reward: float
    meta: dict = field(default_factory=dict)  # task-specific extras

# %% [markdown]
# ### 2.1 Symbol puzzles

# %%
@dataclass
class SymbolTask:
    """A string-rewriting puzzle: turn ``start`` into ``target`` using ``rules``."""
    rules: list[tuple[str,str]]  # (lhs, rhs) pairs; applying one replaces the first lhs with rhs
    start: str
    target: str
    max_steps: int = 12  # action budget for one episode

def run_symbol_episode(agent: AgentAdapter, task: SymbolTask, split="train") -> EpisodeResult:
    """Let ``agent`` rewrite ``task.start`` toward ``task.target``.

    Each step the agent sees the rules, the current string and the target,
    and answers with an ``(lhs, rhs)`` rule (or ``None`` to give up). The
    first occurrence of ``lhs`` is replaced by ``rhs``; a rule that does not
    match leaves the string unchanged but still uses up the step.

    Returns:
        Success (reward 1.0) with the number of steps used as soon as the
        string equals the target; otherwise failure (reward 0.0) reported
        with ``task.max_steps``. ``meta["final"]`` holds the last string.
    """
    task_id = "symbol/rewrite"
    current = task.start
    for step in range(task.max_steps):
        observation = {"rules": task.rules, "string": current, "target": task.target}
        move, _state, _info = agent.act(observation, task_id=task_id, step=step, state=None)
        if move is None:
            break
        lhs, rhs = move
        # Replace only the first occurrence; a no-op when lhs is absent.
        current = current.replace(lhs, rhs, 1)
        if current == task.target:
            return EpisodeResult(task_id, split, 1, step + 1, reward=1.0, meta={"final": current})
    return EpisodeResult(task_id, split, 0, task.max_steps, reward=0.0, meta={"final": current})

def generate_symbol_dataset(seed=SEED):
    """Build the symbol-rewriting train/test task lists.

    Re-seeds Python's global ``random`` module with ``seed``. The 50 train
    tasks use shorter targets; the 20 test tasks use longer targets, draw
    their three rules from the reversed rule list and get 16 steps.

    Returns:
        ``(train, test)`` lists of ``SymbolTask``.
    """
    random.seed(seed)
    base_rules = [("A","AB"),("B","BA"),("BA","A"),("AA","B"),("BB","A")]
    starts = ["A","B","AB","BA"]

    def draw(targets, rule_pool, **task_kwargs):
        # Draw order (start, target, rules) is kept fixed so the seeded
        # sequence of tasks stays reproducible.
        start = random.choice(starts)
        target = random.choice(targets)
        rules = random.sample(rule_pool, k=3)
        return SymbolTask(rules, start, target, **task_kwargs)

    train = [draw(["ABA","BAB","ABAB","BAA","ABB"], base_rules) for _ in range(50)]
    test = [
        draw(["ABABA","BABAB","BAAB"], base_rules[::-1], max_steps=16)
        for _ in range(20)
    ]
    return train, test

# %% [markdown]
# ### 2.2 Grid navigation

# %%
@dataclass
class GridTask:
    """A square grid-navigation task with optional obstacles and keys."""
    grid_size: int = 7  # valid coordinates are 0 .. grid_size-1 on both axes
    obstacles: list[tuple[int,int]] = field(default_factory=list)  # cells that cannot be entered
    keys: list[tuple[int,int]] = field(default_factory=list)  # stepping on any of these grants the key
    start: tuple[int,int] = (0,0)
    goal: tuple[int,int] = (6,6)
    max_steps: int = 40  # action budget for one episode
    require_key: bool = False  # if True, reaching the goal only counts once a key was picked up

# Action names offered to the agent, and the coordinate offset each one applies
# ("up"/"down" change the first coordinate, "left"/"right" the second).
ACTIONS = ["up","down","left","right"]
DELTAS = {"up":(-1,0),"down":(1,0),"left":(0,-1),"right":(0,1)}

def run_grid_episode(agent: AgentAdapter, task: GridTask, split="train") -> EpisodeResult:
    """Let ``agent`` walk from ``task.start`` to ``task.goal``.

    A move that would leave the grid or enter an obstacle is ignored (the
    agent stays put), and unknown actions are treated as "stay". Standing on
    a key cell grants the key for the rest of the episode.

    Returns:
        Success (reward 1.0) with the steps used once the goal is reached
        (holding a key if ``task.require_key``); otherwise failure
        (reward 0.0) with ``task.max_steps``. ``meta["have_key"]`` records
        whether a key was collected.
    """
    size = task.grid_size
    here = tuple(task.start)
    has_key = False
    for step in range(task.max_steps):
        observation = {"pos": here, "goal": task.goal, "action_space": ACTIONS}
        move, _state, _info = agent.act(observation, task_id="grid/nav", step=step, state=None)
        d_first, d_second = DELTAS.get(move, (0,0))
        candidate = (here[0] + d_first, here[1] + d_second)
        inside = 0 <= candidate[0] < size and 0 <= candidate[1] < size
        if inside and candidate not in task.obstacles:
            here = candidate
        if here in task.keys:
            has_key = True
        if here == task.goal and (has_key or not task.require_key):
            return EpisodeResult("grid/nav", split, 1, step + 1, 1.0, meta={"have_key": has_key})
    return EpisodeResult("grid/nav", split, 0, task.max_steps, 0.0, meta={"have_key": has_key})

def generate_grid_dataset(seed=SEED):
    """Build the grid-navigation train/test task lists.

    Re-seeds Python's global ``random`` module with ``seed``.

    * train: 50 tasks on a 7x7 grid with 8 obstacles, no key required.
    * test: 20 tasks on a 9x9 grid with 14 obstacles, a key at the grid
      centre that must be collected, and a 60-step budget.

    Obstacles never occupy the start, the goal or a key cell. An obstacle on
    the key cell would make a ``require_key`` task unsolvable, because
    blocked cells cannot be entered. Obstacles may still wall off a path;
    reachability is not checked.

    Returns:
        ``(train, test)`` lists of ``GridTask``.
    """
    random.seed(seed)

    def rand_obstacles(gs, count, reserved):
        # Rejection-sample `count` distinct cells outside `reserved`.
        if count > gs * gs - len(reserved):
            raise ValueError("not enough free cells for the requested obstacles")
        cells = set()
        while len(cells) < count:
            p = (random.randrange(gs), random.randrange(gs))
            if p not in reserved:
                cells.add(p)
        return list(cells)

    train, test = [], []
    for _ in range(50):
        gs = 7
        start, goal = (0, 0), (gs - 1, gs - 1)
        obstacles = rand_obstacles(gs, 8, {start, goal})
        train.append(GridTask(grid_size=gs, obstacles=obstacles, start=start, goal=goal, require_key=False))
    for _ in range(20):
        gs = 9
        start, goal = (0, 0), (gs - 1, gs - 1)
        key = (gs // 2, gs // 2)
        obstacles = rand_obstacles(gs, 14, {start, goal, key})
        test.append(GridTask(grid_size=gs, obstacles=obstacles, start=start, goal=goal,
                             keys=[key], require_key=True, max_steps=60))
    return train, test

# %% [markdown]
# ### 2.3 Few-shot regression (sinusoids)

# %%
@dataclass
class SinTask:
    """Parameters of one sinusoid: y = amp * sin(freq * x + phase)."""
    amp: float
    phase: float
    freq: float

def sample_sin_task(n: int, task: SinTask, x_range=(-5, 5), noise=0.0) -> tuple[np.ndarray, np.ndarray]:
    """Sample ``n`` points from the sinusoid described by ``task``.

    Inputs are drawn uniformly from ``x_range`` using NumPy's global RNG;
    Gaussian noise scaled by ``noise`` is added to the targets (the noise
    values are drawn even when ``noise`` is 0).

    Returns:
        ``(xs, ys)``, each of shape ``(n, 1)``; ``xs`` is float32.
    """
    low, high = x_range[0], x_range[1]
    xs = np.random.uniform(low, high, size=(n, 1)).astype(np.float32)
    clean = task.amp * np.sin(task.freq * xs + task.phase)
    jitter = np.random.randn(n, 1).astype(np.float32)
    ys = clean + noise * jitter
    return xs, ys

def generate_sin_tasks(seed=SEED):
    """Draw 20 train and 10 test sinusoid tasks from a private seeded RNG.

    Amplitude is uniform in [0.1, 5.0) and phase in [0, pi) for both
    splits. Train frequencies lie in [0.5, 2.0), test frequencies in
    [2.0, 3.0), so the test split covers unseen frequencies.

    Returns:
        ``(train, test)`` lists of ``SinTask``.
    """
    rng = np.random.RandomState(seed)

    def draw(freq_low, freq_high):
        # Draw order (amp, phase, freq) is fixed to keep the seeded stream stable.
        amp = float(rng.uniform(0.1, 5.0))
        phase = float(rng.uniform(0, np.pi))
        freq = float(rng.uniform(freq_low, freq_high))
        return SinTask(amp=amp, phase=phase, freq=freq)

    train = [draw(0.5, 2.0) for _ in range(20)]
    test = [draw(2.0, 3.0) for _ in range(10)]
    return train, test

def run_regression_episode(agent: AgentAdapter, task: SinTask, k_shot=10, q_points=50, split="train") -> EpisodeResult:
    """Measure query-set MSE before and after one ``agent.learn`` call.

    A support set of ``k_shot`` points and a query set of ``q_points``
    points are sampled from ``task``. The episode succeeds when the
    post-adaptation MSE is strictly lower than the pre-adaptation MSE.

    Returns:
        An ``EpisodeResult`` with reward ``-post_mse`` and
        ``meta = {"pre_mse", "post_mse", "learn"}``, where ``"learn"`` is
        whatever ``agent.learn`` returned.
    """
    task_id = "regress/sin"
    x_support, y_support = sample_sin_task(k_shot, task)
    x_query, y_query = sample_sin_task(q_points, task)

    def query_mse(step):
        prediction = agent.act({"x": x_query}, task_id=task_id, step=step, state=None)[0]
        return float(np.mean((prediction - y_query)**2))

    pre_mse = query_mse(0)
    # One simple learn step on the support set
    metrics = agent.learn({"x": x_support, "y": y_support}, task_id=task_id)
    post_mse = query_mse(1)

    improved = int(post_mse < pre_mse)
    details = {"pre_mse": pre_mse, "post_mse": post_mse, "learn": metrics}
    return EpisodeResult(task_id, split, improved, steps=1, reward=-post_mse, meta=details)

# %% [markdown]
# ### 2.4 Multimodal matching (synthetic shapes ↔ text)

# %%
def make_shape_image(kind: str, color: tuple[int,int,int], size=(64,64)) -> np.ndarray:
    """Render a filled, centred shape on a black canvas.

    Args:
        kind: ``"circle"``, ``"square"`` or ``"triangle"``; any other value
            leaves the canvas blank.
        color: Three channel values used as the fill colour.
        size: ``(height, width)`` of the canvas.

    Returns:
        An ``HxWx3`` ``uint8`` array.
    """
    import cv2
    canvas = np.zeros((size[0], size[1], 3), dtype=np.uint8)
    fill = (int(color[0]), int(color[1]), int(color[2]))
    h, w = size
    cx, cy = w//2, h//2
    if kind == "circle":
        cv2.circle(canvas, (cx, cy), min(h,w)//4, fill, -1)
    elif kind == "square":
        half = min(h,w)//3
        cv2.rectangle(canvas, (cx - half, cy - half), (cx + half, cy + half), fill, -1)
    elif kind == "triangle":
        # All triangle offsets are derived from the width.
        apex = [cx, cy - w//4]
        base_left = [cx - w//5, cy + w//6]
        base_right = [cx + w//5, cy + w//6]
        cv2.fillPoly(canvas, [np.array([apex, base_left, base_right], np.int32)], fill)
    return canvas

def generate_multimodal_dataset(seed=SEED):
    """Build (image, caption) pairs for every shape/colour combination.

    Re-seeds Python's global ``random`` module with ``seed`` and shuffles
    the nine pairs. Captions have the form ``"<color> <shape>"``. The
    combinations "blue triangle" and "green circle" are held out as the
    test split; everything else is train.

    Returns:
        ``(train, test)`` lists of ``(image, caption)`` tuples.
    """
    random.seed(seed)
    palette = {"red": (220,30,30), "green": (30,220,30), "blue": (30,30,220)}
    shape_kinds = ["circle","square","triangle"]
    items = [
        (make_shape_image(kind, rgb), f"{color_name} {kind}")
        for kind in shape_kinds
        for color_name, rgb in palette.items()
    ]
    random.shuffle(items)

    # Compositional hold-out: these captions only ever appear in the test split.
    held_out = {"blue triangle", "green circle"}
    train, test = [], []
    for image, caption in items:
        bucket = test if caption in held_out else train
        bucket.append((image, caption))
    return train, test

def run_multimodal_episode(agent: AgentAdapter, pool: list[tuple[np.ndarray,str]], q: tuple[str|np.ndarray, list], mode="txt2img", split="train") -> EpisodeResult:
    """Run one matching trial and score it against the ground-truth ``pool``.

    Args:
        pool: Ground-truth ``(image, caption)`` pairs; images are compared
            by their raw bytes.
        q: ``(text, images)`` for ``mode="txt2img"``, otherwise
            ``(image, texts)``.
        mode: ``"txt2img"``; any other value is handled as image-to-text.

    Returns:
        A single-step ``EpisodeResult`` whose reward equals the 0/1 success
        flag, with ``meta = {"mode": mode}``.
    """
    task_id = "multimodal/match"
    success = 0
    if mode == "txt2img":
        text, images = q[0], q[1]
        obs = {"mode": "txt2img", "text": text, "images": images}
        pred_idx, _state, _info = agent.act(obs, task_id=task_id, step=0, state=None)
        # Success if the chosen image is paired with the query text in the pool.
        for pool_img, pool_text in pool:
            if pool_img.tobytes() == images[pred_idx].tobytes() and text == pool_text:
                success = 1
                break
    else:
        image, texts = q[0], q[1]
        obs = {"mode": "img2txt", "image": image, "texts": texts}
        pred_idx, _state, _info = agent.act(obs, task_id=task_id, step=0, state=None)
        # Success if the chosen text is paired with the query image in the pool.
        for pool_img, pool_text in pool:
            if pool_img.tobytes() == image.tobytes() and texts[pred_idx] == pool_text:
                success = 1
                break
    return EpisodeResult(task_id, split, success, 1, float(success), meta={"mode": mode})

# %% [markdown]
# ## 3. Evaluation protocols and metrics
# - Forward transfer: pretrain on some tasks; measure zero/low-shot on new tasks
# - Compositional generalization: held-out combos in symbol/multimodal; larger grids with keys
# - Epistemic uncertainty: MC dropout predictive variance (regression)
# - Reflective improvement: performance delta after reflect()

# %%
@dataclass
class EvalConfig:
    """Evaluation sizes for the full multi-task run.

    This definition rebinds the ``EvalConfig`` name declared earlier in the
    file. It therefore keeps the fields of that earlier version
    (``mm_candidates``, ``grid_trials``, ``grid_max_steps``) so code written
    against either version keeps working. They are appended after the
    existing fields to preserve positional-argument order.
    """
    n_symbol_train: int = 20  # symbol tasks evaluated from the train split
    n_symbol_test: int = 10  # symbol tasks evaluated from the test split
    n_grid_train: int = 20  # grid tasks evaluated from the train split
    n_grid_test: int = 10  # grid tasks evaluated from the test split
    n_reg_train: int = 20  # sin tasks used for pre-training
    n_reg_test: int = 10  # unseen sin tasks used for evaluation
    k_shot: int = 10  # support points per regression task
    q_points: int = 50  # query points per regression task
    mm_trials: int = 20  # number of txt->img matching trials

    # Carried over from the earlier EvalConfig definition (same defaults).
    mm_candidates: int = 4
    grid_trials: int = 5
    grid_max_steps: int = 20

def mc_dropout_uncertainty(agent: SimpleAgentAdapter, x: np.ndarray, n_passes: int) -> float:
    """Estimate predictive variance of the regression model via MC dropout.

    The model is run ``n_passes`` times on ``x`` with dropout active, and
    the variance across passes is averaged over all outputs.

    Args:
        agent: Agent whose ``reg_model`` is evaluated.
        x: Input array passed to the regression model.
        n_passes: Number of stochastic forward passes.

    Returns:
        The mean per-output variance, or ``nan`` when torch is unavailable,
        the agent has no regression model, or ``n_passes`` is not positive.
    """
    if not TORCH_AVAILABLE or agent.reg_model is None or n_passes <= 0:
        return float("nan")

    model = agent.reg_model
    was_training = model.training
    model.train(True)  # enable dropout
    try:
        with torch.no_grad():
            xt = torch.from_numpy(x).float().to(DEVICE)
            preds = [model(xt).cpu().numpy() for _ in range(n_passes)]
    finally:
        # Put the model back in whatever mode the caller had it in.
        model.train(was_training)

    stacked = np.stack(preds, axis=0)  # [passes, B, 1]
    return float(np.mean(np.var(stacked, axis=0)))

# %% [markdown]
# ## 4. Orchestrator

# %%
def run_all(agent: AgentAdapter, eval_cfg=None, run_id=None):
    """Run every task family, one reflection cycle and a post-reflect probe.

    This is a single orchestrator covering all sections. Previously the
    regression section sat at module level, and a second ``run_all``
    definition replaced the first, silently dropping the symbol, grid and
    regression evaluations.

    Args:
        agent: The adapter under evaluation.
        eval_cfg: Evaluation sizes; a fresh ``EvalConfig()`` when ``None``.
        run_id: Tag stored on every record and used for the log file name;
            defaults to the module-level ``RUN_ID``.

    Returns:
        ``(df, refl_notes, sym_test, grid_test, sin_test)``: all records as
        a DataFrame, the result of ``agent.reflect``, and the three test
        splits for follow-up analysis.

    Side effects:
        Prints the reflection notes and appends every record to
        ``runs/<run_id>.jsonl``.
    """
    if eval_cfg is None:
        eval_cfg = EvalConfig()
    if run_id is None:
        run_id = RUN_ID
    records = []

    def add(result):
        records.append(dataclasses.asdict(result) | {"run_id": run_id})

    # --- SYMBOL ---
    sym_train, sym_test = generate_symbol_dataset(SEED)
    for task in sym_train[:eval_cfg.n_symbol_train]:
        add(run_symbol_episode(agent, task, split="train"))
    for task in sym_test[:eval_cfg.n_symbol_test]:
        add(run_symbol_episode(agent, task, split="test"))

    # --- GRID ---
    grid_train, grid_test = generate_grid_dataset(SEED)
    for task in grid_train[:eval_cfg.n_grid_train]:
        add(run_grid_episode(agent, task, split="train"))
    for task in grid_test[:eval_cfg.n_grid_test]:
        add(run_grid_episode(agent, task, split="test"))

    # --- REGRESSION (forward-transfer proxy: pre/post MSE on unseen tasks) ---
    sin_train, sin_test = generate_sin_tasks(SEED)
    # At least one support point, so the adaptation step never sees an empty batch.
    test_k_shot = max(1, eval_cfg.k_shot // 2)
    # Pretrain a bit across train tasks
    for task in sin_train[:eval_cfg.n_reg_train]:
        add(run_regression_episode(agent, task, k_shot=eval_cfg.k_shot,
                                   q_points=eval_cfg.q_points, split="train"))
    # Evaluate on unseen tasks with minimal adaptation
    for task in sin_test[:eval_cfg.n_reg_test]:
        add(run_regression_episode(agent, task, k_shot=test_k_shot,
                                   q_points=eval_cfg.q_points, split="test"))
        # Epistemic uncertainty (only available for the baseline adapter)
        xq, _ = sample_sin_task(64, task)
        if isinstance(agent, SimpleAgentAdapter):
            unc = mc_dropout_uncertainty(agent, xq, n_passes=agent.cfg.mc_dropout_passes)
        else:
            unc = float("nan")
        records[-1]["meta"]["epistemic_var"] = unc

    # --- MULTIMODAL (compositional holdout) ---
    mm_train, mm_test = generate_multimodal_dataset(SEED)
    # Older EvalConfig variants have no mm_candidates field; 4 was the
    # previously hard-coded candidate count.
    n_distractors = max(0, getattr(eval_cfg, "mm_candidates", 4) - 1)

    # txt->img (train). The image matching the query text is always among the
    # candidates; otherwise the trial could not be solved by any agent.
    for _ in range(eval_cfg.mm_trials):
        if not mm_train:
            break
        target_pos = random.randrange(len(mm_train))
        target_img, text = mm_train[target_pos]
        others = mm_train[:target_pos] + mm_train[target_pos + 1:]
        distractors = random.sample(others, k=min(n_distractors, len(others)))
        imgs = [target_img] + [im for (im, _) in distractors]
        random.shuffle(imgs)
        add(run_multimodal_episode(agent, mm_train, (text, imgs), mode="txt2img", split="train"))

    # img->txt (held-out compositions in test)
    for _ in range(max(5, eval_cfg.mm_trials // 2)):
        if not mm_test:
            break
        im, t_hold = random.choice(mm_test)
        distractors = random.sample(mm_train, k=min(n_distractors, len(mm_train)))
        candidates = [t_hold] + [t for (_, t) in distractors]
        random.shuffle(candidates)
        add(run_multimodal_episode(agent, mm_train + mm_test, (im, candidates), mode="img2txt", split="test"))

    # --- Reflection cycle ---
    refl_notes = agent.reflect(records)
    print("Reflection:", refl_notes)

    # Slim post-reflect pass: one task per family from the test splits.
    if sym_test:
        add(run_symbol_episode(agent, sym_test[0], split="post-reflect"))
    if grid_test:
        add(run_grid_episode(agent, grid_test[0], split="post-reflect"))
    if sin_test:
        add(run_regression_episode(agent, sin_test[0], k_shot=test_k_shot,
                                   q_points=eval_cfg.q_points, split="post-reflect"))

    # Persist
    out_path = f"runs/{run_id}.jsonl"
    for rec in records:
        log_jsonl(out_path, rec)

    # Everything the dashboard and post-reflect analysis need.
    return pd.DataFrame.from_records(records), refl_notes, sym_test, grid_test, sin_test

# %% [markdown]
# ## 5. Dashboard

# %%
def dashboard(df: pd.DataFrame):
    """Summarise and plot the records produced by an evaluation run.

    Shows a per-(task_id, split) table of success rate, mean reward, mean
    steps and count, a bar chart of success rates, a histogram of regression
    MSE before/after adaptation, and the epistemic variance recorded on
    test-split records.

    Args:
        df: Records with at least the columns ``task_id``, ``split``,
            ``success``, ``reward``, ``steps`` and ``meta``. The frame is
            not modified.
    """
    # Work on a copy so the dtype cast below does not leak into the caller's frame.
    df = df.copy()
    df["success"] = df["success"].astype(int)

    # Summary by task/split
    agg = df.groupby(["task_id", "split"]).agg(
        success_rate=("success", "mean"),
        avg_reward=("reward", "mean"),
        avg_steps=("steps", "mean"),
        n=("success", "count"),
    ).reset_index()
    # `display` only exists inside IPython/Jupyter; fall back to print elsewhere.
    try:
        show = display
    except NameError:
        show = print
    show(agg)

    # Plot success rates
    pivot = agg.pivot(index="task_id", columns="split", values="success_rate").fillna(0.0)
    pivot.plot(kind="bar", figsize=(10, 4), ylim=(0, 1), title="Success rate by task and split")
    plt.axhline(0.5, color="gray", linestyle="--", linewidth=1)
    plt.show()

    # Regression pre/post MSE distribution
    reg = df[df["task_id"] == "regress/sin"]
    if not reg.empty:
        metas = [m for m in reg["meta"] if isinstance(m, dict)]
        pre = [m["pre_mse"] for m in metas if "pre_mse" in m]
        post = [m["post_mse"] for m in metas if "post_mse" in m]
        if pre and post:
            plt.figure(figsize=(6, 4))
            plt.hist(pre, bins=15, alpha=0.6, label="pre")
            plt.hist(post, bins=15, alpha=0.6, label="post")
            plt.title("Regression MSE pre vs post adaptation")
            plt.legend()
            plt.show()

    # Epistemic variance on test (sin tasks)
    ev = [
        m["epistemic_var"]
        for m in df[df["split"] == "test"]["meta"]
        if isinstance(m, dict) and "epistemic_var" in m
    ]
    if ev:
        plt.figure(figsize=(6, 3))
        plt.plot(ev, marker="o")
        plt.title("Epistemic variance across unseen sin tasks")
        plt.show()

# %% [markdown]
# ## 6. Run — using the provided SimpleAgentAdapter

# %%
# Baseline configuration; unspecified fields keep their AgentConfig defaults.
cfg = AgentConfig(
    name="SimpleAgent",
    use_world_model=False,
    use_multimodal_encoder=True,
    notes="baseline"
)

# Rebinds the module-level `agent` (previously the dummy adapter).
agent = SimpleAgentAdapter(cfg)  # <-- your real adapter class
# Run the evaluation, then render the summary table and plots.
df, refl_notes, sym_test, grid_test, sin_test = run_all(agent)
dashboard(df)
print("Run complete:", RUN_ID)


# %% [markdown]
# ---
# ## 7. Integration hooks for your AGI agent
#
# If you have your own AGI scaffold (with memory, planner, reflector, etc.),
# wrap it in an adapter like this. Only run this block once `MyAGIAgent` exists.

# %%
class MyAGIAgentAdapter(AgentAdapter):
    """Template adapter that forwards every harness call to a ``MyAGIAgent``.

    ``MyAGIAgent`` is not defined in this file. Defining this class is
    harmless, but instantiating it raises ``NameError`` until that name
    exists.
    """

    def __init__(self, cfg: AgentConfig):
        super().__init__(cfg)
        # Instantiate your full AGI scaffold here. The `...` (Ellipsis)
        # arguments are placeholders to be replaced with real modules.
        self.agent = MyAGIAgent(
            # pass in any required modules/configs for your scaffold
            memory_module=...,
            planner_module=...,
            reflector_module=...,
            world_model=...,
            multimodal_encoder=...
        )

    def act(self, obs, task_id, step, state):
        """Delegate to the wrapped agent's ``act``."""
        return self.agent.act(obs, task_id, step, state)

    def learn(self, batch, task_id):
        """Delegate to the wrapped agent's ``learn``."""
        return self.agent.learn(batch, task_id)

    def reflect(self, logs):
        """Delegate to the wrapped agent's ``reflect``."""
        return self.agent.reflect(logs)

    def encode(self, modality, data):
        """Delegate to the wrapped agent's ``encode``."""
        return self.agent.encode(modality, data)

    def imagine(self, state, n_steps=5):
        """Delegate to ``rollout`` on the wrapped agent's ``world_model``."""
        return self.agent.world_model.rollout(state, n_steps)


# Example usage — only if MyAGIAgent is implemented:
# cfg = AgentConfig(
#     name="MyAGIAgent",
#     use_world_model=True,
#     use_multimodal_encoder=True,
#     notes="full AGI scaffold"
# )
# agent = MyAGIAgentAdapter(cfg)
# df, refl_notes, sym_test, grid_test, sin_test = run_all(agent)
# dashboard(df)