# Tool-call Monitor: Diff-in-Means and Logistic Probe

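This notebook trains two simple probes on tool-call activations:
- Diff-in-means vector: the mean "good" training activation minus the mean "bad" training activation; examples are scored by their dot product with this vector.
- Logistic classifier: a single linear layer trained with binary cross-entropy (good = 1, bad = 0).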

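It expects activations saved by `pipeline.py` as JSON Lines files at `data/activations/good/activations.jsonl` and `data/activations/bad/activations.jsonl`. Each non-blank line must be a JSON object with an `activation` field holding that example's activation vector.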

In [None]:
import json
import os
from pathlib import Path
import random
from typing import List, Dict

import torch

# Seed Python's and PyTorch's global RNGs so that the shuffled train/eval
# splits and the linear probe's initial weights are the same on every run.
random.seed(0)
torch.manual_seed(0)

In [None]:
# Where the activation dumps live: one JSON Lines file per class, each inside
# its own sub-directory of DATA_ROOT.
DATA_ROOT = Path("data/activations")
GOOD_PATH = DATA_ROOT.joinpath("good", "activations.jsonl")
BAD_PATH = DATA_ROOT.joinpath("bad", "activations.jsonl")

def load_jsonl(path: Path) -> List[Dict]:
    """Read a JSON Lines file into a list of parsed rows.

    Blank and whitespace-only lines are skipped; every other line is parsed
    with ``json.loads``.

    Args:
        path: Location of the JSON Lines file.

    Returns:
        The parsed value of each non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and 1-based line number.
    """
    if not path.exists():
        raise FileNotFoundError(f"Missing activations file: {path}")
    rows = []
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would mis-read non-ASCII content on some systems.
    with path.open(encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Re-raise the same exception type so existing handlers keep
                # working, but say which file and line was malformed.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {err.msg}", err.doc, err.pos
                ) from err
    return rows

# Parse both activation dumps (good first, then bad) and report their sizes.
good_rows, bad_rows = [load_jsonl(source) for source in (GOOD_PATH, BAD_PATH)]
print(f"Loaded {len(good_rows)} good activations, {len(bad_rows)} bad activations")

In [None]:
def select_examples(rows: List[Dict], n_train: int, n_eval: int) -> Dict[str, List]:
    """Randomly split ``rows`` into disjoint train and eval subsets.

    The selection is driven by the global ``random`` module state, so seeding
    it beforehand makes the split reproducible.

    Args:
        rows: Candidate examples. The list is left untouched.
        n_train: Number of examples to place in the train subset.
        n_eval: Number of examples to place in the eval subset.

    Returns:
        A dict with keys ``"train"`` and ``"eval"`` holding ``n_train`` and
        ``n_eval`` rows respectively, with no row appearing in both.

    Raises:
        ValueError: If either count is negative, or if ``rows`` has fewer
            than ``n_train + n_eval`` entries.
    """
    if n_train < 0 or n_eval < 0:
        raise ValueError(
            f"n_train and n_eval must be non-negative, got {n_train} and {n_eval}"
        )
    if len(rows) < n_train + n_eval:
        raise ValueError(f"Need at least {n_train + n_eval} rows, got {len(rows)}")
    # Shuffle a copy so the caller's list keeps its order. For a given RNG
    # state this yields the same permutation as shuffling in place.
    shuffled = list(rows)
    random.shuffle(shuffled)
    return {
        "train": shuffled[:n_train],
        "eval": shuffled[n_train:n_train + n_eval],
    }

# Per-class sample budget: rows used to fit the probes vs. rows held out for
# scoring them.
N_TRAIN_PER_CLASS = 8
N_EVAL_PER_CLASS = 2

# Draw a train/eval split for each class independently, good first, then bad.
splits = {
    label: select_examples(class_rows, N_TRAIN_PER_CLASS, N_EVAL_PER_CLASS)
    for label, class_rows in (("good", good_rows), ("bad", bad_rows))
}

In [None]:
def rows_to_tensor(rows: List[Dict]) -> torch.Tensor:
    """Stack the ``"activation"`` entry of every row into one float32 tensor.

    Entry ``i`` along the first dimension of the result corresponds to
    ``rows[i]["activation"]``.
    """
    activations = [row["activation"] for row in rows]
    return torch.tensor(activations, dtype=torch.float32)

# Assemble the feature matrices. Within each split the good rows are stacked
# first and the bad rows second.
train_x = torch.cat(
    [rows_to_tensor(splits[label]["train"]) for label in ("good", "bad")],
    dim=0,
)
# Labels follow the same ordering: 1 for good, 0 for bad.
train_y = torch.cat((
    torch.ones(len(splits["good"]["train"])),
    torch.zeros(len(splits["bad"]["train"])),
))

eval_x = torch.cat(
    [rows_to_tensor(splits[label]["eval"]) for label in ("good", "bad")],
    dim=0,
)
eval_y = torch.cat((
    torch.ones(len(splits["good"]["eval"])),
    torch.zeros(len(splits["bad"]["eval"])),
))

print("Train shape:", train_x.shape, "Eval shape:", eval_x.shape)

## Diff-in-means probe

In [None]:
# The stacked training matrix holds the good rows first, so the first
# len(splits["good"]["train"]) rows are good and the remainder are bad.
n_good_train = len(splits["good"]["train"])
good_mean = train_x[:n_good_train, :].mean(dim=0)
bad_mean = train_x[n_good_train:, :].mean(dim=0)
diff_vec = good_mean - bad_mean

def score_diff(x: torch.Tensor) -> torch.Tensor:
    """Project activations onto the diff-in-means direction.

    Returns the dot product of ``x`` (or of each row of ``x``) with
    ``diff_vec``; larger values lie further towards the good-class mean.
    """
    return torch.matmul(x, diff_vec)

# Decision threshold: the score of the point halfway between the two class
# means. Thresholding the raw projection at 0 would place the boundary through
# the origin, which only separates the classes if the activations happen to be
# centred there.
diff_threshold = score_diff((good_mean + bad_mean) / 2)

with torch.no_grad():
    scores = score_diff(eval_x)
    preds = (scores > diff_threshold).float()
    matches = preds == eval_y
    acc = matches.float().mean().item()
    # Count hits directly: int(acc * n) truncates and can under-report by one
    # when the float32 mean is slightly below the exact ratio.
    n_correct = int(matches.sum().item())
    print(f"Diff-in-means accuracy: {acc*100:.1f}% ({n_correct}/{len(eval_y)})")

## Logistic classifier

In [None]:
# A single linear layer maps each activation to one logit.
# BCEWithLogitsLoss applies the sigmoid itself, so the layer outputs raw logits.
logit = torch.nn.Linear(train_x.shape[1], 1)
optimizer = torch.optim.Adam(logit.parameters(), lr=1e-2)
loss_fn = torch.nn.BCEWithLogitsLoss()

EPOCHS = 200
# Full-batch training: every step uses the whole training set.
for epoch in range(EPOCHS):
    optimizer.zero_grad()
    logits = logit(train_x).squeeze(-1)
    loss = loss_fn(logits, train_y)
    loss.backward()
    optimizer.step()

with torch.no_grad():
    eval_logits = logit(eval_x).squeeze(-1)
    eval_preds = (torch.sigmoid(eval_logits) > 0.5).float()
    eval_matches = eval_preds == eval_y
    eval_acc = eval_matches.float().mean().item()
    # Count hits directly: int(eval_acc * n) truncates and can under-report by
    # one when the float32 mean is slightly below the exact ratio.
    eval_correct = int(eval_matches.sum().item())
    print(f"Logistic accuracy: {eval_acc*100:.1f}% ({eval_correct}/{len(eval_y)})")