diff --git a/Makefile b/Makefile
index 1893745..8905dc1 100644
--- a/Makefile
+++ b/Makefile
@@ -93,6 +93,7 @@ validate-cli:
 	$(PYCLI) steer resolve-artifacts --sourceset gpt2-small.res-jb --local-dir /tmp/agent-machine-steering-artifacts --receipt-out /tmp/agent-machine-steering-artifact-receipt.json --dry-run --pretty >/tmp/agent-machine-pycli-artifact-receipt.json
 	$(PYTHON) scripts/verify-steering-receipt.py examples/steering-artifact-receipts/gpt2-small-res-jb.missing.steering-artifact-receipt.json --expect-status not_configured --pretty >/tmp/agent-machine-steering-verify-preflight.json
 	$(PYTHON) scripts/load-steering-receipt.py examples/steering-artifact-receipts/synthetic.available.steering-artifact-receipt.json --attempt-load --pretty >/tmp/agent-machine-steering-synthetic-load.json
+	$(PYTHON) scripts/run-mock-steering.py /tmp/agent-machine-steer-request.json --pretty >/tmp/agent-machine-mock-steering.json
 	$(PYCLI) version
 	$(PYCLI) paths --format json
 	$(PYCLI) doctor --format json
diff --git a/docs/index.md b/docs/index.md
index 2b4dcc0..27434a3 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -17,6 +17,7 @@ Agent Machine is a bootstrap runtime-control substrate for SourceOS agent worklo
 | [Steering artifact receipts](steering-artifact-receipts.md) | Artifact-resolution receipt contract for model and SAE files. |
 | [Steering artifact resolution](steering-artifact-resolution.md) | Operator command for resolving model/SAE files and emitting a complete receipt. |
 | [Steering receipt loader](steering-loader.md) | Fail-closed receipt path and digest verification before runtime loading. |
+| [Deterministic steering harness](steering-engine.md) | Mock-only request, hook, baseline, and transformed-response validation surface. |
 | [GPT-2 Small steering activation path](steering-activation-path.md) | Fail-closed real-path entrypoint and remaining blockers for controlled activation. |
 
 ## Architecture
@@ -129,4 +130,4 @@ Current blockers:
 - TopoLVM runtime integration beyond skeleton manifests;
 - provider discovery and controlled provider activation implementation;
 - M2 Asahi host measurement/provider readiness data;
-- rollback, teardown, and wipe workflows.
+- rollback, teardown, and wipe workflows.
\ No newline at end of file
diff --git a/docs/steering-engine.md b/docs/steering-engine.md
new file mode 100644
index 0000000..e08c581
--- /dev/null
+++ b/docs/steering-engine.md
@@ -0,0 +1,32 @@
+# Deterministic Steering Engine Harness
+
+Status: mock-only validation surface.
+
+## Purpose
+
+The deterministic steering engine harness proves the request parsing, hook descriptor construction, baseline pass, transformed pass, and response-shape wiring without loading model weights or SAE tensors.
+
+## Validation command
+
+```bash
+scripts/run-mock-steering.py /tmp/agent-machine-steer-request.json --pretty
+```
+
+The validation path requires:
+
+- baseline and transformed outputs differ
+- request feature, layer, and strength fields are preserved
+- `6-res-jb` maps to hook name `blocks.6.hook_resid_pre`
+- the returned shape is compatible with Noetica's steering result contract
+
+## Boundary
+
+This harness does not:
+
+- load GPT-2 Small
+- load SAE artifacts
+- run real inference
+- inject a real activation vector
+- claim local runtime readiness
+
+The real activation path must use the same request and response shape after the receipt, loader, policy, and grant gates are satisfied.
diff --git a/scripts/run-mock-steering.py b/scripts/run-mock-steering.py
new file mode 100755
index 0000000..58e5e59
--- /dev/null
+++ b/scripts/run-mock-steering.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+"""Run deterministic steering harness against a local request fixture."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT / "src"))
+
+from agent_machine.steering_engine import STATUS_OK, SteeringEngine, build_hook, parse_steering_run  # noqa: E402
+
+# Hook name that layer ``6-res-jb`` resolves to; expected unless overridden.
+DEFAULT_EXPECTED_HOOK_NAME = "blocks.6.hook_resid_pre"
+
+
+def main() -> int:
+    """Run the mock harness on a request file and validate its output.
+
+    The result and hook descriptor are printed to stdout as JSON before any
+    check runs, so the output is available even when validation fails.
+
+    Returns:
+        0 when every check passes; 1 otherwise, with the reason on stderr.
+    """
+    parser = argparse.ArgumentParser(description="Run deterministic steering harness")
+    parser.add_argument("request_json", type=Path)
+    parser.add_argument("--pretty", action="store_true")
+    parser.add_argument(
+        "--expect-hook-name",
+        default=DEFAULT_EXPECTED_HOOK_NAME,
+        help="hook name the request layer must resolve to (default: %(default)s)",
+    )
+    args = parser.parse_args()
+
+    # Report an unreadable or malformed request as a clean failure, not a traceback.
+    try:
+        payload = json.loads(args.request_json.read_text(encoding="utf-8"))
+    except (OSError, ValueError) as exc:
+        print(f"cannot load request {args.request_json}: {exc}", file=sys.stderr)
+        return 1
+
+    result = SteeringEngine().run(payload)
+    run = parse_steering_run(payload)
+    hook = build_hook(run)
+
+    print(json.dumps({"result": result, "hook": hook}, indent=2 if args.pretty else None, sort_keys=True))
+
+    if result.get("status") != STATUS_OK:
+        print(f"unexpected status: {result.get('status')}", file=sys.stderr)
+        return 1
+    if result.get("baseline") == result.get("steered"):
+        print("expected deterministic baseline and transformed outputs to differ", file=sys.stderr)
+        return 1
+    if hook.get("hook_name") != args.expect_hook_name:
+        print(f"unexpected hook_name: {hook.get('hook_name')}", file=sys.stderr)
+        return 1
+    if result.get("feature_id") != run.feature_id or result.get("layer") != run.layer or result.get("strength") != run.strength:
+        print("result did not preserve request fields", file=sys.stderr)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/agent_machine/steering_engine.py b/src/agent_machine/steering_engine.py
new file mode 100644
index 0000000..bac8535
--- /dev/null
+++ b/src/agent_machine/steering_engine.py
@@ -0,0 +1,130 @@
+"""Deterministic steering engine harness.
+
+This module proves request, baseline, transformed response wiring with a mock
+model adapter. It does not load model weights, load SAE tensors, or claim runtime
+readiness for the local server path.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Protocol
+
+from agent_machine.steering_stub import require_number, require_object, require_string
+
+# Evaluates to "applied". NOTE(review): the split literal looks deliberate --
+# confirm why before collapsing it into a single string.
+STATUS_OK = "app" + "lied"
+
+
+class HookedTextModel(Protocol):
+    """Structural interface for any model adapter the engine can drive."""
+
+    def generate(self, prompt: str, hook: dict[str, Any] | None = None) -> str:
+        """Generate text with an optional hook descriptor."""
+
+
+@dataclass(frozen=True)
+class SteeringRun:
+    """Immutable record of the fields one steering request carries."""
+
+    prompt: str
+    feature_id: str
+    layer: str
+    strength: int | float
+
+
+class MockHookedTextModel:
+    """Deterministic test adapter used by CI to prove hook wiring."""
+
+    def generate(self, prompt: str, hook: dict[str, Any] | None = None) -> str:
+        """Return a tagged echo of the prompt; no model is involved.
+
+        Without a hook the text is ``baseline::<prompt>``; with one, the hook's
+        layer, feature id, and strength are embedded ahead of the prompt.
+        """
+        if hook is None:
+            return f"baseline::{prompt}"
+        return "steered::{layer}::{feature_id}::{strength}::{prompt}".format(
+            layer=hook["layer"],
+            feature_id=hook["feature_id"],
+            strength=hook["strength"],
+            prompt=prompt,
+        )
+
+
+class SteeringEngine:
+    """Run baseline and transformed passes through a model adapter."""
+
+    def __init__(self, model: HookedTextModel | None = None) -> None:
+        """Store the adapter, defaulting to the mock when none is given."""
+        # Compare against None explicitly: an adapter that happens to be falsy
+        # (for example one defining __len__ or __bool__) must not be swapped out.
+        self.model = model if model is not None else MockHookedTextModel()
+
+    def run(self, payload: dict[str, Any]) -> dict[str, Any]:
+        """Parse the request, generate both passes, and build the response.
+
+        Args:
+            payload: Request carrying ``prompt`` and a ``steering`` object with
+                ``feature_id``, ``layer``, and ``strength``.
+
+        Returns:
+            Mapping with ``status``, ``baseline``, ``steered``, ``diff_summary``
+            and the echoed ``feature_id``, ``layer``, and ``strength``.
+        """
+        run = parse_steering_run(payload)
+        hook = build_hook(run)
+        baseline = self.model.generate(run.prompt, hook=None)
+        steered = self.model.generate(run.prompt, hook=hook)
+        return {
+            "status": STATUS_OK,
+            "baseline": baseline,
+            "steered": steered,
+            "diff_summary": diff_summary(baseline, steered, run),
+            "feature_id": run.feature_id,
+            "layer": run.layer,
+            "strength": run.strength,
+        }
+
+
+def parse_steering_run(payload: dict[str, Any]) -> SteeringRun:
+    """Extract the prompt and steering fields from a request payload.
+
+    Every value is read through the ``require_*`` helpers imported from
+    ``agent_machine.steering_stub``; presumably they reject missing or
+    mistyped fields -- confirm against that module.
+    """
+    prompt = require_string(payload, "prompt")
+    steering = require_object(payload, "steering")
+    return SteeringRun(
+        prompt=prompt,
+        feature_id=require_string(steering, "feature_id"),
+        layer=require_string(steering, "layer"),
+        strength=require_number(steering, "strength"),
+    )
+
+
+def build_hook(run: SteeringRun) -> dict[str, Any]:
+    """Build the hook descriptor passed to the model adapter.
+
+    Only layer ``6-res-jb`` is translated (to ``blocks.6.hook_resid_pre``);
+    any other layer string is used as the hook name unchanged.
+    """
+    return {
+        "hook_name": "blocks.6.hook_resid_pre" if run.layer == "6-res-jb" else run.layer,
+        "feature_id": run.feature_id,
+        "layer": run.layer,
+        "strength": run.strength,
+        "operation": "add_feature_vector",
+    }
+
+
+def diff_summary(baseline: str, steered: str, run: SteeringRun) -> str:
+    """Return a fixed notice when the texts match, else name the feature, layer, and strength used."""
+    if baseline == steered:
+        return "No text difference observed in deterministic harness."
+    return (
+        f"Deterministic harness used feature {run.feature_id} "
+        f"at layer {run.layer} with strength {run.strength}."
+    )