Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ validate-cli:
$(PYCLI) steer resolve-artifacts --sourceset gpt2-small.res-jb --local-dir /tmp/agent-machine-steering-artifacts --receipt-out /tmp/agent-machine-steering-artifact-receipt.json --dry-run --pretty >/tmp/agent-machine-pycli-artifact-receipt.json
$(PYTHON) scripts/verify-steering-receipt.py examples/steering-artifact-receipts/gpt2-small-res-jb.missing.steering-artifact-receipt.json --expect-status not_configured --pretty >/tmp/agent-machine-steering-verify-preflight.json
$(PYTHON) scripts/load-steering-receipt.py examples/steering-artifact-receipts/synthetic.available.steering-artifact-receipt.json --attempt-load --pretty >/tmp/agent-machine-steering-synthetic-load.json
$(PYTHON) scripts/run-mock-steering.py /tmp/agent-machine-steer-request.json --pretty >/tmp/agent-machine-mock-steering.json
$(PYCLI) version
$(PYCLI) paths --format json
$(PYCLI) doctor --format json
Expand Down
3 changes: 2 additions & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Agent Machine is a bootstrap runtime-control substrate for SourceOS agent worklo
| [Steering artifact receipts](steering-artifact-receipts.md) | Artifact-resolution receipt contract for model and SAE files. |
| [Steering artifact resolution](steering-artifact-resolution.md) | Operator command for resolving model/SAE files and emitting a complete receipt. |
| [Steering receipt loader](steering-loader.md) | Fail-closed receipt path and digest verification before runtime loading. |
| [Deterministic steering harness](steering-engine.md) | Mock-only request, hook, baseline, and transformed-response validation surface. |
| [GPT-2 Small steering activation path](steering-activation-path.md) | Fail-closed real-path entrypoint and remaining blockers for controlled activation. |

## Architecture
Expand Down Expand Up @@ -129,4 +130,4 @@ Current blockers:
- TopoLVM runtime integration beyond skeleton manifests;
- provider discovery and controlled provider activation implementation;
- M2 Asahi host measurement/provider readiness data;
- rollback, teardown, and wipe workflows.
- rollback, teardown, and wipe workflows.
32 changes: 32 additions & 0 deletions docs/steering-engine.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Deterministic Steering Engine Harness

Status: mock-only validation surface.

## Purpose

The deterministic steering engine harness proves the request parsing, hook descriptor construction, baseline pass, transformed pass, and response-shape wiring without loading model weights or SAE tensors.

## Validation command

```bash
scripts/run-mock-steering.py /tmp/agent-machine-steer-request.json --pretty
```

The validation path requires:

- baseline and transformed outputs differ
- request feature, layer, and strength fields are preserved
- `6-res-jb` maps to hook name `blocks.6.hook_resid_pre`
- the returned shape is compatible with Noetica's steering result contract

## Boundary

This harness does not:

- load GPT-2 Small
- load SAE artifacts
- run real inference
- inject a real activation vector
- claim local runtime readiness

The real activation path must use the same request and response shape after the receipt, loader, policy, and grant gates are satisfied.
46 changes: 46 additions & 0 deletions scripts/run-mock-steering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env python3
"""Run deterministic steering harness against a local request fixture."""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

# Repository root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Put <root>/src first on sys.path so the agent_machine import below resolves
# against this checkout without requiring the package to be installed.
sys.path.insert(0, str(ROOT / "src"))

from agent_machine.steering_engine import STATUS_OK, SteeringEngine, build_hook, parse_steering_run # noqa: E402


def _fail(message: str) -> int:
    """Write a failed-check message to stderr and return exit code 1."""
    print(message, file=sys.stderr)
    return 1


def main() -> int:
    """Run the harness on a request file and validate the outcome.

    Reads the JSON file named on the command line, runs it through
    ``SteeringEngine``, rebuilds the hook descriptor from the same payload,
    and prints both as one JSON document on stdout (indented when
    ``--pretty`` is given). The checks run after the report is printed.

    Returns:
        0 when every check passes; 1 after the first failing check, whose
        message is written to stderr.
    """
    cli = argparse.ArgumentParser(description="Run deterministic steering harness")
    cli.add_argument("request_json", type=Path)
    cli.add_argument("--pretty", action="store_true")
    options = cli.parse_args()

    request_text = options.request_json.read_text(encoding="utf-8")
    payload = json.loads(request_text)
    result = SteeringEngine().run(payload)
    # Parse the payload again so the checks below compare the engine result
    # against an independently built run and hook descriptor.
    run = parse_steering_run(payload)
    hook = build_hook(run)

    report = {"result": result, "hook": hook}
    indent = 2 if options.pretty else None
    print(json.dumps(report, indent=indent, sort_keys=True))

    status = result.get("status")
    if status != STATUS_OK:
        return _fail(f"unexpected status: {status}")

    if result.get("baseline") == result.get("steered"):
        return _fail("expected deterministic baseline and transformed outputs to differ")

    hook_name = hook.get("hook_name")
    if hook_name != "blocks.6.hook_resid_pre":
        return _fail(f"unexpected hook_name: {hook_name}")

    # The result must echo the request fields unchanged.
    fields_preserved = all(
        result.get(name) == getattr(run, name)
        for name in ("feature_id", "layer", "strength")
    )
    if not fields_preserved:
        return _fail("result did not preserve request fields")

    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
94 changes: 94 additions & 0 deletions src/agent_machine/steering_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""Deterministic steering engine harness.

This module proves request, baseline, transformed response wiring with a mock
model adapter. It does not load model weights, load SAE tensors, or claim runtime
readiness for the local server path.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Protocol

from agent_machine.steering_stub import require_number, require_object, require_string

# Status string the engine places in a successful run's result; the
# concatenation evaluates to "applied".
# NOTE(review): why the literal is split in two is not evident from this
# module — confirm before simplifying it.
STATUS_OK = "app" + "lied"


class HookedTextModel(Protocol):
    """Structural interface for text-generation adapters that accept a hook."""

    def generate(self, prompt: str, hook: dict[str, Any] | None = None) -> str:
        """Produce text for ``prompt``; ``hook`` is an optional hook descriptor.

        Passing ``None`` for ``hook`` requests a pass with no hook applied.
        """


@dataclass(frozen=True)
class SteeringRun:
    """Immutable record of the request fields the harness works with."""

    # Prompt text handed to the model adapter for both passes.
    prompt: str
    # Feature identifier taken from the request's steering section.
    feature_id: str
    # Layer identifier as given in the request (for example "6-res-jb").
    layer: str
    # Steering strength as given in the request.
    strength: int | float


class MockHookedTextModel:
    """Deterministic test adapter used by CI to prove hook wiring.

    The output is built only from the prompt and, when a hook is given, the
    hook descriptor's ``layer``, ``feature_id`` and ``strength`` entries.
    """

    def generate(self, prompt: str, hook: dict[str, Any] | None = None) -> str:
        """Return a ``baseline::`` string without a hook, ``steered::`` with one.

        Raises:
            KeyError: If ``hook`` is given but lacks ``layer``,
                ``feature_id`` or ``strength``.
        """
        if hook is not None:
            # Look the entries up in a fixed order so a missing key is
            # reported for the first absent field.
            layer = hook["layer"]
            feature_id = hook["feature_id"]
            strength = hook["strength"]
            return f"steered::{layer}::{feature_id}::{strength}::{prompt}"
        return f"baseline::{prompt}"


class SteeringEngine:
    """Run baseline and transformed passes through a model adapter.

    The adapter only needs a ``generate(prompt, hook=...)`` method. When no
    adapter is supplied, the deterministic ``MockHookedTextModel`` is used.
    """

    def __init__(self, model: HookedTextModel | None = None) -> None:
        """Store the adapter, falling back to the mock only when none is given.

        Args:
            model: Adapter to drive; ``None`` selects ``MockHookedTextModel``.
        """
        # Compare against None explicitly: ``model or ...`` would silently
        # swap a supplied adapter whose truth value is False (for example one
        # defining ``__len__`` or ``__bool__``) for the mock.
        self.model = MockHookedTextModel() if model is None else model

    def run(self, payload: dict[str, Any]) -> dict[str, Any]:
        """Execute one unhooked and one hooked generation for ``payload``.

        Args:
            payload: Request mapping, parsed by ``parse_steering_run``.

        Returns:
            A dict with ``status`` (always ``STATUS_OK``), the ``baseline``
            and ``steered`` texts, a ``diff_summary`` string, and the
            request's ``feature_id``, ``layer`` and ``strength``.

        Any exception raised while parsing the payload or by the adapter
        propagates unchanged.
        """
        run = parse_steering_run(payload)
        hook = build_hook(run)
        # Same prompt for both passes; only the hook descriptor differs.
        baseline = self.model.generate(run.prompt, hook=None)
        steered = self.model.generate(run.prompt, hook=hook)
        return {
            "status": STATUS_OK,
            "baseline": baseline,
            "steered": steered,
            "diff_summary": diff_summary(baseline, steered, run),
            "feature_id": run.feature_id,
            "layer": run.layer,
            "strength": run.strength,
        }


def parse_steering_run(payload: dict[str, Any]) -> SteeringRun:
    """Build a ``SteeringRun`` from a request payload.

    Reads ``prompt`` from the top level and ``feature_id``, ``layer`` and
    ``strength`` from the nested ``steering`` entry, in that order.

    NOTE(review): the ``require_*`` helpers come from
    ``agent_machine.steering_stub``; presumably they reject missing or
    mistyped values — confirm their exception types there.
    """
    prompt_text = require_string(payload, "prompt")
    steering_section = require_object(payload, "steering")
    feature = require_string(steering_section, "feature_id")
    layer_name = require_string(steering_section, "layer")
    magnitude = require_number(steering_section, "strength")
    return SteeringRun(
        prompt=prompt_text,
        feature_id=feature,
        layer=layer_name,
        strength=magnitude,
    )


def build_hook(run: SteeringRun) -> dict[str, Any]:
    """Return the hook descriptor for ``run``.

    The descriptor carries the run's ``feature_id``, ``layer`` and
    ``strength`` unchanged, a fixed ``operation`` of ``add_feature_vector``,
    and a ``hook_name``.
    """
    # Only "6-res-jb" is translated to a hook name; any other layer value is
    # passed through as the hook name unchanged.
    if run.layer == "6-res-jb":
        hook_name = "blocks.6.hook_resid_pre"
    else:
        hook_name = run.layer

    descriptor: dict[str, Any] = {
        "hook_name": hook_name,
        "feature_id": run.feature_id,
        "layer": run.layer,
        "strength": run.strength,
        "operation": "add_feature_vector",
    }
    return descriptor


def diff_summary(baseline: str, steered: str, run: SteeringRun) -> str:
    """Describe how the two harness outputs relate in one sentence.

    Returns a fixed notice when ``baseline`` and ``steered`` are equal;
    otherwise a sentence naming the run's feature, layer and strength.
    """
    if baseline != steered:
        description = (
            f"Deterministic harness used feature {run.feature_id} "
            f"at layer {run.layer} with strength {run.strength}."
        )
        return description
    return "No text difference observed in deterministic harness."
Loading