From 92575e370dbc5e5e4e211e40c30e29fd2ab23984 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Mon, 2 Mar 2026 00:57:05 -0500 Subject: [PATCH] feat: auto-persist WAA recordings to prevent data loss - Add waa_recordings/ to .gitignore (immune to git stash -u, git clean -f) - Add _backup_file() helper: hardlinks PNGs + meta.json to ~/oa/recordings/ (zero extra disk, falls back to copy on cross-device, silent on failure) - Add _save_incremental_meta(): writes meta.json atomically after each step via .tmp rename, with recording_complete field for partial detection - Wire helpers into recording loop (before/after screenshots, step advances, done, restart cleanup) - Use systemd-first pattern for socat proxy in auto-infrastructure Co-Authored-By: Claude Opus 4.6 --- .beads/issues.jsonl | 4 +- .gitignore | 4 + scripts/record_waa_demos.py | 149 +++++++++++++++++++++++++++--------- 3 files changed, 120 insertions(+), 37 deletions(-) diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index 791f438..6395d80 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -11,7 +11,7 @@ {"id":"openadapt-evals-dke","title":"SYSTEM: Create knowledge persistence workflow using Beads","description":"Every fix/approach must be logged as a Beads issue with:\n1. Problem description\n2. Attempted solution\n3. Result (worked/failed/partial)\n4. Root cause if known\n5. Files changed\n\nBefore any fix attempt, agent MUST:\n1. Run 'bd list --labels=fix,approach' to see prior attempts\n2. Review what was tried before\n3. Document new attempt BEFORE implementing\n\nAfter context compaction, first action:\n1. Run 'bd ready' for current tasks\n2. Run 'bd list --labels=recurring' for known recurring issues\n3. 
Check docs/RECURRING_ISSUES.md for patterns","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T19:00:18.155796-05:00","created_by":"Richard Abrich","updated_at":"2026-02-23T16:21:13.18811-05:00","closed_at":"2026-02-14T12:22:52.357373-05:00"} {"id":"openadapt-evals-gna","title":"Test simplified Dockerfile (Azure mode)","description":"Testing Dockerfile.simplified which uses vanilla WAA Azure mode: native OEM mechanism (C:\\oem), InstallFrom element for unattended install, VERSION=11e for no product key. Steps: 1) Delete current VM 2) Create fresh VM 3) Build simplified image 4) Test Windows installation via QEMU screenshots","notes":"2026-01-22: Confirmed the blocker is not just docker pull; even starting the existing 'winarena' container via az vm run-command timed out.\n\n- smoke-live tried to run docker start winarena via run-command and timed out (900s)\n- WAA server remained unreachable at http://172.171.112.41:5000\n- VM was deallocated after the attempt\n\nImplication: VM/docker state is unhealthy or container start is hanging (possibly due to incomplete image extraction / stuck daemon / disk pressure).\nNext: add/run a vm-debug command to capture docker/system logs and determine whether to rebuild VM/image, pin/mirror image (ACR), or adjust docker config.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-21T12:47:15.12243-05:00","created_by":"Richard Abrich","updated_at":"2026-02-23T16:21:13.188539-05:00","closed_at":"2026-02-08T13:23:34.84444-05:00","labels":["testing","waa"],"comments":[{"id":3,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"Session Recovery 2026-01-22 17:58: Previous agents killed during compaction. VM state: Docker/containerd unhealthy, disk /mnt only 32GB (need 47GB+ for vanilla WAA). Git-lfs failing. 
User feedback: 1) use beads, 2) larger disk, 3) clean up CLI, 4) vanilla WAA config.","created_at":"2026-01-22T18:05:45Z"},{"id":4,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"Launched 3 parallel agents: ae159fc (VM disk upgrade), aabad47 (CLI cleanup), aee4e8a (fix containerd). Check /private/tmp/claude/-Users-abrichr-oa-src-openadapt-ml/tasks/*.output for results.","created_at":"2026-01-22T18:06:18Z"},{"id":5,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"WORKFLOW DOCUMENTED: VM config changes = delete VM -\u003e update code -\u003e relaunch. Added to CLAUDE.md. Default VM size now D8ds_v5 (300GB). Launching fresh VM now.","created_at":"2026-01-22T18:09:12Z"},{"id":6,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"2026-01-22 18:20: VM resources cleaned up, launched agent a9be1f8 to add auto-cleanup to CLI, WAA setup retrying in background (b04fcbe). Workflow documented in CLAUDE.md and STATUS.md.","created_at":"2026-01-22T18:11:56Z"},{"id":7,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"2026-01-22 18:30: VM created with D8s_v3 fallback (D8ds_v5 quota 0), IP 20.120.37.97. Restored waa_deploy symlink. Docker image building. W\u0026B integration agent a21c3ef running.","created_at":"2026-01-22T18:25:29Z"},{"id":8,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"2026-01-22 19:05: WAA Docker image built successfully! Container running. Windows booting. VM: 20.120.37.97, VNC: http://20.120.37.97:8006","created_at":"2026-01-22T18:47:03Z"}]} {"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. 
VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"} -{"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-02-14T12:23:06.328838-05:00"} +{"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","notes":"wright repo (OpenAdaptAI/wright) scaffolding underway. Herald + consilium repos transferred to OpenAdaptAI org. Wright will be the orchestration layer for eval pipeline.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T00:08:08.422633-05:00"} {"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). 
Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"} -{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-01: GPU grant applications reviewed and rewritten (11 files). Writing done, blocked on eval results (DC signal on harder tasks). Detailed status tracked in openadapt-internal (private repo).","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-01T23:35:11.042286-05:00"} +{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"wright repo created (OpenAdaptAI/wright), scaffolding in progress. 
Herald + consilium transferred to OpenAdaptAI org.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T00:08:08.423284-05:00"} {"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"} diff --git a/.gitignore b/.gitignore index 4fc9fb1..5d6ab22 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,7 @@ benchmark_live.json # Cost reports (generated during evaluation runs) cost_report.json + +# WAA recordings (PNGs + meta.json from record-waa sessions) +# Gitignored to protect from `git stash -u` and `git clean -f` +waa_recordings/ diff --git a/scripts/record_waa_demos.py b/scripts/record_waa_demos.py index 5a2ca9e..d61ff67 100644 --- a/scripts/record_waa_demos.py +++ b/scripts/record_waa_demos.py @@ -31,6 +31,8 @@ from __future__ import annotations import json +import os +import shutil import socket import subprocess import sys @@ -1324,13 +1326,25 @@ def _auto_start_container(vm_ip: str) -> bool: def _auto_start_socat(vm_ip: str) -> bool: - """Start socat proxy on the VM for port 5050 forwarding. Returns True on success.""" + """Start socat proxy on the VM for port 5050 forwarding. + + Tries the socat-waa-evaluate systemd service first (preferred: auto-restarts + on failure). Falls back to the legacy nohup approach for older VMs that + don't have the service installed. 
+ """ print(f" Starting socat proxy on {vm_ip} (VM:5051 -> container:5050)...") - # The socat command runs in the background on the VM - socat_cmd = ( - 'nohup socat TCP-LISTEN:5051,fork,reuseaddr ' - 'EXEC:"docker exec -i winarena socat - TCP\\:localhost\\:5050" ' - '&>/dev/null &' + script = ( + "if systemctl list-unit-files socat-waa-evaluate.service " + "| grep -q socat-waa-evaluate; then " + " sudo systemctl restart socat-waa-evaluate.service; " + "else " + " killall socat 2>/dev/null || true; sleep 1; " + " which socat >/dev/null 2>&1 " + " || sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq socat; " + " nohup socat TCP-LISTEN:5051,fork,reuseaddr " + " 'EXEC:docker exec -i winarena socat - TCP\\:127.0.0.1\\:5050' " + " >/dev/null 2>&1 & " + "fi" ) result = subprocess.run( ["ssh", @@ -1338,14 +1352,13 @@ def _auto_start_socat(vm_ip: str) -> bool: "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null", f"{_AUTO_SSH_USER}@{vm_ip}", - socat_cmd], + script], capture_output=True, text=True, timeout=30, ) if result.returncode != 0: - print(f" WARNING: socat setup returned non-zero: {result.stderr.strip()}") - # Not fatal — socat may already be running - else: - print(" Socat proxy started.") + print(f" ERROR: socat proxy setup failed: {result.stderr.strip()}") + return False + print(" Socat proxy established (VM:5051 -> container:5050).") return True @@ -1459,6 +1472,64 @@ def _attempt_auto_recovery( return False +# Default external backup root (outside the git repo) +_BACKUP_ROOT = Path.home() / "oa" / "recordings" + + +def _backup_file(src: Path, task_id: str) -> None: + """Hardlink *src* to the external backup directory. + + Creates ``~/oa/recordings/{task_id}/{src.name}``. Uses a hardlink (zero + extra disk space) and falls back to ``shutil.copy2`` for cross-device + scenarios. Failures are silently ignored — recording must never be + interrupted by a backup error. 
+ """ + try: + backup_dir = _BACKUP_ROOT / task_id + backup_dir.mkdir(parents=True, exist_ok=True) + dest = backup_dir / src.name + if dest.exists(): + dest.unlink() + try: + os.link(src, dest) + except OSError: + shutil.copy2(src, dest) + except Exception: + pass # Silent — never interrupt recording + + +def _save_incremental_meta( + task_dir: Path, + task_id: str, + instruction: str, + steps_meta: list[dict], + step_plans: list[dict], + server: str, + is_final: bool = False, +) -> None: + """Write ``meta.json`` atomically after each step. + + Writes to a ``.tmp`` file first and renames, preventing corrupt partial + writes on crash. Includes a ``recording_complete`` boolean so downstream + scripts can detect partial recordings. + """ + meta = { + "task_id": task_id, + "instruction": instruction, + "num_steps": len(steps_meta), + "steps": steps_meta, + "step_plans": step_plans, + "server_url": server, + "recorded_at": datetime.now(timezone.utc).isoformat(), + "recording_complete": is_final, + } + tmp = task_dir / "meta.json.tmp" + final = task_dir / "meta.json" + tmp.write_text(json.dumps(meta, indent=2), encoding="utf-8") + tmp.replace(final) + _backup_file(final, task_id) + + def cmd_record_waa( tasks: str = ",".join(HARDER_TASK_IDS), server: str = "http://localhost:5001", @@ -1859,9 +1930,9 @@ def _hard_reset_task_env() -> bytes: while remaining_steps: # Save before screenshot - (task_dir / f"step_{step_idx:02d}_before.png").write_bytes( - before_png - ) + before_path = task_dir / f"step_{step_idx:02d}_before.png" + before_path.write_bytes(before_png) + _backup_file(before_path, task_id) # Display current step total = len(completed_steps) + len(remaining_steps) @@ -1879,17 +1950,17 @@ def _hard_reset_task_env() -> bytes: # RETRY: discard this attempt, take fresh before screenshot print(" Retrying step (taking fresh screenshot)...") before_png = _take_screenshot(server) - (task_dir / f"step_{step_idx:02d}_before.png").write_bytes( - before_png - ) + retry_path = 
task_dir / f"step_{step_idx:02d}_before.png" + retry_path.write_bytes(before_png) + _backup_file(retry_path, task_id) continue elif user_input == "": # ADVANCE: action done, move to next step after_png = _take_screenshot(server) - (task_dir / f"step_{step_idx:02d}_after.png").write_bytes( - after_png - ) + after_path = task_dir / f"step_{step_idx:02d}_after.png" + after_path.write_bytes(after_png) + _backup_file(after_path, task_id) done_step = remaining_steps.pop(0) completed_steps.append(done_step) steps_meta.append({ @@ -1897,6 +1968,10 @@ def _hard_reset_task_env() -> bytes: "suggested_step": done_step, "step_was_refined": step_idx in refined_indices, }) + _save_incremental_meta( + task_dir, task_id, instruction, steps_meta, + step_plans, server, + ) before_png = after_png step_idx += 1 print(f" Step {step_num} recorded.") @@ -1915,14 +1990,18 @@ def _hard_reset_task_env() -> bytes: elif user_input.lower() == "d": # DONE: task finished (possibly before all steps) after_png = _take_screenshot(server) - (task_dir / f"step_{step_idx:02d}_after.png").write_bytes( - after_png - ) + after_path = task_dir / f"step_{step_idx:02d}_after.png" + after_path.write_bytes(after_png) + _backup_file(after_path, task_id) steps_meta.append({ "action_hint": "d", "suggested_step": remaining_steps[0], "step_was_refined": step_idx in refined_indices, }) + _save_incremental_meta( + task_dir, task_id, instruction, steps_meta, + step_plans, server, + ) step_idx += 1 total = len(completed_steps) + len(remaining_steps) print(f"\n Task marked done at step {step_num} of {total}. 
Finishing recording.") @@ -1960,6 +2039,10 @@ def _hard_reset_task_env() -> bytes: print(" Restarting task (soft reset — closing apps, re-running setup)...") for f in task_dir.glob("step_*.png"): f.unlink() + # Clean external backup for this task too + backup_dir = _BACKUP_ROOT / task_id + if backup_dir.exists(): + shutil.rmtree(backup_dir, ignore_errors=True) before_png = _soft_reset_task_env() print(f"\n VNC: {vnc_url}") print(f" Task: {instruction}\n") @@ -1996,6 +2079,10 @@ def _hard_reset_task_env() -> bytes: print(" Restarting task (hard reset — QEMU reboot)...") for f in task_dir.glob("step_*.png"): f.unlink() + # Clean external backup for this task too + backup_dir = _BACKUP_ROOT / task_id + if backup_dir.exists(): + shutil.rmtree(backup_dir, ignore_errors=True) before_png = _hard_reset_task_env() print(f"\n VNC: {vnc_url}") print(f" Task: {instruction}\n") @@ -2081,18 +2168,10 @@ def _hard_reset_task_env() -> bytes: ) # No action taken — loop re-displays the (possibly new) current step - # Save metadata - meta = { - "task_id": task_id, - "instruction": instruction, - "num_steps": len(steps_meta), - "steps": steps_meta, - "step_plans": step_plans, - "server_url": server, - "recorded_at": datetime.now(timezone.utc).isoformat(), - } - (task_dir / "meta.json").write_text( - json.dumps(meta, indent=2), encoding="utf-8" + # Save final metadata (marks recording as complete) + _save_incremental_meta( + task_dir, task_id, instruction, steps_meta, + step_plans, server, is_final=True, ) # Task completed successfully — remove checkpoint