Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .beads/issues.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@
{"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"}
{"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","notes":"wright repo (OpenAdaptAI/wright) scaffolding underway. Herald + consilium repos transferred to OpenAdaptAI org. Wright will be the orchestration layer for eval pipeline.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T00:08:08.422633-05:00"}
{"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"wright repo created (OpenAdaptAI/wright), scaffolding in progress. Herald + consilium transferred to OpenAdaptAI org.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T00:08:08.423284-05:00"}
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-01: GPU grant applications reviewed and rewritten (11 files). Writing done, blocked on eval results (DC signal on harder tasks). Detailed status tracked in openadapt-internal (private repo).","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T01:14:44.513125-05:00"}
{"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}
Binary file added docs/artifacts/full/step_00_after.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_00_before.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_01_after.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_01_before.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_02_after.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_02_before.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_03_after.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_03_before.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_04_after.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_04_before.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_05_after.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_05_before.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_06_after.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_06_before.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_07_after.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_07_before.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_08_after.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_08_before.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_09_after.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_09_before.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_10_after.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_10_before.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_11_after.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_11_before.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_12_after.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/artifacts/full/step_12_before.png
Binary file added docs/artifacts/full/step_13_after.png
Binary file added docs/artifacts/full/step_13_before.png
Binary file added docs/artifacts/full/step_14_after.png
Binary file added docs/artifacts/full/step_14_before.png
Binary file added docs/artifacts/full/step_15_after.png
Binary file added docs/artifacts/full/step_15_before.png
Binary file added docs/artifacts/full/step_16_after.png
Binary file added docs/artifacts/full/step_16_before.png
Binary file added docs/artifacts/full/step_17_after.png
Binary file added docs/artifacts/full/step_17_before.png
Binary file added docs/artifacts/full/step_18_after.png
Binary file added docs/artifacts/full/step_18_before.png
Binary file added docs/artifacts/full/step_19_after.png
Binary file added docs/artifacts/full/step_19_before.png
Binary file added docs/artifacts/full/step_20_after.png
Binary file added docs/artifacts/full/step_20_before.png
Binary file added docs/artifacts/thumbnails/step_00_after.png
Binary file added docs/artifacts/thumbnails/step_00_before.png
Binary file added docs/artifacts/thumbnails/step_01_after.png
Binary file added docs/artifacts/thumbnails/step_01_before.png
Binary file added docs/artifacts/thumbnails/step_02_after.png
Binary file added docs/artifacts/thumbnails/step_02_before.png
Binary file added docs/artifacts/thumbnails/step_03_after.png
Binary file added docs/artifacts/thumbnails/step_03_before.png
Binary file added docs/artifacts/thumbnails/step_04_after.png
Binary file added docs/artifacts/thumbnails/step_04_before.png
Binary file added docs/artifacts/thumbnails/step_05_after.png
Binary file added docs/artifacts/thumbnails/step_05_before.png
Binary file added docs/artifacts/thumbnails/step_06_after.png
Binary file added docs/artifacts/thumbnails/step_06_before.png
Binary file added docs/artifacts/thumbnails/step_07_after.png
Binary file added docs/artifacts/thumbnails/step_07_before.png
Binary file added docs/artifacts/thumbnails/step_08_after.png
Binary file added docs/artifacts/thumbnails/step_08_before.png
Binary file added docs/artifacts/thumbnails/step_09_after.png
Binary file added docs/artifacts/thumbnails/step_09_before.png
Binary file added docs/artifacts/thumbnails/step_10_after.png
Binary file added docs/artifacts/thumbnails/step_10_before.png
Binary file added docs/artifacts/thumbnails/step_11_after.png
Binary file added docs/artifacts/thumbnails/step_11_before.png
Binary file added docs/artifacts/thumbnails/step_12_after.png
Binary file added docs/artifacts/thumbnails/step_12_before.png
Binary file added docs/artifacts/thumbnails/step_13_after.png
Binary file added docs/artifacts/thumbnails/step_13_before.png
Binary file added docs/artifacts/thumbnails/step_14_after.png
Binary file added docs/artifacts/thumbnails/step_14_before.png
Binary file added docs/artifacts/thumbnails/step_15_after.png
Binary file added docs/artifacts/thumbnails/step_15_before.png
Binary file added docs/artifacts/thumbnails/step_16_after.png
Binary file added docs/artifacts/thumbnails/step_16_before.png
Binary file added docs/artifacts/thumbnails/step_17_after.png
Binary file added docs/artifacts/thumbnails/step_17_before.png
Binary file added docs/artifacts/thumbnails/step_18_after.png
Binary file added docs/artifacts/thumbnails/step_18_before.png
Binary file added docs/artifacts/thumbnails/step_19_after.png
Binary file added docs/artifacts/thumbnails/step_19_before.png
Binary file added docs/artifacts/thumbnails/step_20_after.png
Binary file added docs/artifacts/thumbnails/step_20_before.png
376 changes: 376 additions & 0 deletions docs/demo_review.md

Large diffs are not rendered by default.

301 changes: 301 additions & 0 deletions scripts/generate_demo_review.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,301 @@
#!/usr/bin/env python3
"""Generate a markdown review artifact for the demo recording pipeline.

Reads a WAA recording (meta.json + screenshots), creates thumbnail images,
and produces a markdown file showing the pipeline output for each step.
The markdown is suitable for embedding in docs or PR descriptions and
renders on GitHub with relative image paths. Thumbnails link to full-resolution
originals when available.

Usage:
python scripts/generate_demo_review.py \
--recording waa_recordings/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS \
--text-demo demo_prompts/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS.txt \
--vlm-demo demo_prompts_vlm/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS.txt \
--output docs/demo_review.md
"""
from __future__ import annotations

import json
import re
import shutil
import sys
from pathlib import Path

from PIL import Image


# Default width, in pixels, of the thumbnails generated for the review page.
THUMBNAIL_WIDTH = 600


def _parse_demo_steps(demo_text: str) -> dict[int, str]:
"""Parse a demo .txt file into a dict mapping step number -> step content.

Handles both text-only and VLM-enriched formats. Returns the full text
block for each step (everything between "Step N:" markers).
"""
steps: dict[int, str] = {}
# Split on "Step N:" headers, capturing the step number
parts = re.split(r'^(Step \d+:)\s*$', demo_text, flags=re.MULTILINE)

# parts looks like: [preamble, "Step 1:", content, "Step 2:", content, ...]
for i in range(1, len(parts) - 1, 2):
header = parts[i] # e.g. "Step 3:"
content = parts[i + 1]
step_num = int(re.search(r'\d+', header).group())
# Strip trailing blank lines / separators but preserve internal structure
content = content.strip()
# Remove trailing "---" if it's the last step
if content.endswith("---"):
content = content[:-3].strip()
steps[step_num] = content

return steps


def _create_thumbnail(src: Path, dst: Path, width: int = THUMBNAIL_WIDTH) -> None:
    """Write a thumbnail of ``src`` to ``dst``, preserving aspect ratio.

    Images that are already no wider than ``width`` are copied unchanged.
    Wider images are scaled down to exactly ``width`` pixels wide using
    Lanczos resampling.

    Args:
        src: Path of the source image.
        dst: Path the thumbnail is written to.
        width: Target thumbnail width in pixels.
    """
    with Image.open(src) as img:
        if img.width <= width:
            # Nothing to shrink: keep the original bytes instead of re-encoding.
            shutil.copy2(src, dst)
            return
        ratio = width / img.width
        # Truncation gives 0 for very wide, short images, and a zero-sized
        # target makes the resize fail, so never go below one pixel.
        new_height = max(1, int(img.height * ratio))
        resized = img.resize((width, new_height), Image.LANCZOS)
        resized.save(dst, optimize=True)


def _relpath(target: Path, start: Path) -> str:
"""Compute a relative path from start to target, suitable for markdown."""
try:
return str(target.resolve().relative_to(start.resolve()))
except ValueError:
import os
return os.path.relpath(target.resolve(), start.resolve())


def _escape_md(text: str) -> str:
"""Minimal escaping so that text doesn't break markdown tables."""
return text.replace("|", "\\|").replace("\n", "<br>")


def _indent_block(text: str, prefix: str = "> ") -> str:
"""Indent every line of text with the given prefix."""
return "\n".join(prefix + line for line in text.split("\n"))


def main(
    recording: str,
    text_demo: str | None = None,
    vlm_demo: str | None = None,
    output: str = "docs/demo_review.md",
    thumbnail_width: int = THUMBNAIL_WIDTH,
) -> None:
    """Render a recording and its demo prompts as a markdown review page.

    Reads ``meta.json`` from the recording directory, writes thumbnails and
    full-resolution copies of the step screenshots under ``artifacts/`` next
    to the output file, and writes a markdown document containing a header,
    a text-vs-VLM comparison table for the first three steps, and one
    section per step. Exits with status 1 when ``meta.json`` is missing.

    Args:
        recording: Path to the recording directory (contains meta.json + PNGs).
        text_demo: Path to the text-only demo .txt file.
        vlm_demo: Path to the VLM-enriched demo .txt file.
        output: Output path for the generated markdown file.
        thumbnail_width: Width in pixels for thumbnail images.
    """
    recording_dir = Path(recording)
    output_path = Path(output)
    md_dir = output_path.parent

    # --- Load recording metadata ---
    meta_path = recording_dir / "meta.json"
    if not meta_path.exists():
        print(f"Error: meta.json not found in {recording_dir}")
        sys.exit(1)

    meta = json.loads(meta_path.read_text(encoding="utf-8"))
    task_id = meta["task_id"]
    instruction = meta["instruction"]
    steps = meta.get("steps", [])
    # Fall back to the number of recorded steps when the count is not stored.
    num_steps = meta.get("num_steps", len(steps))
    recorded_at = meta.get("recorded_at", "unknown")

    # --- Parse the optional demo files (a missing file only warns) ---
    text_steps: dict[int, str] = {}
    if text_demo:
        text_demo_path = Path(text_demo)
        if not text_demo_path.exists():
            print(f"Warning: text demo not found at {text_demo_path}")
        else:
            text_steps = _parse_demo_steps(
                text_demo_path.read_text(encoding="utf-8")
            )

    vlm_steps: dict[int, str] = {}
    if vlm_demo:
        vlm_demo_path = Path(vlm_demo)
        if not vlm_demo_path.exists():
            print(f"Warning: VLM demo not found at {vlm_demo_path}")
        else:
            vlm_steps = _parse_demo_steps(
                vlm_demo_path.read_text(encoding="utf-8")
            )

    # --- Thumbnails and full-resolution copies, stored beside the markdown ---
    artifacts_dir = md_dir / "artifacts"
    thumb_dir = artifacts_dir / "thumbnails"
    full_dir = artifacts_dir / "full"
    for directory in (thumb_dir, full_dir):
        directory.mkdir(parents=True, exist_ok=True)

    thumbnail_map: dict[str, Path] = {}
    full_map: dict[str, Path] = {}
    screenshot_names = [
        f"step_{i:02d}_{suffix}"
        for i in range(num_steps)
        for suffix in ("before", "after")
    ]
    for name in screenshot_names:
        src = recording_dir / f"{name}.png"
        if not src.exists():
            continue
        dst = thumb_dir / f"{name}.png"
        _create_thumbnail(src, dst, width=thumbnail_width)
        thumbnail_map[name] = dst
        full_dst = full_dir / f"{name}.png"
        shutil.copy2(src, full_dst)
        full_map[name] = full_dst

    print(f"Created {len(thumbnail_map)} thumbnails in {thumb_dir}")
    if full_map:
        print(f"Copied {len(full_map)} full-resolution images to {full_dir}")

    # --- Build markdown ---
    md_dir.mkdir(parents=True, exist_ok=True)
    lines: list[str] = [
        "# Demo Pipeline Review",
        "",
        f"**Task ID:** `{task_id}`",
        "",
        f"**Instruction:** {instruction}",
        "",
        f"**Steps:** {num_steps}",
        "",
        f"**Recorded at:** {recorded_at}",
        "",
    ]

    # Side-by-side table, limited to the first three steps.
    compare_count = min(3, num_steps)
    if text_steps or vlm_steps:
        lines += [
            "## Text vs VLM Comparison (First 3 Steps)",
            "",
            "| Step | Ground Truth | Text-Only Demo | VLM-Enriched Demo |",
            "|------|-------------|----------------|-------------------|",
        ]
        for step_num in range(1, compare_count + 1):
            # Demo files number steps from 1; meta["steps"] is indexed from 0.
            idx = step_num - 1
            gt = steps[idx].get("suggested_step", "") if idx < len(steps) else ""
            text_content = _escape_md(text_steps.get(step_num, "*(not available)*"))
            vlm_content = _escape_md(vlm_steps.get(step_num, "*(not available)*"))
            gt_escaped = _escape_md(gt)
            lines.append(
                f"| {step_num} | {gt_escaped} | {text_content} | {vlm_content} |"
            )
        lines.append("")

    # One fully expanded section per step.
    for i in range(num_steps):
        step_num = i + 1
        placeholder = f"(step {step_num})"
        gt = steps[i].get("suggested_step", placeholder) if i < len(steps) else placeholder

        lines += [f"### Step {step_num}: {gt}", ""]

        # Screenshots: thumbnails, linked to the full-resolution copy if any.
        before_key = f"step_{i:02d}_before"
        after_key = f"step_{i:02d}_after"
        has_before = before_key in thumbnail_map
        has_after = after_key in thumbnail_map

        if has_before and has_after:
            before_thumb = _relpath(thumbnail_map[before_key], md_dir)
            after_thumb = _relpath(thumbnail_map[after_key], md_dir)
            if before_key not in full_map:
                image_md = (
                    f"![before]({before_thumb}) "
                    f"![after]({after_thumb})"
                )
            else:
                before_full = _relpath(full_map[before_key], md_dir)
                after_full = _relpath(full_map[after_key], md_dir)
                image_md = (
                    f"[![before]({before_thumb})]({before_full}) "
                    f"[![after]({after_thumb})]({after_full})"
                )
            lines += [image_md, ""]
        elif has_before:
            before_thumb = _relpath(thumbnail_map[before_key], md_dir)
            if before_key in full_map:
                before_full = _relpath(full_map[before_key], md_dir)
                image_md = f"[![before]({before_thumb})]({before_full})"
            else:
                image_md = f"![before]({before_thumb})"
            lines += [image_md, ""]
        elif has_after:
            after_thumb = _relpath(thumbnail_map[after_key], md_dir)
            if after_key in full_map:
                after_full = _relpath(full_map[after_key], md_dir)
                image_md = f"[![after]({after_thumb})]({after_full})"
            else:
                image_md = f"![after]({after_thumb})"
            lines += [image_md, ""]

        lines += [f"**Ground truth:** {gt}", ""]

        # Text-only demo output (section omitted when no text demo was parsed).
        if text_steps:
            text_content = text_steps.get(step_num)
            lines.append(
                f"**Text demo:** {text_content}"
                if text_content
                else "**Text demo:** *(not available)*"
            )
            lines.append("")

        # VLM-enriched demo output, rendered as a blockquote.
        if vlm_steps:
            vlm_content = vlm_steps.get(step_num)
            if not vlm_content:
                lines.append("**VLM demo:** *(not available)*")
            else:
                lines += ["**VLM demo:**", "", _indent_block(vlm_content)]
            lines.append("")

        lines += ["---", ""]

    # --- Footer ---
    lines.append(
        f"*Generated by `scripts/generate_demo_review.py` from recording "
        f"`{recording_dir.name}`*"
    )
    lines.append("")

    # --- Write the file and report what was produced ---
    md_text = "\n".join(lines)
    output_path.write_text(md_text, encoding="utf-8")
    print(f"Wrote {len(md_text)} bytes to {output_path}")
    print(f" {num_steps} steps, {len(thumbnail_map)} thumbnails")
    if text_steps:
        print(f" Text-only demo: {len(text_steps)} steps parsed")
    if vlm_steps:
        print(f" VLM-enriched demo: {len(vlm_steps)} steps parsed")


if __name__ == "__main__":
    # Imported here so that `fire` is only needed when run as a script.
    from fire import Fire

    # Expose main()'s parameters as command-line flags (see module docstring).
    Fire(main)
7 changes: 4 additions & 3 deletions scripts/record_waa_demos.py
Original file line number Diff line number Diff line change
Expand Up @@ -1356,9 +1356,10 @@ def _auto_start_socat(vm_ip: str) -> bool:
capture_output=True, text=True, timeout=30,
)
if result.returncode != 0:
print(f" ERROR: socat proxy setup failed: {result.stderr.strip()}")
return False
print(" Socat proxy established (VM:5051 -> container:5050).")
print(f" WARNING: socat setup returned non-zero: {result.stderr.strip()}")
# Not fatal — socat may already be running
else:
print(" Socat proxy established (VM:5051 -> container:5050).")
return True


Expand Down
Loading