From 4a2731a0009f7c38521150f949e77af57283203e Mon Sep 17 00:00:00 2001 From: No9 Labs Date: Thu, 9 Apr 2026 02:11:20 -0400 Subject: [PATCH 1/3] fix: proactively remove .claude/worktrees/agent-* after every session The previous cleanup_worktrees only removed worktrees marked 'prunable' by git, but agent-* directories left by completed or crashed sessions are never prunable (their directories still exist). Added Pass 1 that enumerates all .claude/worktrees/agent-* entries via git worktree list --porcelain and force-removes each, skipping the current executing worktree as a safety guard. git worktree prune runs last to clear any orphaned git metadata. This runs in both housekeeping (start of cycle, catches crash remnants) and post-session cleanup (end of cycle). --- .recursive/engine/lib-agent.sh | 39 +++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/.recursive/engine/lib-agent.sh b/.recursive/engine/lib-agent.sh index e26b6b8..81b4eec 100644 --- a/.recursive/engine/lib-agent.sh +++ b/.recursive/engine/lib-agent.sh @@ -626,23 +626,52 @@ PY # cleanup_worktrees # Prunes stale git worktrees left by sub-agent sessions. -# Removes worktrees marked 'prunable' by git. +# Removes ALL .claude/worktrees/agent-* worktrees (active sub-agent dirs), +# plus any worktrees marked 'prunable' by git, then runs git worktree prune. +# Safe to call from the daemon main loop: the daemon runs in REPO_DIR, not +# inside an agent-* worktree, so no currently-executing agent is skipped. +# If called from inside an agent worktree (e.g. evolve), the current dir is +# detected and skipped to avoid self-removal. cleanup_worktrees() { - git -C "$REPO_DIR" worktree prune 2>/dev/null || true local count=0 + local current_wt + current_wt="$(git -C "$REPO_DIR" rev-parse --show-toplevel 2>/dev/null || echo "")" + + # Pass 1: remove ALL .claude/worktrees/agent-* worktrees by path. + # Uses porcelain format to get one path per stanza reliably. 
+ while IFS= read -r wt_path; do + # Skip empty lines + [ -z "$wt_path" ] && continue + # Skip the main worktree + [ "$wt_path" = "$REPO_DIR" ] && continue + # Skip the worktree we are currently executing inside (safety guard) + [ "$wt_path" = "$current_wt" ] && continue + # Only target agent worktrees in .claude/worktrees/ + case "$wt_path" in + */.claude/worktrees/agent-*) + git -C "$REPO_DIR" worktree remove "$wt_path" --force 2>/dev/null || true + count=$((count + 1)) + ;; + esac + done < <(git -C "$REPO_DIR" worktree list --porcelain 2>/dev/null | grep "^worktree " | sed 's/^worktree //') + + # Pass 2: remove any remaining worktrees marked prunable by git. while IFS= read -r wt_line; do local wt_path wt_path=$(echo "$wt_line" | awk '{print $1}') - # Skip the main worktree [ "$wt_path" = "$REPO_DIR" ] && continue - # Remove if marked prunable or is a daemon worktree + [ "$wt_path" = "$current_wt" ] && continue if echo "$wt_line" | grep -q "prunable" 2>/dev/null; then git -C "$REPO_DIR" worktree remove "$wt_path" --force 2>/dev/null || true count=$((count + 1)) fi done < <(git -C "$REPO_DIR" worktree list 2>/dev/null) + + # Prune git metadata for any worktrees whose directories no longer exist. + git -C "$REPO_DIR" worktree prune 2>/dev/null || true + if [ "$count" -gt 0 ]; then - echo " Cleaned up $count worktree(s)" + echo " Cleaned up $count agent worktree(s)" fi } From 312c9784931d83f6a17fcd6703915ded83366d13 Mon Sep 17 00:00:00 2001 From: No9 Labs Date: Thu, 9 Apr 2026 02:14:16 -0400 Subject: [PATCH 2/3] feat: add sessions_since_eval signal and eval cadence alert to dashboard - Add `sessions_since_eval()` to signals.py: reads the latest eval file from .recursive/evaluations/, extracts its date/mtime, then counts session index rows after that timestamp. Returns a scalar int. - Surface the signal prominently in the dashboard Health section next to Eval score, with STALE annotation when >= 5 sessions. 
- Add alert in the Alerts section when sessions_since_eval >= 5, pointing the brain to delegate a Phractal eval run. - Add eval cadence rule to brain.md Delegation Protocol: when eval_staleness alert fires, brain SHOULD include eval run as a delegation. - Add 8 unit tests in test_signals.py and 6 in test_dashboard.py covering all branches of the new signal and alert logic. - All 1156 tests pass (make check green). Closes task #0242. --- .recursive/agents/brain.md | 1 + .recursive/engine/dashboard.py | 18 ++++ .recursive/engine/signals.py | 45 ++++++++ .recursive/tests/test_dashboard.py | 47 +++++++++ .recursive/tests/test_signals.py | 159 +++++++++++++++++++++++++++++ 5 files changed, 270 insertions(+) diff --git a/.recursive/agents/brain.md b/.recursive/agents/brain.md index 8d4c508..1b38129 100644 --- a/.recursive/agents/brain.md +++ b/.recursive/agents/brain.md @@ -80,6 +80,7 @@ Rules for delegation: 1. Give the sub-agent a SPECIFIC task. Include the task number, file path, and acceptance criteria. 2. **Match agent to zone.** Before delegating, read the task and identify which files it touches. If the task modifies `.recursive/` framework files (engine, prompts, agents, lib, operators, ops, scripts, templates, tests, skills) or root docs (CLAUDE.md, AGENTS.md), delegate to `evolve` (framework zone) — NOT `build`. The `build` agent is for `nightshift/` project code ONLY. Tasks with `target: recursive` in frontmatter are always framework-zone. - **Security-to-framework path**: If the task queue contains pending tasks with `source: pentest` AND `target: recursive`, delegate them to `evolve`, not `build` and not `security`. The `security` agent produces findings (read-only); the `evolve` agent applies fixes to framework files. This is the only compliant path for confirmed security vulnerabilities in `.recursive/` code. The `pick-role.py` advisory will boost `evolve` when such tasks exist (signal: `pentest_framework_tasks >= 1`). 
+ - **Eval cadence rule**: If `eval_staleness` appears in the dashboard Alerts (i.e., `sessions_since_eval >= 5`), the brain SHOULD include a Phractal E2E eval run as one of its delegations in that session. Delegate to the `build` agent with the eval task (task #0243 or the lowest-numbered pending eval task). The build agent should run `nightshift test --agent claude --cycles 2 --cycle-minutes 5 --repo-dir /tmp/nightshift-eval-NNNN` and write results to `.recursive/evaluations/NNNN.md`. Do not defer the eval indefinitely -- the build-measure-build feedback loop depends on regular measurement against Phractal. 3. Never delegate vague instructions like "improve the codebase". 4. One sub-agent per task. Do not ask one agent to do two unrelated things. 5. For build/review/oversee/achieve/strategize/security/evolve/audit-agent: always use `isolation: "worktree"`. diff --git a/.recursive/engine/dashboard.py b/.recursive/engine/dashboard.py index c4b518c..d5db0c8 100644 --- a/.recursive/engine/dashboard.py +++ b/.recursive/engine/dashboard.py @@ -38,6 +38,7 @@ read_healer_status, read_latest_autonomy_score, read_latest_eval_score, + sessions_since_eval, ) # All roles tracked by sessions_since @@ -162,6 +163,9 @@ def collect_signals(recursive_dir: Path) -> dict[str, object]: signals["task_composition"] = task_composition signals["human_tasks"] = human_tasks + # Eval staleness -- dedicated scalar signal for alert threshold + signals["sessions_since_eval"] = sessions_since_eval(evaluations_dir, index_rows) + # Decision-consequence signals (self-awareness) signals["queue_trend"] = compute_queue_trend(decisions_path) signals["agent_diversity"] = compute_agent_diversity(delegations) @@ -189,6 +193,13 @@ def format_dashboard(signals: dict[str, object]) -> str: eval_note = " (default)" if signals.get("_eval_is_default") else "" auto_note = " (default)" if signals.get("_autonomy_is_default") else "" lines.append(f"Eval score: {signals['eval_score']}/100{eval_note}") + 
eval_staleness = signals.get("sessions_since_eval", 0) + if isinstance(eval_staleness, int) and eval_staleness >= 5: + lines.append(f"Eval staleness: {eval_staleness} sessions [STALE -- rerun recommended]") + elif isinstance(eval_staleness, int) and eval_staleness > 0: + lines.append(f"Eval staleness: {eval_staleness} sessions") + else: + lines.append("Eval staleness: 0 sessions (up to date)") lines.append(f"Autonomy score: {signals['autonomy_score']}/100{auto_note}") lines.append(f"Healer status: {signals['healer_status']}") nh_note = " (may be inaccurate if gh unavailable)" if signals["needs_human_issues"] == 0 else "" @@ -272,6 +283,13 @@ def format_dashboard(signals: dict[str, object]) -> str: lines.append("") lines.append("Alerts:") alerts: list[str] = [] + # Eval staleness alert (threshold: 5 sessions) + eval_since = signals.get("sessions_since_eval", 0) + if isinstance(eval_since, int) and eval_since >= 5: + alerts.append( + f" eval_staleness: {eval_since} sessions since last Phractal eval" + " -- delegate build agent to run eval (task #0243 or equivalent)" + ) audit_since = signals.get("sessions_since_audit", 0) if isinstance(audit_since, int) and audit_since >= 25: alerts.append(f" Framework audit overdue ({audit_since} sessions since last)") diff --git a/.recursive/engine/signals.py b/.recursive/engine/signals.py index 22dae00..b05ef21 100644 --- a/.recursive/engine/signals.py +++ b/.recursive/engine/signals.py @@ -508,6 +508,51 @@ def compute_agent_diversity(delegations: list[set[str]], window: int = 10) -> di return dict(sorted(counts.items(), key=lambda x: -x[1])) +def sessions_since_eval(evaluations_dir: Path, sessions_index: list[dict[str, str]]) -> int: + """Count sessions since the last evaluation report. + + Reads the latest eval file from evaluations_dir (highest-numbered NNNN.md), + extracts its date, then counts how many rows in sessions_index have + a timestamp after that date. + + Returns 0 if no eval files exist or the eval is up-to-date. 
+ Returns len(sessions_index) if the eval date cannot be determined. + + This is the primary eval-freshness signal used by the dashboard alert. + compute_eval_staleness() provides the full (sessions, files_changed) tuple + for the decision-patterns section; sessions_since_eval() is the scalar + signal consumed by the Alerts section. + """ + if not evaluations_dir.is_dir(): + return 0 + evals = sorted(evaluations_dir.glob("[0-9]*.md")) + if not evals: + return 0 + latest = evals[-1] + try: + text = latest.read_text(encoding="utf-8") + dm = re.search(r"\*?\*?[Dd]ate\*?\*?:\s*(\d{4}-\d{2}-\d{2})", text) + if not dm: + return len(sessions_index) + eval_date = dm.group(1) + # Use file mtime for more precise timestamp (YYYY-MM-DD HH:MM) + from datetime import datetime as _dt + + eval_ts = _dt.fromtimestamp(latest.stat().st_mtime).strftime("%Y-%m-%d %H:%M") + except OSError: + return len(sessions_index) + # Count sessions after the eval + compare_ts = eval_ts if eval_ts else eval_date + count = 0 + for row in reversed(sessions_index): + ts = row.get("timestamp", "") + if ts > compare_ts: + count += 1 + else: + break + return count + + def compute_eval_staleness(evaluations_dir: Path, sessions_index: list[dict[str, str]]) -> tuple[int, int]: """How stale is the eval? Returns (sessions_since_eval, files_changed). 
diff --git a/.recursive/tests/test_dashboard.py b/.recursive/tests/test_dashboard.py index 6f44333..f41834f 100644 --- a/.recursive/tests/test_dashboard.py +++ b/.recursive/tests/test_dashboard.py @@ -273,3 +273,50 @@ def test_classifies_safe_vs_specific(self, tmp_path: Path) -> None: ) result = compute_commitment_quality(log) assert "2/2 MET" in result + + +class TestEvalStalenessAlert: + """Dashboard alert fires when sessions_since_eval >= 5 (task #0242).""" + + def test_alert_fires_at_threshold(self, tmp_path: Path) -> None: + signals = collect_signals(tmp_path) + signals["sessions_since_eval"] = 5 + output = format_dashboard(signals) + assert "eval_staleness" in output + assert "STALE" in output # top-level staleness indicator + assert "5 sessions since last Phractal eval" in output # alert text + + def test_alert_fires_above_threshold(self, tmp_path: Path) -> None: + signals = collect_signals(tmp_path) + signals["sessions_since_eval"] = 14 + output = format_dashboard(signals) + assert "eval_staleness" in output + assert "14 sessions since last Phractal eval" in output + + def test_no_alert_below_threshold(self, tmp_path: Path) -> None: + signals = collect_signals(tmp_path) + signals["sessions_since_eval"] = 3 + output = format_dashboard(signals) + # Alert text should not appear when below threshold + assert "sessions since last Phractal eval" not in output + + def test_staleness_shown_in_health_section(self, tmp_path: Path) -> None: + """Eval staleness appears next to Eval score in the Health section.""" + signals = collect_signals(tmp_path) + signals["sessions_since_eval"] = 7 + output = format_dashboard(signals) + assert "Eval staleness:" in output + assert "7 sessions" in output + assert "STALE" in output + + def test_up_to_date_message_when_zero(self, tmp_path: Path) -> None: + signals = collect_signals(tmp_path) + signals["sessions_since_eval"] = 0 + output = format_dashboard(signals) + assert "up to date" in output + + def 
test_sessions_since_eval_in_collect_signals(self, tmp_path: Path) -> None: + """collect_signals includes sessions_since_eval key.""" + signals = collect_signals(tmp_path) + assert "sessions_since_eval" in signals + assert isinstance(signals["sessions_since_eval"], int) diff --git a/.recursive/tests/test_signals.py b/.recursive/tests/test_signals.py index 1e7efa8..0ac82c4 100644 --- a/.recursive/tests/test_signals.py +++ b/.recursive/tests/test_signals.py @@ -293,3 +293,162 @@ def test_date_cutoff_boundary(self, tmp_path: Path) -> None: f"status: done\nsource: pentest\ncompleted: {before}\n", ) assert signals.count_recent_pentest_tasks(tmp_path, days=3) == 1 + + +# --------------------------------------------------------------------------- +# sessions_since_eval tests (task #0242) +# --------------------------------------------------------------------------- + +_VALID_EVAL_CONTENT = ( + "**Date**: 2026-04-08\n" + "| Startup | 9/10 | OK |\n" + "| Discovery | 8/10 | OK |\n" + "| Fix quality | 8/10 | OK |\n" + "| **Total** | **86/100** | |\n" +) + + +class TestSessionsSinceEval: + """Unit tests for the sessions_since_eval signal.""" + + def _make_eval(self, evals_dir: Path, name: str, date: str) -> Path: + """Write a minimal valid eval file with the given date.""" + content = ( + f"**Date**: {date}\n" + "| Startup | 9/10 | OK |\n" + "| Discovery | 8/10 | OK |\n" + "| Fix quality | 8/10 | OK |\n" + "| **Total** | **86/100** | |\n" + ) + f = evals_dir / name + f.write_text(content) + return f + + def _make_index_rows(self, timestamps: list[str]) -> list[dict[str, str]]: + """Build session index rows with the given timestamp strings.""" + return [{"timestamp": ts, "role": "build"} for ts in timestamps] + + def test_no_eval_dir_returns_zero(self, tmp_path: Path) -> None: + evals_dir = tmp_path / "evaluations" + rows = self._make_index_rows(["2026-04-09 01:00"]) + assert signals.sessions_since_eval(evals_dir, rows) == 0 + + def test_empty_eval_dir_returns_zero(self, 
tmp_path: Path) -> None: + evals_dir = tmp_path / "evaluations" + evals_dir.mkdir() + rows = self._make_index_rows(["2026-04-09 01:00"]) + assert signals.sessions_since_eval(evals_dir, rows) == 0 + + def test_all_sessions_after_eval(self, tmp_path: Path) -> None: + evals_dir = tmp_path / "evaluations" + evals_dir.mkdir() + eval_file = self._make_eval(evals_dir, "0001.md", "2026-04-01") + # Force the file mtime to an earlier date via a fixed timestamp + import os + import time + + old_ts = time.mktime(time.strptime("2026-04-01 12:00", "%Y-%m-%d %H:%M")) + os.utime(str(eval_file), (old_ts, old_ts)) + rows = self._make_index_rows( + [ + "2026-04-02 01:00", + "2026-04-03 01:00", + "2026-04-04 01:00", + ] + ) + result = signals.sessions_since_eval(evals_dir, rows) + assert result == 3 + + def test_some_sessions_after_eval(self, tmp_path: Path) -> None: + evals_dir = tmp_path / "evaluations" + evals_dir.mkdir() + eval_file = self._make_eval(evals_dir, "0001.md", "2026-04-05") + import os + import time + + old_ts = time.mktime(time.strptime("2026-04-05 12:00", "%Y-%m-%d %H:%M")) + os.utime(str(eval_file), (old_ts, old_ts)) + rows = self._make_index_rows( + [ + "2026-04-04 01:00", # before eval + "2026-04-05 10:00", # before eval (mtime is 12:00) + "2026-04-06 01:00", # after eval + "2026-04-07 01:00", # after eval + ] + ) + result = signals.sessions_since_eval(evals_dir, rows) + assert result == 2 + + def test_no_sessions_after_eval_returns_zero(self, tmp_path: Path) -> None: + evals_dir = tmp_path / "evaluations" + evals_dir.mkdir() + eval_file = self._make_eval(evals_dir, "0001.md", "2026-04-09") + import os + import time + + future_ts = time.mktime(time.strptime("2026-04-09 23:00", "%Y-%m-%d %H:%M")) + os.utime(str(eval_file), (future_ts, future_ts)) + rows = self._make_index_rows( + [ + "2026-04-09 01:00", # before eval mtime + "2026-04-09 10:00", # before eval mtime + ] + ) + result = signals.sessions_since_eval(evals_dir, rows) + assert result == 0 + + def 
test_uses_latest_eval_file(self, tmp_path: Path) -> None: + evals_dir = tmp_path / "evaluations" + evals_dir.mkdir() + import os + import time + + # Older eval (0001.md) with old date + eval_old = self._make_eval(evals_dir, "0001.md", "2026-04-01") + old_ts = time.mktime(time.strptime("2026-04-01 12:00", "%Y-%m-%d %H:%M")) + os.utime(str(eval_old), (old_ts, old_ts)) + + # Newer eval (0002.md) with recent date -- function should use this one + eval_new = self._make_eval(evals_dir, "0002.md", "2026-04-08") + new_ts = time.mktime(time.strptime("2026-04-08 12:00", "%Y-%m-%d %H:%M")) + os.utime(str(eval_new), (new_ts, new_ts)) + + rows = self._make_index_rows( + [ + "2026-04-02 01:00", # after 0001, before 0002 + "2026-04-09 01:00", # after both evals + ] + ) + # Should use 0002.md (newest), so only 1 session after + result = signals.sessions_since_eval(evals_dir, rows) + assert result == 1 + + def test_empty_sessions_list_returns_zero(self, tmp_path: Path) -> None: + evals_dir = tmp_path / "evaluations" + evals_dir.mkdir() + self._make_eval(evals_dir, "0001.md", "2026-04-01") + result = signals.sessions_since_eval(evals_dir, []) + assert result == 0 + + def test_five_or_more_sessions_triggers_alert_threshold(self, tmp_path: Path) -> None: + """Confirm the alert threshold value: >= 5 should trigger.""" + evals_dir = tmp_path / "evaluations" + evals_dir.mkdir() + eval_file = self._make_eval(evals_dir, "0001.md", "2026-04-01") + import os + import time + + old_ts = time.mktime(time.strptime("2026-04-01 12:00", "%Y-%m-%d %H:%M")) + os.utime(str(eval_file), (old_ts, old_ts)) + rows = self._make_index_rows( + [ + "2026-04-02 01:00", + "2026-04-03 01:00", + "2026-04-04 01:00", + "2026-04-05 01:00", + "2026-04-06 01:00", + ] + ) + result = signals.sessions_since_eval(evals_dir, rows) + assert result == 5 + assert result >= 5 # alert threshold met From e0f953158a179870949c50e63d8ce6de09768d7e Mon Sep 17 00:00:00 2001 From: No9 Labs Date: Thu, 9 Apr 2026 02:20:21 -0400 Subject: 
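[PATCH 3/3] fix: correct self-removal guard in cleanup_worktrees The guard computed current_wt with `git -C "$REPO_DIR" rev-parse --show-toplevel`, which always resolves the toplevel of REPO_DIR itself, so current_wt was the same path no matter where the caller was running and the "skip the worktree we are executing inside" check could never match an agent worktree. Resolve the toplevel from the caller's working directory instead, so that a call made from inside an agent-* worktree skips that worktree. --- .recursive/engine/lib-agent.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.recursive/engine/lib-agent.sh b/.recursive/engine/lib-agent.sh index 81b4eec..c61659d 100644 --- a/.recursive/engine/lib-agent.sh +++ b/.recursive/engine/lib-agent.sh @@ -635,7 +635,7 @@ PY cleanup_worktrees() { local count=0 local current_wt - current_wt="$(git -C "$REPO_DIR" rev-parse --show-toplevel 2>/dev/null || echo "")" + current_wt="$(git rev-parse --show-toplevel 2>/dev/null || echo "")" # Pass 1: remove ALL .claude/worktrees/agent-* worktrees by path. # Uses porcelain format to get one path per stanza reliably.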