Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .recursive/agents/brain.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ Rules for delegation:
1. Give the sub-agent a SPECIFIC task. Include the task number, file path, and acceptance criteria.
2. **Match agent to zone.** Before delegating, read the task and identify which files it touches. If the task modifies `.recursive/` framework files (engine, prompts, agents, lib, operators, ops, scripts, templates, tests, skills) or root docs (CLAUDE.md, AGENTS.md), delegate to `evolve` (framework zone) — NOT `build`. The `build` agent is for `nightshift/` project code ONLY. Tasks with `target: recursive` in frontmatter are always framework-zone.
- **Security-to-framework path**: If the task queue contains pending tasks with `source: pentest` AND `target: recursive`, delegate them to `evolve`, not `build` and not `security`. The `security` agent produces findings (read-only); the `evolve` agent applies fixes to framework files. This is the only compliant path for confirmed security vulnerabilities in `.recursive/` code. The `pick-role.py` advisory will boost `evolve` when such tasks exist (signal: `pentest_framework_tasks >= 1`).
- **Eval cadence rule**: If `eval_staleness` appears in the dashboard Alerts (i.e., `sessions_since_eval >= 5`), the brain SHOULD include a Phractal E2E eval run as one of its delegations in that session. Delegate to the `build` agent with the eval task (task #0243 or the lowest-numbered pending eval task). The build agent should run `nightshift test --agent claude --cycles 2 --cycle-minutes 5 --repo-dir /tmp/nightshift-eval-NNNN` and write results to `.recursive/evaluations/NNNN.md`. Do not defer the eval indefinitely -- the build-measure-build feedback loop depends on regular measurement against Phractal.
3. Never delegate vague instructions like "improve the codebase".
4. One sub-agent per task. Do not ask one agent to do two unrelated things.
5. For build/review/oversee/achieve/strategize/security/evolve/audit-agent: always use `isolation: "worktree"`.
Expand Down
18 changes: 18 additions & 0 deletions .recursive/engine/dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
read_healer_status,
read_latest_autonomy_score,
read_latest_eval_score,
sessions_since_eval,
)

# All roles tracked by sessions_since
Expand Down Expand Up @@ -162,6 +163,9 @@ def collect_signals(recursive_dir: Path) -> dict[str, object]:
signals["task_composition"] = task_composition
signals["human_tasks"] = human_tasks

# Eval staleness -- dedicated scalar signal for alert threshold
signals["sessions_since_eval"] = sessions_since_eval(evaluations_dir, index_rows)

# Decision-consequence signals (self-awareness)
signals["queue_trend"] = compute_queue_trend(decisions_path)
signals["agent_diversity"] = compute_agent_diversity(delegations)
Expand Down Expand Up @@ -189,6 +193,13 @@ def format_dashboard(signals: dict[str, object]) -> str:
eval_note = " (default)" if signals.get("_eval_is_default") else ""
auto_note = " (default)" if signals.get("_autonomy_is_default") else ""
lines.append(f"Eval score: {signals['eval_score']}/100{eval_note}")
eval_staleness = signals.get("sessions_since_eval", 0)
if isinstance(eval_staleness, int) and eval_staleness >= 5:
lines.append(f"Eval staleness: {eval_staleness} sessions [STALE -- rerun recommended]")
elif isinstance(eval_staleness, int) and eval_staleness > 0:
lines.append(f"Eval staleness: {eval_staleness} sessions")
else:
lines.append("Eval staleness: 0 sessions (up to date)")
lines.append(f"Autonomy score: {signals['autonomy_score']}/100{auto_note}")
lines.append(f"Healer status: {signals['healer_status']}")
nh_note = " (may be inaccurate if gh unavailable)" if signals["needs_human_issues"] == 0 else ""
Expand Down Expand Up @@ -272,6 +283,13 @@ def format_dashboard(signals: dict[str, object]) -> str:
lines.append("")
lines.append("Alerts:")
alerts: list[str] = []
# Eval staleness alert (threshold: 5 sessions)
eval_since = signals.get("sessions_since_eval", 0)
if isinstance(eval_since, int) and eval_since >= 5:
alerts.append(
f" eval_staleness: {eval_since} sessions since last Phractal eval"
" -- delegate build agent to run eval (task #0243 or equivalent)"
)
audit_since = signals.get("sessions_since_audit", 0)
if isinstance(audit_since, int) and audit_since >= 25:
alerts.append(f" Framework audit overdue ({audit_since} sessions since last)")
Expand Down
39 changes: 34 additions & 5 deletions .recursive/engine/lib-agent.sh
Original file line number Diff line number Diff line change
Expand Up @@ -626,23 +626,52 @@ PY

# cleanup_worktrees
# Prunes stale git worktrees left by sub-agent sessions.
# Removes worktrees marked 'prunable' by git.
# Removes ALL .claude/worktrees/agent-* worktrees (active sub-agent dirs),
# plus any worktrees marked 'prunable' by git, then runs git worktree prune.
# Safe to call from the daemon main loop: the daemon runs in REPO_DIR, not
# inside an agent-* worktree, so no currently-executing agent is skipped.
# If called from inside an agent worktree (e.g. evolve), the current dir is
# detected and skipped to avoid self-removal.
cleanup_worktrees() {
    local count=0
    local current_wt
    # Worktree this shell is currently executing in (empty if not in a repo).
    # Used below so an agent calling this (e.g. evolve) never removes itself.
    current_wt="$(git rev-parse --show-toplevel 2>/dev/null || echo "")"

    # Pass 1: remove ALL .claude/worktrees/agent-* worktrees by path.
    # Uses porcelain format to get one path per stanza reliably.
    while IFS= read -r wt_path; do
        # Skip empty lines
        [ -z "$wt_path" ] && continue
        # Skip the main worktree
        [ "$wt_path" = "$REPO_DIR" ] && continue
        # Skip the worktree we are currently executing inside (safety guard)
        [ "$wt_path" = "$current_wt" ] && continue
        # Only target agent worktrees in .claude/worktrees/
        case "$wt_path" in
            */.claude/worktrees/agent-*)
                git -C "$REPO_DIR" worktree remove "$wt_path" --force 2>/dev/null || true
                count=$((count + 1))
                ;;
        esac
    done < <(git -C "$REPO_DIR" worktree list --porcelain 2>/dev/null | grep "^worktree " | sed 's/^worktree //')

    # Pass 2: remove any remaining worktrees marked prunable by git.
    # NOTE(review): the path is taken as the first whitespace-delimited field,
    # so worktree paths containing spaces are not handled -- confirm acceptable.
    while IFS= read -r wt_line; do
        local wt_path
        wt_path=$(echo "$wt_line" | awk '{print $1}')
        # Skip the main worktree and the one we are executing inside
        [ "$wt_path" = "$REPO_DIR" ] && continue
        [ "$wt_path" = "$current_wt" ] && continue
        if echo "$wt_line" | grep -q "prunable" 2>/dev/null; then
            git -C "$REPO_DIR" worktree remove "$wt_path" --force 2>/dev/null || true
            count=$((count + 1))
        fi
    done < <(git -C "$REPO_DIR" worktree list 2>/dev/null)

    # Prune git metadata for any worktrees whose directories no longer exist.
    git -C "$REPO_DIR" worktree prune 2>/dev/null || true

    if [ "$count" -gt 0 ]; then
        echo " Cleaned up $count agent worktree(s)"
    fi
}

Expand Down
45 changes: 45 additions & 0 deletions .recursive/engine/signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,51 @@ def compute_agent_diversity(delegations: list[set[str]], window: int = 10) -> di
return dict(sorted(counts.items(), key=lambda x: -x[1]))


def sessions_since_eval(evaluations_dir: Path, sessions_index: list[dict[str, str]]) -> int:
    """Count sessions since the last evaluation report.

    Reads the latest eval file from evaluations_dir (highest-numbered NNNN.md),
    extracts its ``**Date**: YYYY-MM-DD`` line, then counts how many rows in
    sessions_index have a timestamp after that date.

    The report date is the authoritative baseline: file mtime is only used to
    refine the comparison to minute precision when the mtime falls on the same
    day as the report date. Relying on mtime alone is wrong after a fresh
    clone/reset, because Git sets file mtimes to checkout time, which would
    make a stale eval look current and suppress the eval_staleness alert.

    Returns 0 if no eval files exist or the eval is up-to-date.
    Returns len(sessions_index) if the eval date cannot be determined.

    This is the primary eval-freshness signal used by the dashboard alert.
    compute_eval_staleness() provides the full (sessions, files_changed) tuple
    for the decision-patterns section; sessions_since_eval() is the scalar
    signal consumed by the Alerts section.
    """
    if not evaluations_dir.is_dir():
        return 0
    evals = sorted(evaluations_dir.glob("[0-9]*.md"))
    if not evals:
        return 0
    latest = evals[-1]
    try:
        text = latest.read_text(encoding="utf-8")
    except OSError:
        # Unreadable report: treat every session as post-eval (maximally stale).
        return len(sessions_index)
    dm = re.search(r"\*?\*?[Dd]ate\*?\*?:\s*(\d{4}-\d{2}-\d{2})", text)
    if not dm:
        return len(sessions_index)
    eval_date = dm.group(1)

    # Baseline is the report date. A bare "YYYY-MM-DD" compares lexicographically
    # below any "YYYY-MM-DD HH:MM" with the same date, so same-day sessions count
    # as after the eval (conservative: staleness fires sooner, not later).
    compare_ts = eval_date
    try:
        from datetime import datetime as _dt

        mtime = _dt.fromtimestamp(latest.stat().st_mtime)
        # Only trust mtime for intra-day precision when it agrees with the
        # report date; a checkout-time mtime (fresh clone) will not match.
        if mtime.strftime("%Y-%m-%d") == eval_date:
            compare_ts = mtime.strftime("%Y-%m-%d %H:%M")
    except OSError:
        pass

    # Count trailing sessions newer than the baseline. Assumes sessions_index
    # is in chronological order, so we can stop at the first older row.
    count = 0
    for row in reversed(sessions_index):
        if row.get("timestamp", "") > compare_ts:
            count += 1
        else:
            break
    return count


def compute_eval_staleness(evaluations_dir: Path, sessions_index: list[dict[str, str]]) -> tuple[int, int]:
"""How stale is the eval? Returns (sessions_since_eval, files_changed).

Expand Down
47 changes: 47 additions & 0 deletions .recursive/tests/test_dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,3 +273,50 @@ def test_classifies_safe_vs_specific(self, tmp_path: Path) -> None:
)
result = compute_commitment_quality(log)
assert "2/2 MET" in result


class TestEvalStalenessAlert:
    """Dashboard alert fires when sessions_since_eval >= 5 (task #0242)."""

    def _render(self, tmp_path: Path, staleness: int) -> str:
        """Render the dashboard with sessions_since_eval forced to *staleness*."""
        sig = collect_signals(tmp_path)
        sig["sessions_since_eval"] = staleness
        return format_dashboard(sig)

    def test_alert_fires_at_threshold(self, tmp_path: Path) -> None:
        rendered = self._render(tmp_path, 5)
        assert "eval_staleness" in rendered
        assert "STALE" in rendered  # top-level staleness indicator
        assert "5 sessions since last Phractal eval" in rendered  # alert text

    def test_alert_fires_above_threshold(self, tmp_path: Path) -> None:
        rendered = self._render(tmp_path, 14)
        assert "eval_staleness" in rendered
        assert "14 sessions since last Phractal eval" in rendered

    def test_no_alert_below_threshold(self, tmp_path: Path) -> None:
        # Alert text should not appear when below threshold
        rendered = self._render(tmp_path, 3)
        assert "sessions since last Phractal eval" not in rendered

    def test_staleness_shown_in_health_section(self, tmp_path: Path) -> None:
        """Eval staleness appears next to Eval score in the Health section."""
        rendered = self._render(tmp_path, 7)
        assert "Eval staleness:" in rendered
        assert "7 sessions" in rendered
        assert "STALE" in rendered

    def test_up_to_date_message_when_zero(self, tmp_path: Path) -> None:
        assert "up to date" in self._render(tmp_path, 0)

    def test_sessions_since_eval_in_collect_signals(self, tmp_path: Path) -> None:
        """collect_signals includes sessions_since_eval key."""
        sig = collect_signals(tmp_path)
        assert "sessions_since_eval" in sig
        assert isinstance(sig["sessions_since_eval"], int)
159 changes: 159 additions & 0 deletions .recursive/tests/test_signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,3 +293,162 @@ def test_date_cutoff_boundary(self, tmp_path: Path) -> None:
f"status: done\nsource: pentest\ncompleted: {before}\n",
)
assert signals.count_recent_pentest_tasks(tmp_path, days=3) == 1


# ---------------------------------------------------------------------------
# sessions_since_eval tests (task #0242)
# ---------------------------------------------------------------------------

# NOTE(review): module-level fixture mirroring the content _make_eval writes;
# it appears unused by the visible tests (each builds its own file through
# _make_eval) -- confirm no other module imports it before removing.
_VALID_EVAL_CONTENT = (
    "**Date**: 2026-04-08\n"
    "| Startup | 9/10 | OK |\n"
    "| Discovery | 8/10 | OK |\n"
    "| Fix quality | 8/10 | OK |\n"
    "| **Total** | **86/100** | |\n"
)


class TestSessionsSinceEval:
    """Unit tests for the sessions_since_eval signal."""

    @staticmethod
    def _set_mtime(path: Path, stamp: str) -> None:
        """Pin *path*'s atime/mtime to a local 'YYYY-MM-DD HH:MM' timestamp."""
        import os
        import time

        epoch = time.mktime(time.strptime(stamp, "%Y-%m-%d %H:%M"))
        os.utime(str(path), (epoch, epoch))

    def _make_eval(self, evals_dir: Path, name: str, date: str) -> Path:
        """Write a minimal valid eval file with the given date."""
        report = evals_dir / name
        report.write_text(
            f"**Date**: {date}\n"
            "| Startup | 9/10 | OK |\n"
            "| Discovery | 8/10 | OK |\n"
            "| Fix quality | 8/10 | OK |\n"
            "| **Total** | **86/100** | |\n"
        )
        return report

    def _make_index_rows(self, timestamps: list[str]) -> list[dict[str, str]]:
        """Build session index rows with the given timestamp strings."""
        return [{"timestamp": stamp, "role": "build"} for stamp in timestamps]

    def test_no_eval_dir_returns_zero(self, tmp_path: Path) -> None:
        missing_dir = tmp_path / "evaluations"
        rows = self._make_index_rows(["2026-04-09 01:00"])
        assert signals.sessions_since_eval(missing_dir, rows) == 0

    def test_empty_eval_dir_returns_zero(self, tmp_path: Path) -> None:
        evals_dir = tmp_path / "evaluations"
        evals_dir.mkdir()
        rows = self._make_index_rows(["2026-04-09 01:00"])
        assert signals.sessions_since_eval(evals_dir, rows) == 0

    def test_all_sessions_after_eval(self, tmp_path: Path) -> None:
        evals_dir = tmp_path / "evaluations"
        evals_dir.mkdir()
        report = self._make_eval(evals_dir, "0001.md", "2026-04-01")
        # Force the file mtime to an earlier date via a fixed timestamp
        self._set_mtime(report, "2026-04-01 12:00")
        rows = self._make_index_rows(
            ["2026-04-02 01:00", "2026-04-03 01:00", "2026-04-04 01:00"]
        )
        assert signals.sessions_since_eval(evals_dir, rows) == 3

    def test_some_sessions_after_eval(self, tmp_path: Path) -> None:
        evals_dir = tmp_path / "evaluations"
        evals_dir.mkdir()
        report = self._make_eval(evals_dir, "0001.md", "2026-04-05")
        self._set_mtime(report, "2026-04-05 12:00")
        rows = self._make_index_rows(
            [
                "2026-04-04 01:00",  # before eval
                "2026-04-05 10:00",  # before eval (mtime is 12:00)
                "2026-04-06 01:00",  # after eval
                "2026-04-07 01:00",  # after eval
            ]
        )
        assert signals.sessions_since_eval(evals_dir, rows) == 2

    def test_no_sessions_after_eval_returns_zero(self, tmp_path: Path) -> None:
        evals_dir = tmp_path / "evaluations"
        evals_dir.mkdir()
        report = self._make_eval(evals_dir, "0001.md", "2026-04-09")
        self._set_mtime(report, "2026-04-09 23:00")
        rows = self._make_index_rows(
            [
                "2026-04-09 01:00",  # before eval mtime
                "2026-04-09 10:00",  # before eval mtime
            ]
        )
        assert signals.sessions_since_eval(evals_dir, rows) == 0

    def test_uses_latest_eval_file(self, tmp_path: Path) -> None:
        evals_dir = tmp_path / "evaluations"
        evals_dir.mkdir()
        # Older eval (0001.md) with old date
        older = self._make_eval(evals_dir, "0001.md", "2026-04-01")
        self._set_mtime(older, "2026-04-01 12:00")
        # Newer eval (0002.md) with recent date -- function should use this one
        newer = self._make_eval(evals_dir, "0002.md", "2026-04-08")
        self._set_mtime(newer, "2026-04-08 12:00")
        rows = self._make_index_rows(
            [
                "2026-04-02 01:00",  # after 0001, before 0002
                "2026-04-09 01:00",  # after both evals
            ]
        )
        # Should use 0002.md (newest), so only 1 session after
        assert signals.sessions_since_eval(evals_dir, rows) == 1

    def test_empty_sessions_list_returns_zero(self, tmp_path: Path) -> None:
        evals_dir = tmp_path / "evaluations"
        evals_dir.mkdir()
        self._make_eval(evals_dir, "0001.md", "2026-04-01")
        assert signals.sessions_since_eval(evals_dir, []) == 0

    def test_five_or_more_sessions_triggers_alert_threshold(self, tmp_path: Path) -> None:
        """Confirm the alert threshold value: >= 5 should trigger."""
        evals_dir = tmp_path / "evaluations"
        evals_dir.mkdir()
        report = self._make_eval(evals_dir, "0001.md", "2026-04-01")
        self._set_mtime(report, "2026-04-01 12:00")
        rows = self._make_index_rows(
            [f"2026-04-0{day} 01:00" for day in range(2, 7)]
        )
        result = signals.sessions_since_eval(evals_dir, rows)
        assert result == 5
        assert result >= 5  # alert threshold met
Loading