diff --git a/README.md b/README.md index 287fc4b..826de72 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,11 @@ python3 -m nightshift multi /repo1 /repo2 --agent claude --test --cycles 1 python3 -m nightshift module-map --write ``` +`python3 -m nightshift test ...` now keeps its state files, runner logs, and +linked worktree under `$TMPDIR/nightshift-test-runs/...` so evaluation clones +stay clean. Full `run` mode still writes repo-local runtime artifacts under +`docs/Nightshift/`. + ### From the installed skill bundle Use the bundled wrapper scripts: diff --git a/docs/architecture/MODULE_MAP.md b/docs/architecture/MODULE_MAP.md index e3fd62a..18a59b2 100644 --- a/docs/architecture/MODULE_MAP.md +++ b/docs/architecture/MODULE_MAP.md @@ -1,6 +1,6 @@ # Module Map -Last updated: 2026-04-05 by session #0059 +Last updated: 2026-04-06 by session #0062 Generated via: `python3 -m nightshift module-map --write` Stale after: 5 newer sessions without a refresh @@ -9,37 +9,39 @@ Read it before opening modules one by one when you need fast orientation. ## Modules (29) -| Module | Lines | Purpose | Key symbols | Last changed | -|---|---:|---|---|---| -| `errors.py` | 7 | Nightshift error types. | `NightshiftError` | 2802c51 | -| `eval_targets.py` | 96 | Known evaluation targets and their repo-specific verification settings. | `infer_target_verify_command`, `_KNOWN_TARGET_VERIFY_COMMANDS` | session #0059 | -| `types.py` | 561 | Strict type definitions for all Nightshift data structures. | `NightshiftConfig`, `DiffScore`, `Counters`, `Baseline` | PR #88 (7e36fa5) | -| `constants.py` | 745 | Module-level constants and tiny utilities used across the package. | `now_local`, `print_status`, `DATA_VERSION`, `SUPPORTED_AGENTS` | PR #88 (7e36fa5) | -| `shell.py` | 161 | Subprocess execution: streaming runner, git helper, shell utilities. 
| `run_command`, `run_capture`, `git`, `command_exists` | PR #27 (9e953eb) | -| `summary.py` | 141 | Feature summary generation for Loop 2 build output. | `generate_feature_summary`, `_API_DIR_SEGMENTS`, `_CLI_DIR_SEGMENTS`, `_CONFIG_DIR_SEGMENTS` | PR #67 (89f8cd6) | -| `cleanup.py` | 337 | Daemon housekeeping -- log rotation, healer archiving, and branch pruning. | `rotate_healer_log`, `rotate_logs`, `prune_orphan_branches`, `_HEALER_ENTRY_RE` | PR #88 (7e36fa5) | -| `compact.py` | 318 | Handoff compaction -- merges numbered handoff files into weekly summaries. | `compact_handoffs`, `_NUMBERED_RE`, `_SECTION_RE`, `_DATE_RE` | PR #83 (56e0c97) | -| `coordination.py` | 192 | Sub-agent coordination for Loop 2 -- detects file overlaps and generates hints. | `extract_file_references`, `detect_overlaps`, `generate_coordination_hints`, `inject_hints` | PR #72 (a5a3e47) | -| `costs.py` | 672 | Cost tracking for daemon sessions -- parse token usage from logs and maintain a ledger. | `parse_session_tokens`, `calculate_cost`, `read_ledger`, `write_ledger` | PR #89 (7211bd4) | -| `module_map.py` | 298 | Generate a persistent module map for fast cross-session orientation. | `module_map_path`, `generate_module_map`, `render_module_map`, `write_module_map` | PR #86 (77e5c25) | -| `readiness.py` | 211 | Production-readiness checks for Loop 2 feature builds. | `collect_changed_files`, `check_secrets`, `check_debug_prints`, `check_test_coverage` | PR #69 (3877225) | -| `scoring.py` | 113 | Post-cycle diff scoring: evaluates production impact of cycle changes. | `score_diff`, `log_score` | PR #10 (3e5f98f) | -| `state.py` | 187 | Shift state: read, write, mutate counters, JSON I/O. | `load_json`, `write_json`, `read_state`, `top_path` | PR #28 (60e4ed5) | -| `config.py` | 241 | Configuration loading, agent resolution, and environment detection. 
| `merge_config`, `prompt_for_agent`, `resolve_agent`, `infer_package_manager` | session #0059 | -| `multi.py` | 117 | Multi-repo shift orchestration: run hardening loops across multiple repos. | `validate_repos`, `format_multi_summary`, `run_multi_shift` | PR #22 (12ac402) | -| `e2e.py` | 113 | End-to-end test runner for Loop 2 feature builds. | `infer_test_command`, `detect_smoke_test`, `run_e2e_tests`, `_MAKEFILE_TEST_TARGET` | PR #70 (95ef827) | -| `profiler.py` | 569 | Repo profiling for Loop 2 -- detects language, framework, dependencies, structure. | `profile_repo` | PR #78 (5cc11a3) | -| `worktree.py` | 213 | Git worktree lifecycle: create, shift log, sync, revert, cleanup. | `canonical_repo_relative_path`, `resolve_nightshift_dir`, `validate_worktree`, `validate_repo_checkout` | PR #96 (34244ff) | -| `cycle.py` | 855 | Per-cycle logic: prompt building, agent dispatch, verification, evaluation. | `extract_json`, `read_repo_instructions`, `wrap_repo_instructions`, `command_for_agent` | PR #96 (34244ff) | -| `evaluation.py` | 874 | Self-evaluation loop: score nightshift runs against real repos. | `clone_target_repo`, `run_test_shift`, `parse_shift_artifacts`, `score_startup` | PR #96 (34244ff) | -| `planner.py` | 483 | Feature planner for Loop 2 -- builds structured plans from repo profiles. | `build_plan_prompt`, `validate_plan`, `parse_plan`, `execution_order` | PR #78 (5cc11a3) | -| `subagent.py` | 281 | Sub-agent spawner for Loop 2 -- executes work orders via codex or claude CLI. | `spawn_task`, `spawn_wave`, `format_wave_result`, `_TASK_COMPLETION_REQUIRED_KEYS` | PR #33 (bd23cc4) | -| `decomposer.py` | 175 | Task decomposer for Loop 2 -- converts FeaturePlans into sub-agent work orders. | `build_work_order_prompt`, `decompose_plan`, `format_work_orders` | PR #78 (5cc11a3) | -| `integrator.py` | 325 | Wave integrator for Loop 2 -- merges sub-agent work, runs tests, handles failures. 
| `collect_wave_files`, `stage_files`, `run_test_suite`, `diagnose_failure` | PR #33 (bd23cc4) | -| `feature.py` | 696 | Loop 2 feature-build orchestration and persisted build state. | `feature_state_path`, `feature_log_dir`, `read_feature_state`, `write_feature_state` | PR #78 (5cc11a3) | -| `cli.py` | 543 | CLI entry points: run, test, summarize, verify-cycle, module-map. | `run_nightshift`, `summarize`, `verify_cycle_cli`, `plan_feature` | PR #96 (34244ff) | -| `__main__.py` | 5 | Entry point for python3 -m nightshift. | `main` | 2802c51 | -| `__init__.py` | 537 | Nightshift -- autonomous overnight codebase improvement agent. | `AGENT_DEFAULT_MODELS`, `BACKEND_DIR_NAMES`, `BACKEND_EXTENSIONS`, `CATEGORY_ORDER` | session #0059 | + +| Module | Lines | Purpose | Key symbols | Last changed | +| ----------------- | ----- | --------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------- | ----------------- | +| `errors.py` | 7 | Nightshift error types. | `NightshiftError` | 2802c51 | +| `eval_targets.py` | 96 | Known evaluation targets and their repo-specific verification settings. | `infer_target_verify_command`, `_KNOWN_TARGET_VERIFY_COMMANDS` | PR #106 (e2d235c) | +| `types.py` | 561 | Strict type definitions for all Nightshift data structures. | `NightshiftConfig`, `DiffScore`, `Counters`, `Baseline` | PR #88 (7e36fa5) | +| `constants.py` | 749 | Module-level constants and tiny utilities used across the package. | `now_local`, `print_status`, `DATA_VERSION`, `SUPPORTED_AGENTS` | session #0062 | +| `shell.py` | 161 | Subprocess execution: streaming runner, git helper, shell utilities. | `run_command`, `run_capture`, `git`, `command_exists` | PR #27 (9e953eb) | +| `summary.py` | 141 | Feature summary generation for Loop 2 build output. 
| `generate_feature_summary`, `_API_DIR_SEGMENTS`, `_CLI_DIR_SEGMENTS`, `_CONFIG_DIR_SEGMENTS` | PR #67 (89f8cd6) | +| `cleanup.py` | 337 | Daemon housekeeping -- log rotation, healer archiving, and branch pruning. | `rotate_healer_log`, `rotate_logs`, `prune_orphan_branches`, `_HEALER_ENTRY_RE` | PR #88 (7e36fa5) | +| `compact.py` | 318 | Handoff compaction -- merges numbered handoff files into weekly summaries. | `compact_handoffs`, `_NUMBERED_RE`, `_SECTION_RE`, `_DATE_RE` | PR #83 (56e0c97) | +| `coordination.py` | 192 | Sub-agent coordination for Loop 2 -- detects file overlaps and generates hints. | `extract_file_references`, `detect_overlaps`, `generate_coordination_hints`, `inject_hints` | PR #72 (a5a3e47) | +| `costs.py` | 672 | Cost tracking for daemon sessions -- parse token usage from logs and maintain a ledger. | `parse_session_tokens`, `calculate_cost`, `read_ledger`, `write_ledger` | PR #89 (7211bd4) | +| `module_map.py` | 298 | Generate a persistent module map for fast cross-session orientation. | `module_map_path`, `generate_module_map`, `render_module_map`, `write_module_map` | PR #86 (77e5c25) | +| `readiness.py` | 211 | Production-readiness checks for Loop 2 feature builds. | `collect_changed_files`, `check_secrets`, `check_debug_prints`, `check_test_coverage` | PR #69 (3877225) | +| `scoring.py` | 113 | Post-cycle diff scoring: evaluates production impact of cycle changes. | `score_diff`, `log_score` | PR #10 (3e5f98f) | +| `state.py` | 187 | Shift state: read, write, mutate counters, JSON I/O. | `load_json`, `write_json`, `read_state`, `top_path` | PR #28 (60e4ed5) | +| `config.py` | 241 | Configuration loading, agent resolution, and environment detection. | `merge_config`, `prompt_for_agent`, `resolve_agent`, `infer_package_manager` | PR #106 (e2d235c) | +| `multi.py` | 117 | Multi-repo shift orchestration: run hardening loops across multiple repos. 
| `validate_repos`, `format_multi_summary`, `run_multi_shift` | PR #22 (12ac402) | +| `e2e.py` | 113 | End-to-end test runner for Loop 2 feature builds. | `infer_test_command`, `detect_smoke_test`, `run_e2e_tests`, `_MAKEFILE_TEST_TARGET` | PR #70 (95ef827) | +| `profiler.py` | 569 | Repo profiling for Loop 2 -- detects language, framework, dependencies, structure. | `profile_repo` | PR #78 (5cc11a3) | +| `worktree.py` | 232 | Git worktree lifecycle: create, shift log, sync, revert, cleanup. | `canonical_repo_relative_path`, `resolve_nightshift_dir`, `resolve_shift_log_relative_dir`, `resolve_test_runtime_dir` | session #0062 | +| `cycle.py` | 855 | Per-cycle logic: prompt building, agent dispatch, verification, evaluation. | `extract_json`, `read_repo_instructions`, `wrap_repo_instructions`, `command_for_agent` | PR #96 (34244ff) | +| `evaluation.py` | 906 | Self-evaluation loop: score nightshift runs against real repos. | `clone_target_repo`, `run_test_shift`, `parse_shift_artifacts`, `score_startup` | session #0062 | +| `planner.py` | 483 | Feature planner for Loop 2 -- builds structured plans from repo profiles. | `build_plan_prompt`, `validate_plan`, `parse_plan`, `execution_order` | PR #78 (5cc11a3) | +| `subagent.py` | 281 | Sub-agent spawner for Loop 2 -- executes work orders via codex or claude CLI. | `spawn_task`, `spawn_wave`, `format_wave_result`, `_TASK_COMPLETION_REQUIRED_KEYS` | PR #33 (bd23cc4) | +| `decomposer.py` | 175 | Task decomposer for Loop 2 -- converts FeaturePlans into sub-agent work orders. | `build_work_order_prompt`, `decompose_plan`, `format_work_orders` | PR #78 (5cc11a3) | +| `integrator.py` | 325 | Wave integrator for Loop 2 -- merges sub-agent work, runs tests, handles failures. | `collect_wave_files`, `stage_files`, `run_test_suite`, `diagnose_failure` | PR #33 (bd23cc4) | +| `feature.py` | 696 | Loop 2 feature-build orchestration and persisted build state. 
| `feature_state_path`, `feature_log_dir`, `read_feature_state`, `write_feature_state` | PR #78 (5cc11a3) | +| `cli.py` | 550 | CLI entry points: run, test, summarize, verify-cycle, module-map. | `run_nightshift`, `summarize`, `verify_cycle_cli`, `plan_feature` | session #0062 | +| `__main__.py` | 5 | Entry point for python3 -m nightshift. | `main` | 2802c51 | +| `__init__.py` | 547 | Nightshift -- autonomous overnight codebase improvement agent. | `AGENT_DEFAULT_MODELS`, `BACKEND_DIR_NAMES`, `BACKEND_EXTENSIONS`, `CATEGORY_ORDER` | session #0062 | + ## Dependency Order @@ -50,8 +52,9 @@ Topological order derived from internal `nightshift.*` imports. ## Recent Shipped Sessions -- PR #105: docs: close stale eval startup task -- PR #104: fix: gate autonomous queue on eval score -- PR #99: test: cover malformed task frontmatter edge case -- PR #98: docs: track task parser review follow-up -- PR #97: feat: add task frontmatter validator +- PR #125: feat: watchdog + natural task creation (no artificial caps) +- PR #124: fix: round 6 audit — 9 remaining issues patched +- PR #123: feat: overseer rewrite — ticket closer, not process auditor +- PR #122: overseer: dedupe auto-release queue +- PR #121: overseer: fix unified-daemon operator docs + diff --git a/docs/changelog/v0.0.8.md b/docs/changelog/v0.0.8.md index 69832de..58e23bb 100644 --- a/docs/changelog/v0.0.8.md +++ b/docs/changelog/v0.0.8.md @@ -14,6 +14,7 @@ Closing the self-maintaining gap: auto-release, auto-changelog, evaluation CLI, - **[docs]** Refreshed `README.md` against the live repo so it now documents the real `python3 -m nightshift` entry points, installed wrapper scripts, current tracker snapshot, current config surface, and the current handoff/learnings/task workflow instead of stale marketing-era commands and percentages. 
(tasks `#0118`, `#0067`) ## Fixed +- **[fix]** `nightshift test` now keeps evaluation state, runner logs, and linked worktrees under an isolated temp-root runtime directory, so rejected Phractal eval runs no longer dirty the cloned target repo while evaluation artifact parsing still finds the shifted state/log files. (task `#0100`) - **[fix]** Shell scripts in `scripts/` now use ASCII-only section dividers and restart/status text, removing box-drawing and em-dash characters that violated repo conventions and rendered inconsistently across terminals/filesystems. (task #0038) - **[meta]** Corrected the authoritative Step 0 evaluation command in `docs/prompt/evolve.md` so fresh-clone Phractal evaluations pass `--repo-dir /tmp/nightshift-eval` from the Nightshift repo root instead of accidentally targeting the Nightshift checkout. (task `#0117`) - **[fix]** Nightshift now resolves the repo's actual `docs/` casing across runtime artifacts, shift-log verification, and evaluation artifact parsing, so repos that use `Docs/Nightshift/` no longer get false rejected cycles or mis-targeted self-evaluation reads, and legitimate final-cycle shift-log summary commits no longer trip the extra-commit guard rail. (tasks `#0098`, `#0121`) @@ -22,6 +23,7 @@ Closing the self-maintaining gap: auto-release, auto-changelog, evaluation CLI, ## Removed ## Internal +- **[test]** Added regression coverage for isolated test-mode runtime artifacts and for rejected test-mode runs leaving the cloned target repo clean. Test suite is now 992 passing. - **[test]** Added regression coverage for repo-URL-based evaluation verifier selection, percent-bearing git remote URLs, and the documentation contract for known target metadata. Test suite is now 943 passing. 
- **[meta]** Recorded `docs/evaluations/0014.md` from a fresh-clone Phractal run, confirmed the default Claude startup path still launches cleanly without `CLAUDECODE` or effort overrides, and closed stale eval task `#0097` so the eval gate now points at the remaining verification/cleanup gaps instead of obsolete startup drift. - **[meta]** Added an eval-score gate to `docs/prompt/evolve-auto.md` and mirrored it in the builder operations docs so, after Step 0, any latest real-repo evaluation below `80/100` forces the autonomous builder to prefer eval-related normal-priority tasks over unrelated queue cleanup. Added prompt-contract regression coverage for the new rule and recorded fresh Phractal evaluation `docs/evaluations/0013.md` at `70/100`. (task `#0131`) diff --git a/docs/handoffs/0062.md b/docs/handoffs/0062.md new file mode 100644 index 0000000..da3806a --- /dev/null +++ b/docs/handoffs/0062.md @@ -0,0 +1,57 @@ +# Handoff #0062 +**Date**: 2026-04-06 +**Version**: v0.0.8 in progress +**Session duration**: ~1h + +## What I Built +- **Isolated eval/test runtime artifacts**: `nightshift test` now writes state files, runner logs, and its linked worktree under an isolated temp-root runtime directory instead of dirtying the target checkout, while normal `run` mode keeps the existing repo-local `docs/Nightshift/` behavior. This closes task `#0100`. +- **Artifact discovery update**: evaluation artifact parsing now reads state/log files from both repo-local runtime artifacts and isolated test-mode locations, including dated logs inside isolated worktrees. 
+- Files: `nightshift/worktree.py`, `nightshift/cli.py`, `nightshift/evaluation.py`, `nightshift/constants.py`, `nightshift/__init__.py`, `tests/test_nightshift.py`, `docs/changelog/v0.0.8.md`, `docs/vision-tracker/TRACKER.md`, `docs/ops/OPERATIONS.md`, `README.md`, `docs/tasks/0100.md`, `docs/tasks/0103.md`, `docs/architecture/MODULE_MAP.md` +- Tests: +2 new, 992 total passing (`make check`) + +## Decisions Made +- **Changed only `test` mode runtime paths.** Full overnight `run` sessions still use repo-local `docs/Nightshift/` so the shipped shift-log workflow stays intact for normal use. +- **Kept repo-relative shift-log paths inside the isolated worktree.** That preserved existing verification behavior while moving only the diagnostic/runtime footprint out of the target repo. + +## Known Issues +- Rejected-cycle findings still do not surface cleanly in the human-readable artifact (`#0101`), and the evaluation scorers still under-read rejected runs (`#0102`). +- Clean-state scoring still would miss future dirty clones if they reappear because it does not inspect git status directly (`#0125`). +- Session-history trend analysis is still weak because `docs/sessions/index.md` only has sparse recent rows; `#0095` and `#0130` remain open. +- Queue archival cleanup from prior overseer sessions is still incomplete (for example the stale done-item cleanup called out in `#0012` remains unaddressed). + +## Learnings Applied +- "Task selection is a mesa-optimization problem" (`docs/learnings/2026-04-04-task-selection-mesa-optimization.md`) + Affects my approach: I treated the queue as authoritative, marked the urgent umbrella task `#0103` as blocked instead of silently skipping it, and moved directly to the first actionable eval-gate task `#0100`. + +## Current State +- Loop 1: 99% — real-repo test-mode runs no longer dirty the target clone, but rejected-run reporting/scoring gaps still keep the eval score below the `80/100` gate. +- Loop 2: 100% — unchanged and complete. 
+- Self-Maintaining: 68% — unchanged; auto-release/auto-changelog and session-index fidelity are still the main backlog. +- Meta-Prompt: 79% — unchanged. +- Overall: 92% — unchanged after rounding. +- Version: v0.0.8 — still in progress; `#0100` is done, but the remaining eval/reporting and self-maintaining tasks are still open. + +## Tracker Delta +- 92% -> 92% overall (Loop 1 behavior improved, but the weighted tracker display did not move after rounding) + +## Generated Tasks +- none + +## Tasks I Did NOT Pick and Why +- `#0103` — urgent but design-blocked umbrella epic; I set it to `status: blocked` and moved to actionable work. +- `#0032` — integration-only task; skipped by environment policy. +- `#0101`, `#0102`, `#0125` — remaining eval follow-ups; deferred because `#0100` was the lowest-numbered actionable eval-gate task. +- `#0045`, `#0060`, `#0063`, `#0066`, `#0069`, `#0071`, `#0072`, `#0073`, `#0075`, `#0077`, `#0078`, `#0079`, `#0080`, `#0081`, `#0082`, `#0084`, `#0085`, `#0088`, `#0089`, `#0090`, `#0091`, `#0092`, `#0093`, `#0094`, `#0095`, `#0096`, `#0104`, `#0105`, `#0106`, `#0107`, `#0108`, `#0109`, `#0110`, `#0111`, `#0112`, `#0113`, `#0114`, `#0115`, `#0116`, `#0119`, `#0120`, `#0122`, `#0123`, `#0124`, `#0127`, `#0128`, `#0129`, `#0130`, `#0132`, `#0133`, `#0134`, `#0136`, `#0137` — lower-priority internal tasks deferred because the eval gate required the first actionable evaluation repair. + +## Evaluate +Run evaluation against Phractal for the changes merged this session. + +## Next Session Should +Tasks: `#0101`, `#0102` +Fallback: if a fresh eval rerun shows the human-readable artifact gap is already covered, move to `#0125` so the clean-state scorer matches the new runtime-dir behavior. 
+ +## Where to Look +- `nightshift/worktree.py` — runtime-dir and shift-log-path helpers for `run` vs `test` +- `nightshift/cli.py` — `run_nightshift()` test-mode isolation and summarize fallback +- `nightshift/evaluation.py` — artifact parsing across repo-local and isolated test roots +- `tests/test_nightshift.py` — regression coverage for rejected test-mode cleanliness and isolated artifact parsing diff --git a/docs/handoffs/LATEST.md b/docs/handoffs/LATEST.md index 2cede78..da3806a 100644 --- a/docs/handoffs/LATEST.md +++ b/docs/handoffs/LATEST.md @@ -1,42 +1,57 @@ -# Handoff #0061 +# Handoff #0062 **Date**: 2026-04-06 **Version**: v0.0.8 in progress -**Session duration**: ~35m +**Session duration**: ~1h ## What I Built -- **Overseer queue audit**: consolidated the duplicate auto-release backlog so the active queue now has one canonical auto-release task instead of two competing versions. -- **Task alignment**: retargeted task `#0066` to `v0.0.8` so it matches the published roadmap in `docs/changelog/README.md`. -- **Duplicate closure**: closed task `#0076` as `duplicate_of: 0066` with a resolution note explaining why `#0066` is the surviving task. -- Files: `docs/tasks/0066.md`, `docs/tasks/0076.md`, `docs/handoffs/0061.md`, `docs/handoffs/LATEST.md` -- Checks: `./scripts/validate-docs.sh`, `./scripts/validate-tasks.sh` +- **Isolated eval/test runtime artifacts**: `nightshift test` now writes state files, runner logs, and its linked worktree under an isolated temp-root runtime directory instead of dirtying the target checkout, while normal `run` mode keeps the existing repo-local `docs/Nightshift/` behavior. This closes task `#0100`. +- **Artifact discovery update**: evaluation artifact parsing now reads state/log files from both repo-local runtime artifacts and isolated test-mode locations, including dated logs inside isolated worktrees. 
+- Files: `nightshift/worktree.py`, `nightshift/cli.py`, `nightshift/evaluation.py`, `nightshift/constants.py`, `nightshift/__init__.py`, `tests/test_nightshift.py`, `docs/changelog/v0.0.8.md`, `docs/vision-tracker/TRACKER.md`, `docs/ops/OPERATIONS.md`, `README.md`, `docs/tasks/0100.md`, `docs/tasks/0103.md`, `docs/architecture/MODULE_MAP.md` +- Tests: +2 new, 992 total passing (`make check`) ## Decisions Made -- **Kept `#0066` as the canonical auto-release task.** It has the lower task number and the broader acceptance criteria, so it is the better single source of truth than `#0076`. -- **Aligned the milestone to `v0.0.8`.** The changelog index still lists auto-release as part of the active `v0.0.8` milestone, so leaving the canonical task on `v0.0.9` would keep the queue out of sync with the published plan. +- **Changed only `test` mode runtime paths.** Full overnight `run` sessions still use repo-local `docs/Nightshift/` so the shipped shift-log workflow stays intact for normal use. +- **Kept repo-relative shift-log paths inside the isolated worktree.** That preserved existing verification behavior while moving only the diagnostic/runtime footprint out of the target repo. ## Known Issues -- Evaluation clones still end dirty and the clean-state scorer still misses that signal (`#0100`, `#0125`). -- `docs/sessions/index.md` now survives daemon resets because it is ignored/untracked, but the file is still effectively header-only and needs fidelity repair (`#0095`). Task `#0130` still needs a stale-task reality check in a future overseer cycle. -- At least one additional stale queue item remains (`#0012` is marked done in place instead of archived), so the broader stale/duplicate sweep is not finished. +- Rejected-cycle findings still do not surface cleanly in the human-readable artifact (`#0101`), and the evaluation scorers still under-read rejected runs (`#0102`). 
+- Clean-state scoring still would miss future dirty clones if they reappear because it does not inspect git status directly (`#0125`). +- Session-history trend analysis is still weak because `docs/sessions/index.md` only has sparse recent rows; `#0095` and `#0130` remain open. +- Queue archival cleanup from prior overseer sessions is still incomplete (for example the stale done-item cleanup called out in `#0012` remains unaddressed). + +## Learnings Applied +- "Task selection is a mesa-optimization problem" (`docs/learnings/2026-04-04-task-selection-mesa-optimization.md`) + Affects my approach: I treated the queue as authoritative, marked the urgent umbrella task `#0103` as blocked instead of silently skipping it, and moved directly to the first actionable eval-gate task `#0100`. ## Current State -- Loop 1: 99% — unchanged; eval cleanup and clean-state scoring still gate the last real-repo hardening work. +- Loop 1: 99% — real-repo test-mode runs no longer dirty the target clone, but rejected-run reporting/scoring gaps still keep the eval score below the `80/100` gate. - Loop 2: 100% — unchanged and complete. -- Self-Maintaining: 68% — unchanged in tracker percentage, but the auto-release backlog is now represented by one task instead of two conflicting ones. +- Self-Maintaining: 68% — unchanged; auto-release/auto-changelog and session-index fidelity are still the main backlog. - Meta-Prompt: 79% — unchanged. - Overall: 92% — unchanged after rounding. -- Version: v0.0.8 — still in progress; the queue now has a single canonical auto-release task targeting this milestone. +- Version: v0.0.8 — still in progress; `#0100` is done, but the remaining eval/reporting and self-maintaining tasks are still open. 
+ +## Tracker Delta +- 92% -> 92% overall (Loop 1 behavior improved, but the weighted tracker display did not move after rounding) + +## Generated Tasks +- none ## Tasks I Did NOT Pick and Why -- `#0130` — likely stale after commit `bdf5d11`, but I did not close it in the same cycle because the duplicated auto-release pair was a nearer-term risk of duplicate implementation work. -- `#0012` — already resolved as stale last cycle; it still needs archival cleanup, but that is lower priority than preventing duplicate release work. +- `#0103` — urgent but design-blocked umbrella epic; I set it to `status: blocked` and moved to actionable work. +- `#0032` — integration-only task; skipped by environment policy. +- `#0101`, `#0102`, `#0125` — remaining eval follow-ups; deferred because `#0100` was the lowest-numbered actionable eval-gate task. +- `#0045`, `#0060`, `#0063`, `#0066`, `#0069`, `#0071`, `#0072`, `#0073`, `#0075`, `#0077`, `#0078`, `#0079`, `#0080`, `#0081`, `#0082`, `#0084`, `#0085`, `#0088`, `#0089`, `#0090`, `#0091`, `#0092`, `#0093`, `#0094`, `#0095`, `#0096`, `#0104`, `#0105`, `#0106`, `#0107`, `#0108`, `#0109`, `#0110`, `#0111`, `#0112`, `#0113`, `#0114`, `#0115`, `#0116`, `#0119`, `#0120`, `#0122`, `#0123`, `#0124`, `#0127`, `#0128`, `#0129`, `#0130`, `#0132`, `#0133`, `#0134`, `#0136`, `#0137` — lower-priority internal tasks deferred because the eval gate required the first actionable evaluation repair. + +## Evaluate +Run evaluation against Phractal for the changes merged this session. ## Next Session Should -Tasks: `#0130` or `#0100` -Fallback: if the next cycle is BUILD, the eval gate should still drive it to `#0100`; if it is OVERSEE again, reassess `#0130` against the live ignored/untracked session-index behavior and keep sweeping stale done items. +Tasks: `#0101`, `#0102` +Fallback: if a fresh eval rerun shows the human-readable artifact gap is already covered, move to `#0125` so the clean-state scorer matches the new runtime-dir behavior. 
 ## Where to Look -- `docs/tasks/0066.md` — canonical auto-release task -- `docs/tasks/0076.md` — closed duplicate with resolution note -- `docs/changelog/README.md` — source of truth showing auto-release in `v0.0.8` -- `docs/handoffs/LATEST.md` — current overseer state snapshot +- `nightshift/worktree.py` — runtime-dir and shift-log-path helpers for `run` vs `test` +- `nightshift/cli.py` — `run_nightshift()` test-mode isolation and summarize fallback +- `nightshift/evaluation.py` — artifact parsing across repo-local and isolated test roots +- `tests/test_nightshift.py` — regression coverage for rejected test-mode cleanliness and isolated artifact parsing diff --git a/docs/healer/log.md b/docs/healer/log.md index ed05744..4c61e84 100644 --- a/docs/healer/log.md +++ b/docs/healer/log.md @@ -219,6 +219,14 @@ Observations from the meta-layer observer. Appended chronologically. ## 2026-04-05 -- Session #0047 (Strategist prompt health) +## 2026-04-06 -- Session #0062 (isolated eval runtime artifacts) + +**System health:** caution + +- **The eval gate finally moved from cleanup to fidelity.** Task `#0100` is now done: the new `test`-mode runtime-dir helper keeps evaluation state, runner logs, and linked worktrees under `$TMPDIR/nightshift-test-runs/...`. A real `python3 -m nightshift test --cycles 0 --repo-dir <clone-dir>` smoke run left `git status --short` empty in the target repo, so the remaining eval backlog is now `#0101`, `#0102`, and `#0125`, not clone cleanliness. +- **Trend visibility is still weak because the session index is too sparse.** `docs/sessions/index.md` currently contains only two recent rows, so the required 5-session scan collapses into almost no historical signal. Existing tasks `#0095` and `#0130` still cover index fidelity/persistence, so I did not create a duplicate. 
+- **Cost recommendations are still noisy because too many sessions are typed as `unknown`.** `cost_analysis('docs/sessions')` shows 21 of 43 analyzed sessions in the `unknown` bucket at `$25.41/session`, far above the labeled `fix`/`feat`/`overseer` averages. That makes the outlier guidance directionally useful but still under-instrumented; the same session-index fidelity tasks already cover the missing structure. + ## 2026-04-05 -- Session #0058 (Close stale eval startup task) **System health:** caution diff --git a/docs/learnings/2026-04-06-test-mode-runtime-isolation.md b/docs/learnings/2026-04-06-test-mode-runtime-isolation.md new file mode 100644 index 0000000..a5c8065 --- /dev/null +++ b/docs/learnings/2026-04-06-test-mode-runtime-isolation.md @@ -0,0 +1,17 @@ +# Learning: Test-mode runtime artifacts stay outside the target repo + +**Date**: 2026-04-06 +**Session**: #0062 +**Type**: pattern + +## What happened + +The first eval-gate cleanup task was not about git cleanup at the end of the run. The real problem was where `nightshift test` wrote its runtime files: state, runner logs, synced shift logs, and the linked worktree all lived under the target checkout, so a rejected evaluation left the clone dirty by construction. + +## What worked + +Keep `run` mode unchanged for normal overnight work, but route `test` mode through a separate runtime-dir helper rooted in `$TMPDIR`. The linked worktree can still contain the repo-relative shift log for verification, while the target checkout itself stays clean. + +## Apply this next time + +When a real-repo eval bug says "cleanup," first check whether the runtime footprint belongs inside the target repo at all. If the artifacts are diagnostic-only, move them to an isolated runtime root instead of trying to scrub them after the fact. 
diff --git a/docs/learnings/INDEX.md b/docs/learnings/INDEX.md index 677516b..370ec0e 100644 --- a/docs/learnings/INDEX.md +++ b/docs/learnings/INDEX.md @@ -54,6 +54,7 @@ Read this file FIRST. Only open individual learning files when they are relevant - [Pattern detection via path segments](2026-04-05-summary-module-pattern-detection.md) — Split paths into segment sets and intersect with keyword sets; more robust than regex for file categorization - [Readiness checks as pure file scanners](2026-04-05-readiness-checks-pure-computation.md) — Pure Path.read_text() + regex beats shell-command wrappers; testable with tmp_path, no mocks needed - [Eval target detection via git config](2026-04-05-eval-target-detection-via-git-config.md) — Repo-specific behavior should read `.git/config` directly instead of shelling out to `git` +- [Test-mode runtime artifacts stay outside the target repo](2026-04-06-test-mode-runtime-isolation.md) — Isolate evaluation/test state, logs, and worktrees under a temp-root runtime dir; keep only full overnight runs in `docs/Nightshift/` - [Pure scorer pattern for evaluation](2026-04-05-pure-scorer-pattern-for-evaluation.md) — Scorers take typed artifacts, return scores; I/O at edges only; 66 tests with zero mocks - [E2E as gate before final verify](2026-04-05-e2e-gate-before-final-verify.md) — New pipeline steps sharing checks with later steps should gate; grep for integration tests to update - [Case-insensitive path resolution needs directory enumeration](2026-04-05-case-insensitive-path-resolution-needs-directory-enumeration.md) — `Path.exists()` keeps the queried casing on macOS; enumerate parent entries to recover the real on-disk path diff --git a/docs/ops/OPERATIONS.md b/docs/ops/OPERATIONS.md index 214c835..17d038f 100644 --- a/docs/ops/OPERATIONS.md +++ b/docs/ops/OPERATIONS.md @@ -546,24 +546,27 @@ of the normal session workflow. 
--- -## System 12: Runtime Artifacts (`docs/Nightshift/`) +## System 12: Runtime Artifacts (`docs/Nightshift/` + isolated test roots) ### What it is -Created when Nightshift runs. NOT checked into git (except shift logs). +Created when Nightshift runs. Full overnight `run` sessions use repo-local +`docs/Nightshift/`; `test`/evaluation sessions keep their machine-readable +artifacts and linked worktrees under `$TMPDIR/nightshift-test-runs/...` so the +target checkout stays clean. ### Files (generated at runtime) | File | Purpose | Git status | |------|---------|------------| -| `YYYY-MM-DD.md` | Shift log (human-readable) | Committed to nightshift branch | -| `YYYY-MM-DD.state.json` | Machine-readable state | Gitignored | -| `YYYY-MM-DD.runner.log` | Raw runner output | Gitignored | -| `worktree-YYYY-MM-DD/` | Isolated git worktree | Gitignored | +| `YYYY-MM-DD.md` | Shift log (human-readable) | Committed to nightshift branch for `run`; lives in the isolated worktree for `test` | +| `YYYY-MM-DD.state.json` | Machine-readable state | Gitignored / temp-root for `test` | +| `YYYY-MM-DD.runner.log` | Raw runner output | Gitignored / temp-root for `test` | +| `worktree-YYYY-MM-DD/` | Isolated git worktree | Gitignored in `run`; temp-root for `test` | ### How to clean up after a test run ```bash -git worktree remove docs/Nightshift/worktree-YYYY-MM-DD +git worktree remove "$TMPDIR"/nightshift-test-runs/REPONAME-DIGEST/worktree-YYYY-MM-DD git branch -d nightshift/YYYY-MM-DD -rm -f docs/Nightshift/YYYY-MM-DD.state.json docs/Nightshift/YYYY-MM-DD.runner.log +rm -rf "$TMPDIR"/nightshift-test-runs/REPONAME-DIGEST ``` --- @@ -785,8 +788,10 @@ git clone https://github.com/fazxes/Phractal.git /tmp/nightshift-test-target cd /tmp/nightshift-test-target python3 -m nightshift test --agent claude --cycles 2 --cycle-minutes 5 -# Check results -cat docs/Nightshift/YYYY-MM-DD.md +# Check results (use the paths printed at the end of the run) +# Shift log: ... +# State file: ... +# Runner log: ...
``` This is a real full-stack project with real issues. Use it to validate that Loop 1 actually finds and fixes things. diff --git a/docs/tasks/0100.md b/docs/tasks/0100.md index 841f405..f2d8162 100644 --- a/docs/tasks/0100.md +++ b/docs/tasks/0100.md @@ -1,11 +1,11 @@ --- -status: pending +status: done priority: normal target: v0.0.8 vision_section: self-maintaining created: 2026-04-05 source: evaluation-0001 -completed: +completed: 2026-04-06 --- # Evaluation #0001: leave evaluation clones clean after rejected cycles diff --git a/docs/tasks/0103.md b/docs/tasks/0103.md index 2e2397c..e850b9e 100644 --- a/docs/tasks/0103.md +++ b/docs/tasks/0103.md @@ -1,5 +1,5 @@ --- -status: pending +status: blocked priority: urgent target: v0.0.9 blocked_reason: design diff --git a/docs/vision-tracker/TRACKER.md b/docs/vision-tracker/TRACKER.md index 41f5975..2b3ddf6 100644 --- a/docs/vision-tracker/TRACKER.md +++ b/docs/vision-tracker/TRACKER.md @@ -1,6 +1,6 @@ # Vision Tracker -Last updated: 2026-04-05 by agent session #0059 (Phractal eval verification metadata). +Last updated: 2026-04-06 by agent session #0062 (isolated eval runtime artifacts). This file is the single source of truth for how close Nightshift is to its vision. Updated by the agent every session. The human never edits this — the agent reads the code, checks what exists, and recalculates. @@ -20,7 +20,7 @@ NIGHTSHIFT VISION ███████████ ## Loop 1 — Hardening Loop (99%) -The core loop still works on the happy path, and the latest runner hardening fixed the `docs/` vs `Docs/` false rejection plus the brittle final-cycle shift-log commit accounting. Real Phractal evaluations now auto-apply a repo-specific baseline verifier, but rejected-run cleanup/reporting still keep Loop 1 just below 100% on real repos. +The core loop still works on the happy path, and the latest runner hardening fixed the `docs/` vs `Docs/` false rejection plus the brittle final-cycle shift-log commit accounting. 
Real Phractal evaluations now auto-apply a repo-specific baseline verifier, and `nightshift test` isolates its state/log/worktree footprint away from the target checkout so rejected eval runs no longer dirty the clone. Rejected-run reporting/scoring still keep Loop 1 just below 100% on real repos. | Component | Status | Progress | |---|---|---| @@ -31,13 +31,13 @@ The core loop still works on the happy path, and the latest runner hardening fix | Runner-enforced guard rails | Done | ████████████████████ 100% | | Machine-readable state | Done | ████████████████████ 100% | | Baseline verification | Done | ████████████████████ 100% | -| Post-cycle verification | In progress | ███████████████████░ 98% | +| Post-cycle verification | In progress | ███████████████████░ 99% | | Shift log generation | Done | ████████████████████ 100% | | Category dominance check | Done | ████████████████████ 100% | | Path bias detection | Done | ████████████████████ 100% | | Hot-file protection | Done | ████████████████████ 100% | | Halt conditions | Done | ████████████████████ 100% | -| Test suite (984 tests) | Done | ████████████████████ 100% | +| Test suite (992 tests) | Done | ████████████████████ 100% | | Post-cycle diff scorer | Done | ████████████████████ 100% | | Cycle-to-cycle state injection | Done | ████████████████████ 100% | | Test writing incentives | Done | ████████████████████ 100% | @@ -48,7 +48,7 @@ The core loop still works on the happy path, and the latest runner hardening fix | run_command timeout fix | Done | ████████████████████ 100% | ### Bugs Found (not yet fixed) -- Rejected evaluation runs still leave the target clone dirty and hide useful findings in the human-readable artifacts, confirming tasks #0100, #0101, and #0102 remain active. +- Rejected evaluation runs still under-report findings in the human-readable artifacts and the scorer still does not account for dirty clones directly, confirming tasks #0101, #0102, and #0125 remain active. 
--- diff --git a/nightshift/__init__.py b/nightshift/__init__.py index 5d4dba9..8a35553 100644 --- a/nightshift/__init__.py +++ b/nightshift/__init__.py @@ -71,6 +71,7 @@ SUBAGENT_DEFAULT_TIMEOUT, SUBAGENT_MAX_TURNS, SUPPORTED_AGENTS, + TEST_RUNTIME_ARTIFACT_DIRNAME, UNTRUSTED_INSTRUCTIONS_PREAMBLE, UNTRUSTED_INSTRUCTIONS_SUFFIX, now_local, @@ -282,6 +283,10 @@ git_changed_files_for_commit, git_name_status_for_commit, install_dependencies_if_needed, + resolve_nightshift_dir, + resolve_runtime_dir, + resolve_shift_log_relative_dir, + resolve_test_runtime_dir, revert_cycle, sync_shift_log, validate_repo_checkout, @@ -334,6 +339,7 @@ "SUBAGENT_DEFAULT_TIMEOUT", "SUBAGENT_MAX_TURNS", "SUPPORTED_AGENTS", + "TEST_RUNTIME_ARTIFACT_DIRNAME", "UNTRUSTED_INSTRUCTIONS_PREAMBLE", "UNTRUSTED_INSTRUCTIONS_SUFFIX", "ArchitectureDoc", @@ -488,6 +494,10 @@ "record_session_bundle", "render_module_map", "resolve_agent", + "resolve_nightshift_dir", + "resolve_runtime_dir", + "resolve_shift_log_relative_dir", + "resolve_test_runtime_dir", "revert_cycle", "rotate_healer_log", "rotate_logs", diff --git a/nightshift/cli.py b/nightshift/cli.py index cfbd65a..422eed9 100644 --- a/nightshift/cli.py +++ b/nightshift/cli.py @@ -40,7 +40,8 @@ ensure_shift_log_committed, ensure_worktree, install_dependencies_if_needed, - resolve_nightshift_dir, + resolve_runtime_dir, + resolve_shift_log_relative_dir, revert_cycle, sync_shift_log, ) @@ -58,13 +59,13 @@ def run_nightshift(args: argparse.Namespace, *, test_mode: bool) -> int: if getattr(args, "cycle_minutes", None) is not None: config["cycle_minutes"] = args.cycle_minutes today = args.date or now_local().strftime("%Y-%m-%d") - nightshift_dir = resolve_nightshift_dir(repo_dir) - nightshift_relative_dir = nightshift_dir.relative_to(repo_dir).as_posix() - worktree_dir = nightshift_dir / f"worktree-{today}" + runtime_dir = resolve_runtime_dir(repo_dir, test_mode=test_mode) + shift_log_dir = resolve_shift_log_relative_dir(repo_dir) + worktree_dir = 
runtime_dir / f"worktree-{today}" branch = f"nightshift/{today}" - shift_log_relative = f"{nightshift_relative_dir}/{today}.md" - state_path = nightshift_dir / f"{today}.state.json" - runner_log = nightshift_dir / f"{today}.runner.log" + shift_log_relative = f"{shift_log_dir}/{today}.md" + state_path = runtime_dir / f"{today}.state.json" + runner_log = runtime_dir / f"{today}.runner.log" base_branch = discover_base_branch(repo_dir) verify_command = infer_verify_command(repo_dir, config) @@ -124,11 +125,12 @@ def run_nightshift(args: argparse.Namespace, *, test_mode: bool) -> int: if not command_exists(agent): raise NightshiftError(f"`{agent}` is not installed or not on PATH.") - nightshift_dir.mkdir(parents=True, exist_ok=True) + runtime_dir.mkdir(parents=True, exist_ok=True) ensure_worktree(repo_dir, worktree_dir, branch) ensure_shift_log(worktree_dir / shift_log_relative, today=today, branch=branch, base_branch=base_branch) ensure_shift_log_committed(worktree_dir, shift_log_relative) - sync_shift_log(worktree_dir, repo_dir, shift_log_relative) + if not test_mode: + sync_shift_log(worktree_dir, repo_dir, shift_log_relative) install_dependencies_if_needed(worktree_dir, runner_log) evaluate_baseline(worktree_dir=worktree_dir, runner_log=runner_log, state=state) write_json(state_path, state) @@ -197,7 +199,7 @@ def run_nightshift(args: argparse.Namespace, *, test_mode: bool) -> int: repo_instructions=target_repo_instructions, ) - message_path = nightshift_dir / f"{today}.cycle-{cycle_number}.json" + message_path = runtime_dir / f"{today}.cycle-{cycle_number}.json" if message_path.exists(): message_path.unlink() cmd = command_for_agent( @@ -297,7 +299,8 @@ def run_nightshift(args: argparse.Namespace, *, test_mode: bool) -> int: cycle_result=cycle_result, verification=verification, ) - sync_shift_log(worktree_dir, repo_dir, shift_log_relative) + if not test_mode: + sync_shift_log(worktree_dir, repo_dir, shift_log_relative) write_json(state_path, state) if 
state["counters"]["empty_cycles"] >= int(config["stop_after_empty_cycles"]): @@ -314,7 +317,8 @@ def run_nightshift(args: argparse.Namespace, *, test_mode: bool) -> int: print_status(f"| Halted: {halt_reason[:36]:<36}|") print_status("+--------------------------------------------------+") print_status("") - print_status(f"Shift log: {repo_dir / shift_log_relative}") + shift_log_path = (worktree_dir / shift_log_relative) if test_mode else (repo_dir / shift_log_relative) + print_status(f"Shift log: {shift_log_path}") print_status(f"State file: {state_path}") print_status(f"Runner log: {runner_log}") print_status(f"Branch: {branch}") @@ -329,8 +333,11 @@ def run_nightshift(args: argparse.Namespace, *, test_mode: bool) -> int: def summarize(args: argparse.Namespace) -> int: repo_dir = Path(args.repo_dir or os.getcwd()).resolve() date = args.date or now_local().strftime("%Y-%m-%d") - nightshift_dir = resolve_nightshift_dir(repo_dir) - state_path = nightshift_dir / f"{date}.state.json" + state_path = resolve_runtime_dir(repo_dir, test_mode=False) / f"{date}.state.json" + if not state_path.exists(): + test_state_path = resolve_runtime_dir(repo_dir, test_mode=True) / f"{date}.state.json" + if test_state_path.exists(): + state_path = test_state_path if not state_path.exists(): raise NightshiftError(f"No state file found at {state_path}") state = load_json(state_path) @@ -344,10 +351,10 @@ def verify_cycle_cli(args: argparse.Namespace) -> int: config = merge_config(repo_dir) agent_name = resolve_agent(config, args.agent) config["agent"] = agent_name - nightshift_dir = resolve_nightshift_dir(repo_dir) - nightshift_relative_dir = nightshift_dir.relative_to(repo_dir).as_posix() + runtime_dir = resolve_runtime_dir(repo_dir, test_mode=False) + shift_log_dir = resolve_shift_log_relative_dir(repo_dir) state = read_state( - nightshift_dir / f"{date}.state.json", + runtime_dir / f"{date}.state.json", today=date, branch=f"nightshift/{date}", agent=agent_name, @@ -357,12 +364,12 @@ def 
verify_cycle_cli(args: argparse.Namespace) -> int: cycle_result = _as_cycle_result(raw_result) if raw_result is not None else None valid, verification = verify_cycle( worktree_dir=Path(args.worktree_dir).resolve(), - shift_log_relative=f"{nightshift_relative_dir}/{date}.md", + shift_log_relative=f"{shift_log_dir}/{date}.md", pre_head=args.pre_head, cycle_result=cycle_result, config=config, state=state, - runner_log=nightshift_dir / f"{date}.runner.log", + runner_log=runtime_dir / f"{date}.runner.log", ) payload = {"valid": valid, "verification": verification} print(json.dumps(payload, indent=2, sort_keys=True)) diff --git a/nightshift/constants.py b/nightshift/constants.py index 57780ba..1faf61a 100644 --- a/nightshift/constants.py +++ b/nightshift/constants.py @@ -743,3 +743,7 @@ def print_status(message: str) -> None: # Timeout in seconds for the entire evaluation shift subprocess. EVALUATION_SHIFT_TIMEOUT = 900 + +# Directory name under the system temp root used for isolated test/evaluation +# runtime artifacts so `nightshift test` does not dirty the target checkout. 
+TEST_RUNTIME_ARTIFACT_DIRNAME = "nightshift-test-runs" diff --git a/nightshift/evaluation.py b/nightshift/evaluation.py index f41d444..47081d9 100644 --- a/nightshift/evaluation.py +++ b/nightshift/evaluation.py @@ -21,7 +21,7 @@ EVALUATION_SHIFT_TIMEOUT, ) from nightshift.types import DimensionScore, EvaluationResult, ShiftArtifacts -from nightshift.worktree import resolve_nightshift_dir +from nightshift.worktree import resolve_runtime_dir # --------------------------------------------------------------------------- # Clone / run helpers @@ -83,10 +83,10 @@ def run_test_shift( def parse_shift_artifacts(repo_dir: Path) -> ShiftArtifacts: """Read state file and shift log from a completed test shift.""" - ns_dir = resolve_nightshift_dir(repo_dir) + runtime_dirs = _runtime_artifact_dirs(repo_dir) # State file - state_files = sorted(glob.glob(str(ns_dir / "*.state.json"))) + state_files = _glob_runtime_candidates(runtime_dirs, "*.state.json") state: dict[str, object] | None = None state_valid = False if state_files: @@ -100,7 +100,7 @@ def parse_shift_artifacts(repo_dir: Path) -> ShiftArtifacts: # Shift log shift_log = "" shift_log_exists = False - log_candidates = sorted(glob.glob(str(ns_dir / "SHIFT-LOG*.md"))) + log_candidates = _shift_log_candidates(runtime_dirs) if log_candidates: try: shift_log = Path(log_candidates[-1]).read_text(encoding="utf-8") @@ -117,6 +117,38 @@ def parse_shift_artifacts(repo_dir: Path) -> ShiftArtifacts: ) +_DATED_SHIFT_LOG_GLOB = "[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9].md" + + +def _runtime_artifact_dirs(repo_dir: Path) -> list[Path]: + runtime_dirs: list[Path] = [] + for test_mode in (False, True): + candidate = resolve_runtime_dir(repo_dir, test_mode=test_mode) + if candidate not in runtime_dirs: + runtime_dirs.append(candidate) + return runtime_dirs + + +def _glob_runtime_candidates(runtime_dirs: list[Path], pattern: str) -> list[str]: + matches: list[str] = [] + for runtime_dir in runtime_dirs: + 
matches.extend(glob.glob(str(runtime_dir / pattern))) + return sorted(set(matches)) + + +def _shift_log_candidates(runtime_dirs: list[Path]) -> list[str]: + patterns = [ + "SHIFT-LOG*.md", + _DATED_SHIFT_LOG_GLOB, + "worktree-*/*/Nightshift/SHIFT-LOG*.md", + f"worktree-*/*/Nightshift/{_DATED_SHIFT_LOG_GLOB}", + ] + matches: list[str] = [] + for pattern in patterns: + matches.extend(_glob_runtime_candidates(runtime_dirs, pattern)) + return sorted(set(matches)) + + # --------------------------------------------------------------------------- # Dimension scorers -- pure functions # --------------------------------------------------------------------------- diff --git a/nightshift/worktree.py b/nightshift/worktree.py index ebd34bd..1d32732 100644 --- a/nightshift/worktree.py +++ b/nightshift/worktree.py @@ -2,8 +2,10 @@ from __future__ import annotations +import hashlib import shutil import subprocess +import tempfile from pathlib import Path, PurePosixPath from nightshift.config import infer_install_command @@ -11,6 +13,7 @@ SAFE_ARTIFACT_DIRS, SAFE_ARTIFACT_GLOBS, SHIFT_LOG_TEMPLATE, + TEST_RUNTIME_ARTIFACT_DIRNAME, now_local, print_status, ) @@ -61,10 +64,26 @@ def canonical_repo_relative_path(repo_dir: Path, relative_path: str) -> str: def resolve_nightshift_dir(repo_dir: Path) -> Path: - relative_dir = canonical_repo_relative_path(repo_dir, "docs/Nightshift") + relative_dir = resolve_shift_log_relative_dir(repo_dir) return repo_dir / Path(*PurePosixPath(relative_dir).parts) +def resolve_shift_log_relative_dir(repo_dir: Path) -> str: + """Return the repo-relative shift-log directory with on-disk casing.""" + return canonical_repo_relative_path(repo_dir, "docs/Nightshift") + + +def resolve_test_runtime_dir(repo_dir: Path) -> Path: + """Return an isolated runtime directory for test-mode runs.""" + digest = hashlib.sha256(str(repo_dir).encode("utf-8")).hexdigest()[:12] + return Path(tempfile.gettempdir()) / TEST_RUNTIME_ARTIFACT_DIRNAME / f"{repo_dir.name}-{digest}" + 
+ +def resolve_runtime_dir(repo_dir: Path, *, test_mode: bool) -> Path: + """Resolve the runtime-artifact directory for the current run mode.""" + return resolve_test_runtime_dir(repo_dir) if test_mode else resolve_nightshift_dir(repo_dir) + + def validate_worktree(worktree_dir: Path) -> None: try: inside = git(worktree_dir, "rev-parse", "--is-inside-work-tree") diff --git a/tests/test_nightshift.py b/tests/test_nightshift.py index 11d780f..4c269c4 100644 --- a/tests/test_nightshift.py +++ b/tests/test_nightshift.py @@ -907,6 +907,51 @@ def test_dry_run_uses_existing_docs_case(self, tmp_path: Path, capsys: pytest.Ca captured = capsys.readouterr() assert "Docs/Nightshift" in captured.out + def test_rejected_test_mode_run_keeps_target_repo_clean(self, tmp_path: Path) -> None: + repo = _init_git_repo_for_test_mode(tmp_path / "repo") + runtime_root = tmp_path / "runtime-root" + parser = nightshift.build_parser() + args = parser.parse_args( + ["test", "--agent", "codex", "--repo-dir", str(repo), "--cycles", "1", "--cycle-minutes", "1"] + ) + verification = { + "files_touched": [], + "dominant_path": "(none)", + "commits": [], + "violations": ["simulated rejection"], + "verify_command": None, + "verify_status": "failed", + "verify_exit_code": 1, + } + + with ( + patch("nightshift.worktree.tempfile.gettempdir", return_value=str(runtime_root)), + patch("nightshift.cli.command_exists", return_value=True), + patch("nightshift.cli.read_repo_instructions", return_value=""), + patch("nightshift.cli.evaluate_baseline"), + patch("nightshift.cli.install_dependencies_if_needed"), + patch("nightshift.cli.command_for_agent", return_value=["python3", "-c", "print('{}')"]), + patch("nightshift.cli.parse_cycle_result", return_value={"fixes": [], "logged_issues": []}), + patch("nightshift.cli.verify_cycle", return_value=(False, verification)), + ): + result = nightshift.run_nightshift(args, test_mode=True) + runtime_dir = nightshift.resolve_test_runtime_dir(repo) + + assert result == 1 + 
status = subprocess.run( + ["git", "status", "--short"], + cwd=repo, + capture_output=True, + text=True, + check=True, + ) + assert status.stdout == "" + assert not (repo / "docs" / "Nightshift").exists() + + today = nightshift.now_local().strftime("%Y-%m-%d") + assert (runtime_dir / f"{today}.state.json").exists() + assert (runtime_dir / f"{today}.runner.log").exists() + class TestValidateWorktree: def test_reports_missing_gitdir_from_git_file(self, tmp_path: Path) -> None: @@ -11492,6 +11537,26 @@ def test_uses_existing_docs_case(self, tmp_path): assert arts["shift_log_exists"] assert arts["state"] == state + def test_reads_isolated_test_runtime_artifacts(self, tmp_path: Path) -> None: + repo = tmp_path / "repo" + repo.mkdir() + runtime_root = tmp_path / "runtime-root" + + with patch("nightshift.worktree.tempfile.gettempdir", return_value=str(runtime_root)): + runtime_dir = nightshift.resolve_test_runtime_dir(repo) + runtime_dir.mkdir(parents=True) + state = {"version": 1, "cycles": []} + (runtime_dir / "2026-04-05.state.json").write_text(json.dumps(state), encoding="utf-8") + log_dir = runtime_dir / "worktree-2026-04-05" / "Docs" / "Nightshift" + log_dir.mkdir(parents=True) + (log_dir / "2026-04-05.md").write_text("# Nightshift log", encoding="utf-8") + + arts = nightshift.parse_shift_artifacts(repo) + + assert arts["state_file_valid"] + assert arts["shift_log_exists"] + assert arts["state"] == state + class TestRunTestShift: def test_targets_repo_dir_explicitly(self, tmp_path: Path) -> None: @@ -11516,6 +11581,17 @@ def test_targets_repo_dir_explicitly(self, tmp_path: Path) -> None: assert kwargs["env"]["PYTHONPATH"] == str(nightshift_dir) +def _init_git_repo_for_test_mode(repo: Path) -> Path: + repo.mkdir() + subprocess.run(["git", "init"], cwd=repo, capture_output=True, check=True) + subprocess.run(["git", "config", "user.email", "test@test.com"], cwd=repo, capture_output=True, check=True) + subprocess.run(["git", "config", "user.name", "Test"], cwd=repo, 
capture_output=True, check=True) + (repo / "README.md").write_text("hello\n", encoding="utf-8") + subprocess.run(["git", "add", "README.md"], cwd=repo, capture_output=True, check=True) + subprocess.run(["git", "commit", "-m", "init"], cwd=repo, capture_output=True, check=True) + return repo + + class TestEvaluationConstants: def test_dimensions_count(self): assert len(nightshift.EVALUATION_DIMENSIONS) == 10