From d3bf815627fdcfcf87f1f8abbabbb2d6749419a0 Mon Sep 17 00:00:00 2001 From: mohamed-elkholy95 Date: Sat, 30 May 2026 21:52:01 -0400 Subject: [PATCH 1/8] docs(agents): add simplicity scope discipline --- AGENTS.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index 3c081be..dfc2761 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -38,6 +38,22 @@ subagents, skills, web/visualization UIs, and multi-provider LLM authentication. any "Actionable comments posted: N" findings. Do not merge while CodeRabbit is still reviewing or on an unreviewed commit; surface unresolved actionable findings instead of merging past them. +## Simplicity and scope discipline + +- Before implementing, identify the Minimum Viable Change: the smallest code delta that solves the + current task with acceptable correctness and maintainability. +- Prefer the 80/20 fix over broad infrastructure. Do not build large systems for edge cases unless + the task explicitly requires them. +- Avoid speculative abstractions: no new generic interfaces, configuration layers, wrappers around + built-ins, or design patterns unless the current requirement directly needs them. +- Match existing conventions for naming, layout, error handling, async boundaries, and tests. If the + existing pattern is imperfect but not the cause of the bug, follow it. +- Fix the target issue only. Do not refactor unrelated systems, reformat untouched code, or create + cascading changes unless required to address the root cause. +- When reviewing code, treat low-delta maintainability as a primary quality signal. Flag unnecessary + abstraction, custom logic where native features or existing helpers suffice, and changes a junior + maintainer would struggle to follow. + ## Quick commands Use these first; they encode the supported local workflow. From 07516c51e65e8c0892b47e23d71a0f60f873a728 Mon Sep 17 00:00:00 2001 From: mohamed-elkholy95 Date: Sat, 30 May 2026 23:00:00 -0400 Subject: [PATCH 2/8] fix(background): preserve terminal recovery state --- src/pythinker_code/background/manager.py | 133 +++++++++++++---------- tests/background/test_manager.py | 21 ++++ 2 files changed, 94 insertions(+), 60 deletions(-) diff --git a/src/pythinker_code/background/manager.py b/src/pythinker_code/background/manager.py index 6732e4c..58a1bee 100644 --- a/src/pythinker_code/background/manager.py +++ b/src/pythinker_code/background/manager.py @@ -7,6 +7,7 @@ import sys import threading import time +from collections.abc import Callable from pathlib import Path from typing import TYPE_CHECKING, Any, Literal @@ -634,14 +635,17 @@ def _recover_agent_view(self, view: TaskView, *, now: float, live_agent_ids: set ``TaskRuntime`` — closing the window where a process crash between :meth:`finalize_agent_task`'s two writes, or between :meth:`kill` and the runner's cancellation handler, left the two records divergent. + + The runtime is re-read under the store's update lock before recovery so a + stale ``TaskView`` snapshot cannot overwrite an authoritative terminal + status that was written after the snapshot was created. """ agent_id_raw = (view.spec.kind_payload or {}).get("agent_id") agent_id = agent_id_raw if isinstance(agent_id_raw, str) else None - runtime_status: TaskStatus = view.runtime.status - if not is_terminal_status(runtime_status): - if view.spec.id in self._live_agent_tasks: - return - runtime = view.runtime.model_copy() + + def recover_orphan(runtime: TaskRuntime) -> bool: + if is_terminal_status(runtime.status) or view.spec.id in self._live_agent_tasks: + return False runtime.finished_at = now runtime.updated_at = now runtime.status = "recoverable" if agent_id is not None else "lost" @@ -651,9 +655,10 @@ def _recover_agent_view(self, view: TaskView, *, now: float, live_agent_ids: set if agent_id is not None else "In-process background agent is no longer running" ) - self._store.write_runtime(view.spec.id, runtime) - runtime_status = runtime.status - self._reconcile_subagent_status(agent_id, runtime_status, live_agent_ids) + return True + + runtime = self._store.update_runtime(view.spec.id, recover_orphan) + self._reconcile_subagent_status(agent_id, runtime.status, live_agent_ids) def _reconcile_subagent_status( self, agent_id: str | None, runtime_status: TaskStatus, live_agent_ids: set[str] @@ -814,105 +819,113 @@ def finalize_agent_task( if subagent_status is not None: self._runtime.subagent_store.update_instance(agent_id, status=subagent_status) - def _mark_task_running(self, task_id: str) -> None: - with self._store._runtime_lock(task_id): # pyright: ignore[reportPrivateUsage] - runtime = self._store.read_runtime(task_id) + def _transition_status( + self, + task_id: str, + *, + mutate: Callable[[TaskRuntime], None], + ) -> TaskRuntime | None: + """Locked, terminal-guarded status write via the public store API. + + Stamps ``updated_at`` then applies ``mutate``. Returns the resulting + runtime when the transition was applied, or ``None`` when the task was + already terminal (no write performed, so callers skip telemetry). + """ + applied = False + + def _apply(runtime: TaskRuntime) -> bool: + nonlocal applied if is_terminal_status(runtime.status): - return - runtime.status = "running" + return False runtime.updated_at = time.time() + mutate(runtime) + applied = True + return True + + runtime = self._store.update_runtime(task_id, _apply) + return runtime if applied else None + + def _mark_task_running(self, task_id: str) -> None: + def mutate(runtime: TaskRuntime) -> None: + runtime.status = "running" runtime.heartbeat_at = runtime.updated_at runtime.failure_reason = None - self._store._write_runtime_unlocked(task_id, runtime) # pyright: ignore[reportPrivateUsage] + + self._transition_status(task_id, mutate=mutate) def _mark_task_awaiting_approval(self, task_id: str, reason: str) -> None: - with self._store._runtime_lock(task_id): # pyright: ignore[reportPrivateUsage] - runtime = self._store.read_runtime(task_id) - if is_terminal_status(runtime.status): - return + def mutate(runtime: TaskRuntime) -> None: runtime.status = "awaiting_approval" - runtime.updated_at = time.time() runtime.failure_reason = reason - self._store._write_runtime_unlocked(task_id, runtime) # pyright: ignore[reportPrivateUsage] + + self._transition_status(task_id, mutate=mutate) def _mark_task_completed(self, task_id: str) -> None: - with self._store._runtime_lock(task_id): # pyright: ignore[reportPrivateUsage] - runtime = self._store.read_runtime(task_id) - if is_terminal_status(runtime.status): - return + def mutate(runtime: TaskRuntime) -> None: runtime.status = "completed" - runtime.updated_at = time.time() runtime.finished_at = runtime.updated_at runtime.failure_reason = None - self._store._write_runtime_unlocked(task_id, runtime) # pyright: ignore[reportPrivateUsage] - from pythinker_code.telemetry import track - if runtime.started_at and runtime.finished_at: - duration = runtime.finished_at - runtime.started_at - track("background_task_completed", success=True, duration_s=duration) + runtime = self._transition_status(task_id, mutate=mutate) + if runtime is not None and runtime.started_at and runtime.finished_at: + from pythinker_code.telemetry import track + + track( + "background_task_completed", + success=True, + duration_s=runtime.finished_at - runtime.started_at, + ) def _mark_task_failed(self, task_id: str, reason: str) -> None: - with self._store._runtime_lock(task_id): # pyright: ignore[reportPrivateUsage] - runtime = self._store.read_runtime(task_id) - if is_terminal_status(runtime.status): - return + def mutate(runtime: TaskRuntime) -> None: runtime.status = "failed" - runtime.updated_at = time.time() runtime.finished_at = runtime.updated_at runtime.failure_reason = reason - self._store._write_runtime_unlocked(task_id, runtime) # pyright: ignore[reportPrivateUsage] - from pythinker_code.telemetry import track - if runtime.started_at and runtime.finished_at: - duration = runtime.finished_at - runtime.started_at + runtime = self._transition_status(task_id, mutate=mutate) + if runtime is not None and runtime.started_at and runtime.finished_at: + from pythinker_code.telemetry import track + track( "background_task_completed", success=False, - duration_s=duration, + duration_s=runtime.finished_at - runtime.started_at, reason="error", ) def _mark_task_timed_out(self, task_id: str, reason: str) -> None: - with self._store._runtime_lock(task_id): # pyright: ignore[reportPrivateUsage] - runtime = self._store.read_runtime(task_id) - if is_terminal_status(runtime.status): - return + def mutate(runtime: TaskRuntime) -> None: runtime.status = "failed" - runtime.updated_at = time.time() runtime.finished_at = runtime.updated_at runtime.interrupted = True runtime.timed_out = True runtime.failure_reason = reason - self._store._write_runtime_unlocked(task_id, runtime) # pyright: ignore[reportPrivateUsage] - from pythinker_code.telemetry import track - if runtime.started_at and runtime.finished_at: - duration = runtime.finished_at - runtime.started_at + runtime = self._transition_status(task_id, mutate=mutate) + if runtime is not None and runtime.started_at and runtime.finished_at: + from pythinker_code.telemetry import track + track( "background_task_completed", success=False, - duration_s=duration, + duration_s=runtime.finished_at - runtime.started_at, reason="timeout", ) def _mark_task_killed(self, task_id: str, reason: str) -> None: - with self._store._runtime_lock(task_id): # pyright: ignore[reportPrivateUsage] - runtime = self._store.read_runtime(task_id) - if is_terminal_status(runtime.status): - return + def mutate(runtime: TaskRuntime) -> None: runtime.status = "killed" - runtime.updated_at = time.time() runtime.finished_at = runtime.updated_at runtime.interrupted = True runtime.failure_reason = reason - self._store._write_runtime_unlocked(task_id, runtime) # pyright: ignore[reportPrivateUsage] - from pythinker_code.telemetry import track - if runtime.started_at and runtime.finished_at: - duration = runtime.finished_at - runtime.started_at + runtime = self._transition_status(task_id, mutate=mutate) + if runtime is not None and runtime.started_at and runtime.finished_at: + from pythinker_code.telemetry import track + track( "background_task_completed", success=False, - duration_s=duration, + duration_s=runtime.finished_at - runtime.started_at, reason="killed", ) diff --git a/tests/background/test_manager.py b/tests/background/test_manager.py index 4dc2943..109faa8 100644 --- a/tests/background/test_manager.py +++ b/tests/background/test_manager.py @@ -753,6 +753,27 @@ def test_recover_reconciles_subagent_record_for_terminal_agent_task(runtime): assert manager.store.merged_view("acrashtask").runtime.status == "failed" +def test_recover_agent_view_does_not_clobber_terminal_runtime_from_stale_view(runtime): + manager = runtime.background_tasks + spec = _seed_agent_task( + runtime, + task_id="astaletask", + agent_id="astaleagent", + task_status="running", + ) + stale_view = manager.store.merged_view(spec.id) + now = time.time() + manager.store.write_runtime( + spec.id, + TaskRuntime(status="completed", updated_at=now, finished_at=now), + ) + + manager._recover_agent_view(stale_view, now=now + 1, live_agent_ids=set()) + + assert manager.store.merged_view(spec.id).runtime.status == "completed" + assert runtime.subagent_store.require_instance("astaleagent").status == "idle" + + def test_recover_reconciles_subagent_record_for_killed_agent_task(runtime): # kill() marks TaskRuntime killed first, then cancels the asyncio task. A # crash before the runner's cancel handler writes the instance leaves it From d0da45d1e9b4461677c736c67d06d48b6120eb08 Mon Sep 17 00:00:00 2001 From: mohamed-elkholy95 Date: Sat, 30 May 2026 23:00:04 -0400 Subject: [PATCH 3/8] refactor(memory): remove unused retriever seam --- src/pythinker_code/memory/consolidation.py | 2 +- src/pythinker_code/memory/recall.py | 4 -- src/pythinker_code/memory/retriever.py | 13 ++---- src/pythinker_code/memory/retriever_sqlite.py | 42 ------------------- src/pythinker_code/project_memory.py | 4 ++ tests/core/test_memory_phase_bcd.py | 9 ---- tests/core/test_recall_provider.py | 3 -- 7 files changed, 8 insertions(+), 69 deletions(-) delete mode 100644 src/pythinker_code/memory/retriever_sqlite.py diff --git a/src/pythinker_code/memory/consolidation.py b/src/pythinker_code/memory/consolidation.py index 9381392..69c5c39 100644 --- a/src/pythinker_code/memory/consolidation.py +++ b/src/pythinker_code/memory/consolidation.py @@ -33,7 +33,7 @@ def _memory_entry_hash(content: str) -> str: async def inbox_dir(store: ProjectMemoryStore) -> Path: - root = await store._ensure_dir() # pyright: ignore[reportPrivateUsage] + root = await store.ensure_root() path = root / "memory" / "inbox" path.mkdir(parents=True, exist_ok=True) return path diff --git a/src/pythinker_code/memory/recall.py b/src/pythinker_code/memory/recall.py index 03ab5fc..b659290 100644 --- a/src/pythinker_code/memory/recall.py +++ b/src/pythinker_code/memory/recall.py @@ -89,12 +89,10 @@ async def build_recall_block( query: RecallQuery, open_todos: list[tuple[str, list[str]]], budget_tokens: int, - store_path: str, ) -> str: ranked = await LexicalRetriever(candidates).retrieve(query, budget_tokens) if not ranked and not open_todos: return "" - _ = store_path lines: list[str] = ["Relevant project memory — recalled by relevance, not the full store."] if open_todos: lines.append("\n## Open todos from recent sessions") @@ -249,13 +247,11 @@ async def get_injections( text=_last_user_text(history), labels=tuple(title for _label, items in open_todos for title in items), ) - store_root = await self._store._ensure_dir() # pyright: ignore[reportPrivateUsage] block = await build_recall_block( candidates=candidates, query=query, open_todos=open_todos, budget_tokens=INJECTION_BUDGET_BYTES // 4, - store_path=str(store_root / "memory"), ) except Exception: logger.debug("recall: snapshot failed") diff --git a/src/pythinker_code/memory/retriever.py b/src/pythinker_code/memory/retriever.py index c8f8fed..03a9f55 100644 --- a/src/pythinker_code/memory/retriever.py +++ b/src/pythinker_code/memory/retriever.py @@ -4,7 +4,7 @@ import re import time import unicodedata -from abc import ABC, abstractmethod +from collections import Counter from dataclasses import dataclass, replace # Unicode word runs, with underscores treated as separators for snake_case. @@ -46,12 +46,7 @@ class RankedBlock: content: str -class Retriever(ABC): - @abstractmethod - async def retrieve(self, query: RecallQuery, budget_tokens: int) -> list[RankedBlock]: ... - - -class LexicalRetriever(Retriever): +class LexicalRetriever: """Hand-rolled BM25 + recency decay + label/path boost. Stdlib only.""" def __init__(self, candidates: list[RankedBlock], *, now: float | None = None) -> None: @@ -73,9 +68,7 @@ async def retrieve(self, query: RecallQuery, budget_tokens: int) -> list[RankedB scored: list[RankedBlock] = [] for cand, doc in zip(self._candidates, docs, strict=True): dl = len(doc) or 1 - tf: dict[str, int] = {} - for term in doc: - tf[term] = tf.get(term, 0) + 1 + tf = Counter(doc) bm25 = 0.0 for term in q_terms: if term not in tf: diff --git a/src/pythinker_code/memory/retriever_sqlite.py b/src/pythinker_code/memory/retriever_sqlite.py deleted file mode 100644 index 14dc692..0000000 --- a/src/pythinker_code/memory/retriever_sqlite.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import annotations - -import sqlite3 - -from pythinker_code.memory.retriever import LexicalRetriever, RankedBlock, RecallQuery, Retriever - - -def sqlite_fts5_available() -> bool: - """Return whether this Python sqlite build supports FTS5.""" - try: - conn = sqlite3.connect(":memory:") - try: - conn.execute("CREATE VIRTUAL TABLE _probe USING fts5(x)") - finally: - conn.close() - except sqlite3.Error: - return False - return True - - -class SqliteFts5Retriever(Retriever): - """FTS5-capability seam with lexical fallback. - - The SQLite index is intentionally derived/rebuildable and is not wired into the - default app path yet. Until the indexed path is expanded, this class preserves - Retriever semantics by delegating to the dependency-free lexical retriever. - """ - - def __init__( - self, candidates: list[RankedBlock], *, fts5_available: bool | None = None - ) -> None: - self._candidates = candidates - self._fts5_available = sqlite_fts5_available() if fts5_available is None else fts5_available - self._fallback = LexicalRetriever(candidates) - - @property - def uses_fts5(self) -> bool: - return self._fts5_available - - async def retrieve(self, query: RecallQuery, budget_tokens: int) -> list[RankedBlock]: - # Derived FTS5 indexing is intentionally deferred until measurements justify it. - return await self._fallback.retrieve(query, budget_tokens) diff --git a/src/pythinker_code/project_memory.py b/src/pythinker_code/project_memory.py index f105cd5..5644e49 100644 --- a/src/pythinker_code/project_memory.py +++ b/src/pythinker_code/project_memory.py @@ -116,6 +116,10 @@ async def _ensure_dir(self) -> Path: self._root = root return self._root + async def ensure_root(self) -> Path: + """Public entry point for :meth:`_ensure_dir` used by collaborators.""" + return await self._ensure_dir() + def _filename(self, target: Target) -> str: return "USER.md" if target == "user" else "MEMORY.md" diff --git a/tests/core/test_memory_phase_bcd.py b/tests/core/test_memory_phase_bcd.py index 709b890..b085fbe 100644 --- a/tests/core/test_memory_phase_bcd.py +++ b/tests/core/test_memory_phase_bcd.py @@ -15,7 +15,6 @@ RecallQuery, estimate_tokens, ) -from pythinker_code.memory.retriever_sqlite import SqliteFts5Retriever from pythinker_code.project_memory import ProjectMemoryStore from pythinker_code.session_state import SessionState, TodoItemState @@ -151,11 +150,3 @@ async def test_generate_inbox_candidates_ignores_corrupt_duplicate_file(tmp_path (inbox / f"{first[0].id}.json").write_text("{not json", encoding="utf-8") assert await generate_inbox_candidates(store, _hp(repo)) == [] - - -async def test_sqlite_retriever_falls_back_to_lexical(): - block = _ranked("lexical sqlite fallback") - out = await SqliteFts5Retriever([block], fts5_available=False).retrieve( - RecallQuery(text="sqlite fallback"), budget_tokens=100 - ) - assert out and out[0].content == "lexical sqlite fallback" diff --git a/tests/core/test_recall_provider.py b/tests/core/test_recall_provider.py index 206fec6..1377edc 100644 --- a/tests/core/test_recall_provider.py +++ b/tests/core/test_recall_provider.py @@ -36,7 +36,6 @@ async def test_build_recall_block_includes_open_todos_and_facts(): query=RecallQuery(text="retriever"), open_todos=[("prior session", ["wire the retriever"])], budget_tokens=1000, - store_path="/tmp/x/memory", ) assert "use the lexical retriever" in block assert "wire the retriever" in block @@ -49,7 +48,6 @@ async def test_build_recall_block_empty_when_nothing(): query=RecallQuery(text="x"), open_todos=[], budget_tokens=1000, - store_path="/tmp/x/memory", ) assert block == "" @@ -60,7 +58,6 @@ async def test_build_recall_block_open_todos_only_does_not_suggest_missing_files query=RecallQuery(text="x"), open_todos=[("prior", ["finish review"])], budget_tokens=1000, - store_path="/tmp/x/memory", ) assert "finish review" in block assert "/tmp/x/memory" not in block From c4f78e7e4cba2e46ed42f622a51a0e0914408192 Mon Sep 17 00:00:00 2001 From: mohamed-elkholy95 Date: Sat, 30 May 2026 23:00:11 -0400 Subject: [PATCH 4/8] fix(shell): echo resolved prompt text --- src/pythinker_code/ui/shell/__init__.py | 4 ++-- src/pythinker_code/ui/shell/echo.py | 2 +- src/pythinker_code/ui/shell/prompt.py | 6 +++-- .../ui/shell/visualize/_interactive.py | 4 ++-- tests/ui_and_conv/test_shell_prompt_echo.py | 4 ++-- .../test_shell_run_placeholders.py | 6 ++--- .../test_visualize_running_prompt.py | 23 +++++++++++++++++++ 7 files changed, 37 insertions(+), 12 deletions(-) diff --git a/src/pythinker_code/ui/shell/__init__.py b/src/pythinker_code/ui/shell/__init__.py index 1e76db4..d8634bf 100644 --- a/src/pythinker_code/ui/shell/__init__.py +++ b/src/pythinker_code/ui/shell/__init__.py @@ -478,7 +478,7 @@ def _should_echo_agent_input(self, user_input: UserInput) -> bool: @staticmethod def _echo_agent_input(user_input: UserInput) -> None: - console.print(render_user_echo_text(user_input.command)) + console.print(render_user_echo_text(user_input.resolved_command)) def _bind_running_input( self, @@ -1198,7 +1198,7 @@ def _on_view_ready(view: Any) -> None: if not pending: break queued = pending.pop(0) - console.print(render_user_echo_text(queued.command)) + console.print(render_user_echo_text(queued.resolved_command)) await run_soul( self.soul, queued.content, diff --git a/src/pythinker_code/ui/shell/echo.py b/src/pythinker_code/ui/shell/echo.py index aab02db..b425774 100644 --- a/src/pythinker_code/ui/shell/echo.py +++ b/src/pythinker_code/ui/shell/echo.py @@ -19,5 +19,5 @@ def render_user_echo(message: Message) -> RenderableType: def render_user_echo_text(text: str) -> RenderableType: - """Render the local prompt text exactly as the user saw it in the buffer.""" + """Render submitted local prompt text in the transcript.""" return Text(f"{PROMPT_SYMBOL_AGENT_INPUT} {text}") diff --git a/src/pythinker_code/ui/shell/prompt.py b/src/pythinker_code/ui/shell/prompt.py index 6d72fce..d0b63e4 100644 --- a/src/pythinker_code/ui/shell/prompt.py +++ b/src/pythinker_code/ui/shell/prompt.py @@ -2558,10 +2558,12 @@ def _render_background_todo_rows(self, columns: int) -> FormattedText: fragments: FormattedText = FormattedText() visible = todos[:5] hidden = todos[5:] + first_prefix = f" {TRANSCRIPT_TOOL_GUTTER} " + continuation_prefix = " " * _display_width(first_prefix) for index, todo in enumerate(visible): if fragments: fragments.append(("", "\n")) - prefix = f" {TRANSCRIPT_TOOL_GUTTER} " if index == 0 else " " + prefix = first_prefix if index == 0 else continuation_prefix if todo.status == "done": icon = "✔" icon_style = success_style @@ -2591,7 +2593,7 @@ def _render_background_todo_rows(self, columns: int) -> FormattedText: if hidden_done == len(hidden) else "more" ) - fragments.append((muted_style, f" … +{len(hidden)} {label}")) + fragments.append((muted_style, f"{continuation_prefix}… +{len(hidden)} {label}")) return fragments def _render_background_working_status( diff --git a/src/pythinker_code/ui/shell/visualize/_interactive.py b/src/pythinker_code/ui/shell/visualize/_interactive.py index ec1c3b1..99235e4 100644 --- a/src/pythinker_code/ui/shell/visualize/_interactive.py +++ b/src/pythinker_code/ui/shell/visualize/_interactive.py @@ -341,8 +341,8 @@ def handle_immediate_steer(self, user_input: UserInput) -> None: duration=3.0, ) return - # Print permanently in conversation flow (shows placeholder for pasted text) - console.print(render_user_echo_text(user_input.command)) + # Print permanently in conversation flow with UI-only text placeholders expanded. + console.print(render_user_echo_text(user_input.resolved_command)) from pythinker_code.telemetry import track track("input_steer") diff --git a/tests/ui_and_conv/test_shell_prompt_echo.py b/tests/ui_and_conv/test_shell_prompt_echo.py index 16877a8..1532ccc 100644 --- a/tests/ui_and_conv/test_shell_prompt_echo.py +++ b/tests/ui_and_conv/test_shell_prompt_echo.py @@ -61,7 +61,7 @@ def test_echo_agent_input_prints_stringified_user_message(monkeypatch) -> None: assert [text.plain for text in printed] == ["❯ hi"] -def test_echo_agent_input_uses_display_command_for_placeholders(monkeypatch) -> None: +def test_echo_agent_input_uses_resolved_command_for_placeholders(monkeypatch) -> None: printed: list[Text] = [] monkeypatch.setattr(shell_module.console, "print", lambda text: printed.append(text)) @@ -74,7 +74,7 @@ def test_echo_agent_input_uses_display_command_for_placeholders(monkeypatch) -> Shell._echo_agent_input(user_input) - assert [text.plain for text in printed] == ["❯ [Pasted text #1 +3 lines]"] + assert [text.plain for text in printed] == ["❯ line1\nline2\nline3"] def test_render_user_echo_preserves_literal_brackets() -> None: diff --git a/tests/ui_and_conv/test_shell_run_placeholders.py b/tests/ui_and_conv/test_shell_run_placeholders.py index 081f15d..543eb79 100644 --- a/tests/ui_and_conv/test_shell_run_placeholders.py +++ b/tests/ui_and_conv/test_shell_run_placeholders.py @@ -152,7 +152,7 @@ async def test_shell_run_treats_hidden_slash_in_placeholder_as_regular_agent_inp assert _FakePromptSession.instances[0].prompt_calls == 2 shell.run_soul_command.assert_awaited_once_with([TextPart(text="/quit\nstill send this")]) shell._run_slash_command.assert_not_awaited() - assert printed == ["❯ [Pasted text #1 +3 lines]", "", "Bye!"] + assert printed == ["❯ /quit\nstill send this", "", "Bye!"] @pytest.mark.asyncio @@ -221,7 +221,7 @@ async def test_shell_run_echoes_visible_skill_slash_with_placeholder_before_disp assert _FakePromptSession.instances[0].prompt_calls == 2 shell.run_soul_command.assert_awaited_once_with("/skill:demo line1\nline2\nline3") shell._run_slash_command.assert_not_awaited() - assert printed == ["❯ /skill:demo [Pasted text #1 +3 lines]", "", "Bye!"] + assert printed == ["❯ /skill:demo line1\nline2\nline3", "", "Bye!"] @pytest.mark.asyncio @@ -256,7 +256,7 @@ async def test_shell_run_echoes_visible_flow_slash_with_placeholder_before_dispa assert _FakePromptSession.instances[0].prompt_calls == 2 shell.run_soul_command.assert_awaited_once_with("/flow:demo line1\nline2\nline3") shell._run_slash_command.assert_not_awaited() - assert printed == ["❯ /flow:demo [Pasted text #1 +3 lines]", "", "Bye!"] + assert printed == ["❯ /flow:demo line1\nline2\nline3", "", "Bye!"] @pytest.mark.asyncio diff --git a/tests/ui_and_conv/test_visualize_running_prompt.py b/tests/ui_and_conv/test_visualize_running_prompt.py index 0a17459..7d32e3f 100644 --- a/tests/ui_and_conv/test_visualize_running_prompt.py +++ b/tests/ui_and_conv/test_visualize_running_prompt.py @@ -298,6 +298,29 @@ def test_prompt_status_keeps_todos_visible_during_background_tasks() -> None: assert "◻ Code quality review" in text +def test_prompt_background_todo_rows_align_icons_and_titles() -> None: + session = object.__new__(CustomPromptSession) + session._latest_todos = ( + TodoDisplayItem( + title="Launch parallel deep scan agents (overengineering, simplicity, architecture, bug hunt)", + status="in_progress", + ), + TodoDisplayItem(title="Cross-validate findings against live code", status="pending"), + TodoDisplayItem( + title="Synthesize consolidated findings report with fixes", status="pending" + ), + ) + + rendered = CustomPromptSession._render_background_todo_rows(session, 120) + lines = "".join(item[1] for item in rendered).splitlines() + + assert lines[0].startswith(" ⎿ ◼ ") + assert lines[1].startswith(" ◻ ") + assert lines[2].startswith(" ◻ ") + assert lines[1].index("◻") == lines[0].index("◼") + assert lines[2].index("Synthesize") == lines[0].index("Launch") + + def test_background_status_drops_verb_when_working_indicator_pinned() -> None: """During an active turn the pinned working indicator owns the verb, so the background-task line must show the count *without* repeating it.""" From 723fe11fd7a7619afacb7612a13724e7e7d05113 Mon Sep 17 00:00:00 2001 From: mohamed-elkholy95 Date: Sat, 30 May 2026 23:00:14 -0400 Subject: [PATCH 5/8] docs(agents): discourage over-fragmentation --- src/pythinker_code/agents/default/system.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pythinker_code/agents/default/system.md b/src/pythinker_code/agents/default/system.md index 6519d45..ca828cf 100644 --- a/src/pythinker_code/agents/default/system.md +++ b/src/pythinker_code/agents/default/system.md @@ -77,7 +77,8 @@ These principles govern every engineering response. They override speed: a slow - No "flexibility" or "configurability" that wasn't requested. - No error handling for impossible scenarios; validate at boundaries only. - If a 200-line draft could be 50 lines, rewrite it before showing it. -- Self-check: *"Would a senior engineer call this overcomplicated?"* If yes, simplify. +- Over-fragmentation is overcomplication too: don't scatter logic across many tiny files or extra abstraction layers to satisfy a design pattern. Match the codebase's existing granularity. +- Self-check: *"Would a senior engineer call this overcomplicated or over-engineered?"* If yes, simplify. **3. Goal-driven execution — define success criteria, then loop until verified.** - Transform vague tasks into verifiable goals before writing code: From a4d13845b6dbdb27f358fb75ce2672f9cfb740dc Mon Sep 17 00:00:00 2001 From: mohamed-elkholy95 Date: Sat, 30 May 2026 23:00:17 -0400 Subject: [PATCH 6/8] fix(update): revalidate stale startup cache --- .github/workflows/promote-release.yml | 102 ++++++++++++++++++------- src/pythinker_code/ui/shell/update.py | 30 ++++++-- tests/ui_and_conv/test_shell_update.py | 47 ++++++++++++ 3 files changed, 144 insertions(+), 35 deletions(-) diff --git a/.github/workflows/promote-release.yml b/.github/workflows/promote-release.yml index d0ddc29..f59e6f8 100644 --- a/.github/workflows/promote-release.yml +++ b/.github/workflows/promote-release.yml @@ -3,9 +3,10 @@ name: Promote release # Platform build workflows (windows-installer, linux-installer, # release-pythinker-cli) create the GitHub Release as a PRERELEASE so it stays # out of the date-based /releases/latest endpoint (which ignores make_latest) -# until every asset has uploaded. This workflow waits for all expected assets, -# then clears `prerelease` and marks the release latest — the single point -# where a version becomes resolvable by the install scripts and in-app updater. +# until every install channel is ready. This workflow waits for exact release +# assets, PyPI, and the Homebrew formula, then clears `prerelease` and marks the +# release latest — the single point where a version becomes resolvable by the +# install scripts and in-app updater. # # It runs on the tag push (not `release: published`, which a GITHUB_TOKEN-created # release never fires) so promotion always happens. workflow_dispatch allows a @@ -55,24 +56,44 @@ jobs: fi echo "tag=$tag" >> "$GITHUB_OUTPUT" - - name: Wait for all release assets + - name: Wait for install-channel readiness env: GH_TOKEN: ${{ github.token }} TAG: ${{ steps.tag.outputs.tag }} REPO: ${{ github.repository }} run: | set -euo pipefail - required=( - "PythinkerSetup-" - "_amd64.deb" - "_arm64.deb" - ".x86_64.rpm" - ".aarch64.rpm" - "x86_64-unknown-linux-gnu.tar.gz" - "aarch64-unknown-linux-gnu.tar.gz" - "aarch64-apple-darwin.tar.gz" - "x86_64-apple-darwin.tar.gz" + version="${TAG#v}" + required_assets=( + "PythinkerSetup-${version}.exe" + "PythinkerSetup-${version}.exe.sha256" + "pythinker-code_${version}_amd64.deb" + "pythinker-code_${version}_amd64.deb.sha256" + "pythinker-code_${version}_arm64.deb" + "pythinker-code_${version}_arm64.deb.sha256" + "pythinker-code-${version}.x86_64.rpm" + "pythinker-code-${version}.x86_64.rpm.sha256" + "pythinker-code-${version}.aarch64.rpm" + "pythinker-code-${version}.aarch64.rpm.sha256" + "pythinker-${version}-x86_64-unknown-linux-gnu.tar.gz" + "pythinker-${version}-x86_64-unknown-linux-gnu.tar.gz.sha256" + "pythinker-${version}-aarch64-unknown-linux-gnu.tar.gz" + "pythinker-${version}-aarch64-unknown-linux-gnu.tar.gz.sha256" + "pythinker-${version}-aarch64-apple-darwin.tar.gz" + "pythinker-${version}-aarch64-apple-darwin.tar.gz.sha256" + "pythinker-${version}-x86_64-apple-darwin.tar.gz" + "pythinker-${version}-x86_64-apple-darwin.tar.gz.sha256" + "pythinker-${version}-x86_64-unknown-linux-gnu-onedir.tar.gz" + "pythinker-${version}-x86_64-unknown-linux-gnu-onedir.tar.gz.sha256" + "pythinker-${version}-aarch64-unknown-linux-gnu-onedir.tar.gz" + "pythinker-${version}-aarch64-unknown-linux-gnu-onedir.tar.gz.sha256" + "pythinker-${version}-aarch64-apple-darwin-onedir.tar.gz" + "pythinker-${version}-aarch64-apple-darwin-onedir.tar.gz.sha256" + "pythinker-${version}-x86_64-apple-darwin-onedir.tar.gz" + "pythinker-${version}-x86_64-apple-darwin-onedir.tar.gz.sha256" ) + pypi_url="https://pypi.org/pypi/pythinker-code/${version}/json" + homebrew_formula_url="https://raw.githubusercontent.com/TechMatrix-labs/homebrew-pythinker/main/Formula/pythinker-code.rb" # The budget must comfortably exceed the slowest platform build, since # this job runs on the tag push in parallel with them. The long pole is # linux-installer's emulated arm64 .deb/.rpm step: on the 0.26.0 release @@ -83,26 +104,51 @@ jobs: max_attempts=80 poll_interval=30 budget_min=$(( max_attempts * poll_interval / 60 )) - echo "Polling for all release assets on $TAG (up to ${budget_min}m)..." - all_present=false + echo "Polling install-channel readiness for $TAG (up to ${budget_min}m)..." + all_ready=false for i in $(seq 1 "$max_attempts"); do - assets=$(gh api "repos/$REPO/releases/tags/$TAG" --jq '[.assets[].name] | join(" ")' 2>/dev/null || echo "") - all_present=true - for p in "${required[@]}"; do - if [[ "$assets" != *"$p"* ]]; then - all_present=false - break + assets_json=$(gh api "repos/$REPO/releases/tags/$TAG" --jq '[.assets[].name]' 2>/dev/null || printf '[]') + missing_assets=() + for asset in "${required_assets[@]}"; do + if ! jq -e --arg name "$asset" 'index($name)' <<<"$assets_json" >/dev/null; then + missing_assets+=("$asset") fi done - if [[ "$all_present" == "true" ]]; then - echo "All assets present (attempt $i)" + + pypi_ready=false + if curl -fsSL --retry 2 --retry-delay 2 -o /dev/null "$pypi_url"; then + pypi_ready=true + fi + + homebrew_ready=false + formula_text=$(curl -fsSL --retry 2 --retry-delay 2 "$homebrew_formula_url" 2>/dev/null || true) + if grep -qF "version \"${version}\"" <<<"$formula_text"; then + homebrew_ready=true + fi + + if [[ "${#missing_assets[@]}" -eq 0 && "$pypi_ready" == "true" && "$homebrew_ready" == "true" ]]; then + all_ready=true + echo "All install channels ready (attempt $i)" break fi - echo "Attempt $i/$max_attempts: assets not ready, retrying in ${poll_interval}s..." - sleep "$poll_interval" + + echo "Attempt $i/$max_attempts: install channels not ready." + if [[ "${#missing_assets[@]}" -gt 0 ]]; then + printf 'Missing release assets: %s\n' "${missing_assets[*]}" + fi + if [[ "$pypi_ready" != "true" ]]; then + echo "PyPI is not serving ${version} yet: $pypi_url" + fi + if [[ "$homebrew_ready" != "true" ]]; then + echo "Homebrew tap formula is not at ${version} yet: $homebrew_formula_url" + fi + if [[ "$i" -lt "$max_attempts" ]]; then + echo "Retrying in ${poll_interval}s..." + sleep "$poll_interval" + fi done - if [[ "$all_present" != "true" ]]; then - echo "::error::Release assets not fully uploaded after ${budget_min} minutes" + if [[ "$all_ready" != "true" ]]; then + echo "::error::Install channels were not fully ready after ${budget_min} minutes" exit 1 fi diff --git a/src/pythinker_code/ui/shell/update.py b/src/pythinker_code/ui/shell/update.py index f9c2c35..a0203b9 100644 --- a/src/pythinker_code/ui/shell/update.py +++ b/src/pythinker_code/ui/shell/update.py @@ -52,6 +52,7 @@ LAST_UPDATE_CHECK_FILE = get_share_dir() / "last_update_check.txt" DISMISSED_VERSION_FILE = get_share_dir() / "dismissed_update_version.txt" AUTO_UPDATE_CHECK_INTERVAL_SECONDS = 24 * 60 * 60 +PROMPT_UPDATE_REFRESH_TIMEOUT_SECONDS = 2.0 _UPDATE_LOCK = asyncio.Lock() _skipped_version_this_session: str | None = None @@ -404,17 +405,32 @@ async def _refresh_update_cache(*, force: bool) -> UpdateResult | None: return result +async def _refresh_update_cache_for_prompt(*, force: bool) -> UpdateResult | None: + """Refresh the startup-prompt version cache without hanging shell startup.""" + try: + return await asyncio.wait_for( + _refresh_update_cache(force=force), + timeout=PROMPT_UPDATE_REFRESH_TIMEOUT_SECONDS, + ) + except TimeoutError: + logger.warning("Update prompt refresh timed out; using cached latest version") + return None + + async def _resolve_latest_version_for_prompt(*, force_refresh: bool = False) -> str | None: - """Return the latest known native release, refreshing when due or uncached. + """Return the latest known native release for the pre-start prompt. - Used by the blocking pre-start prompt and by the explicit ``/update`` flow. - A missing cache forces one network check so fresh installs can be prompted - before the agent starts instead of seeing only a later background notice. - ``/update`` passes ``force_refresh=True`` so explicit user checks hit the - release API even when the startup throttle is not due. + The background notifier uses a 24h throttle, but the blocking startup prompt + must not trust a cached "already current" answer forever: a release can land + minutes after the last successful check. Revalidate missing/stale caches with + a short timeout and GitHub's cached ETag; keep the throttle only when the + cached version is already newer than the running version. """ + from pythinker_code.constant import VERSION as current_version + cached = _read_latest_version_cache() - refresh_result = await _refresh_update_cache(force=force_refresh or not cached) + cached_is_stale = cached is None or semver_tuple(cached) <= semver_tuple(current_version) + refresh_result = await _refresh_update_cache_for_prompt(force=force_refresh or cached_is_stale) if refresh_result is not None: return _read_latest_version_cache() or cached return cached diff --git a/tests/ui_and_conv/test_shell_update.py b/tests/ui_and_conv/test_shell_update.py index e510474..bf018b9 100644 --- a/tests/ui_and_conv/test_shell_update.py +++ b/tests/ui_and_conv/test_shell_update.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio from pathlib import Path from types import SimpleNamespace @@ -319,6 +320,52 @@ async def fail_do_update(*, print: bool, check_only: bool) -> update.UpdateResul assert await update._resolve_latest_version_for_prompt() == "3.1.0" +@pytest.mark.asyncio +async def test_resolve_latest_version_revalidates_stale_cache_when_not_due(monkeypatch, tmp_path): + latest_file = tmp_path / "latest.txt" + latest_file.write_text("0.0.0", encoding="utf-8") + last_check_file = tmp_path / "last_update_check.txt" + calls: list[tuple[bool, bool]] = [] + + async def fake_do_update(*, print: bool, check_only: bool) -> update.UpdateResult: + calls.append((print, check_only)) + latest_file.write_text("999.0.0", encoding="utf-8") + return update.UpdateResult.UPDATE_AVAILABLE + + def fail_should_auto_check() -> bool: + raise AssertionError("stale prompt cache must bypass the 24h throttle") + + monkeypatch.setattr(update, "LATEST_VERSION_FILE", latest_file) + monkeypatch.setattr(update, "LAST_UPDATE_CHECK_FILE", last_check_file) + monkeypatch.setattr(update, "_should_auto_check_for_updates", fail_should_auto_check) + monkeypatch.setattr(update, "do_update", fake_do_update) + + assert await update._resolve_latest_version_for_prompt() == "999.0.0" + assert calls == [(False, True)] + assert last_check_file.exists() + + +@pytest.mark.asyncio +async def test_resolve_latest_version_keeps_cache_when_prompt_refresh_times_out( + monkeypatch, tmp_path +): + latest_file = tmp_path / "latest.txt" + latest_file.write_text("0.0.0", encoding="utf-8") + calls: list[bool] = [] + + async def slow_refresh(*, force: bool) -> update.UpdateResult: + calls.append(force) + await asyncio.sleep(60) + return update.UpdateResult.UPDATE_AVAILABLE + + monkeypatch.setattr(update, "LATEST_VERSION_FILE", latest_file) + monkeypatch.setattr(update, "PROMPT_UPDATE_REFRESH_TIMEOUT_SECONDS", 0.01) + monkeypatch.setattr(update, "_refresh_update_cache", slow_refresh) + + assert await update._resolve_latest_version_for_prompt() == "0.0.0" + assert calls == [True] + + @pytest.mark.asyncio async def test_refresh_cache_does_not_throttle_on_failure(monkeypatch, tmp_path): """A failed check must not mark the throttle file. From 1c285ddb27235364249ab569beb7500b2becbd00 Mon Sep 17 00:00:00 2001 From: mohamed-elkholy95 Date: Sat, 30 May 2026 23:00:25 -0400 Subject: [PATCH 7/8] docs: document reliability fixes --- .pythinker/reports/arch-review-2026-05-30.md | 319 +++++++++++++ .pythinker/reports/deep-code-scan-findings.md | 426 ++++++++++++++++++ CHANGELOG.md | 15 + 3 files changed, 760 insertions(+) create mode 100644 .pythinker/reports/arch-review-2026-05-30.md create mode 100644 .pythinker/reports/deep-code-scan-findings.md diff --git a/.pythinker/reports/arch-review-2026-05-30.md b/.pythinker/reports/arch-review-2026-05-30.md new file mode 100644 index 0000000..e9e4cb0 --- /dev/null +++ b/.pythinker/reports/arch-review-2026-05-30.md @@ -0,0 +1,319 @@ +# Validated Architectural Review Report + +**Review scope:** `src/pythinker_code/background/`, `src/pythinker_code/memory/`, and `src/pythinker_code/auth/opencode_go.py` +**Validation date:** 2026-05-30 +**Validated against:** current working tree at `d3bf815627fdcfcf87f1f8abbabbb2d6749419a0` +**Status:** findings re-validated against the live tree (tests re-run, usages re-grepped). Each finding now carries a concrete, behavior-preserving fix. + +--- + +## Summary + +The original draft mixed valid cleanup opportunities with stale or incorrect findings. This version keeps only findings that still match the current codebase, and pairs each with the most robust fix that preserves existing behavior. + +**Current actionable findings:** + +1. `memory/retriever.py` still has an unused abstraction seam around `Retriever` / `LexicalRetriever` / `SqliteFts5Retriever`. +2. `background/manager.py` still duplicates locked task-runtime status mutation logic across six `_mark_task_*` methods. +3. `background/manager.py` still reaches into private store locking/write helpers where the public `update_runtime()` API can cover most cases. +4. `memory/recall.py` still passes a `store_path` argument that is immediately discarded. +5. `memory/consolidation.py` still calls the private `ProjectMemoryStore._ensure_dir()` method. + +**Optional cleanups:** + +- `Counter(doc)` can replace the manual term-frequency loop in `memory/retriever.py`. +- OpenCode Go response parsing can be clarified, but any Pydantic refactor must preserve current tolerant parsing behavior. + +**Corroborating evidence gathered during re-validation:** + +- The retriever seam (Finding 1) has **zero production references** — `SqliteFts5Retriever` and `sqlite_fts5_available` appear only in `retriever_sqlite.py` itself and one test; the only retriever used in the app path is `LexicalRetriever`. +- The migration target in Finding 3, `BackgroundTaskStore.update_runtime()`, is **already the established pattern** in this module (`manager.py:277`, `manager.py:659`, `worker.py:276`). The six `_mark_task_*` methods and the stale-recovery block are the remaining holdouts. +- `update_runtime_under_lock()` does **not** exist anywhere in the tree — no new lock-exposing API is needed. + +The stale and incorrect draft entries have been removed; the sections below list only current findings, robust fixes, and optional cleanups that still match the codebase. + +--- + +## Recommended fix order + +The findings interact. Applying them in this order avoids rework and keeps each diff behavior-preserving: + +1. **Finding 4** — drop the dead `store_path` parameter. This removes the `_ensure_dir()` call at `recall.py:252`. +2. **Finding 5** — add the public `ensure_root()` alias. After step 1, `consolidation.py:36` is the only external `_ensure_dir()` caller, so this isolates the change. +3. **Finding 2** — extract the status-transition helper. Route it through `update_runtime()`. +4. **Finding 3** — step 3 already eliminates the private `_runtime_lock` / `_write_runtime_unlocked` usage in the six mark-methods (`manager.py:821-912`). Only the stale-recovery block (`manager.py:598-626`) remains; decide its treatment per Finding 3 below. +5. **Finding 1** — delete the unused retriever seam. +6. **Optional A / B** — apply if desired; B is "leave as-is" by default. + +--- + +## Validated Findings + +### 1. Unused retriever abstraction and SQLite seam + +**Files:** + +- `src/pythinker_code/memory/retriever.py:7,49-54` +- `src/pythinker_code/memory/retriever_sqlite.py` (entire file) + +**Severity:** Low +**Category:** Overengineering / YAGNI + +`Retriever` is an abstract base class with no production polymorphic dispatch. `LexicalRetriever` is the real implementation used by recall (`recall.py:13,94`), while `SqliteFts5Retriever` is a capability seam that delegates directly to `LexicalRetriever`. + +Re-validation tightened this: `SqliteFts5Retriever` and `sqlite_fts5_available` have **no production callers at all** — they are referenced only inside `retriever_sqlite.py` and a single test (`tests/core/test_memory_phase_bcd.py:18,158`, `test_sqlite_retriever_falls_back_to_lexical`). There is no `memory/__init__.py` re-exporting these symbols, so nothing outside the package depends on them. + +**Robust fix:** + +Delete the dead seam outright — it is safe because there is no public export and no production dispatch: + +- Delete `src/pythinker_code/memory/retriever_sqlite.py`. +- In `retriever.py`, remove the `Retriever` ABC (lines 49-51) and make `LexicalRetriever` a plain class (`class LexicalRetriever:`). Remove the now-orphaned `from abc import ABC, abstractmethod` import (line 7). +- Keep the name `LexicalRetriever` — it is imported at `recall.py:13` and used at `recall.py:94`; renaming is pure churn for no behavioral gain. +- Delete the test `test_sqlite_retriever_falls_back_to_lexical` and its import at `tests/core/test_memory_phase_bcd.py:18`. (The fallback it asserts no longer exists once the wrapper is gone.) + +Do **not** reintroduce an FTS5 abstraction until there is a measured indexed implementation and a real dispatch path. If a future plugin contract genuinely needs polymorphism, add the protocol back at that point with a concrete second implementation — not before. + +--- + +### 2. Duplicated background task status mutation methods + +**File:** `src/pythinker_code/background/manager.py:821-922` +**Severity:** Medium +**Category:** Duplication / maintainability + +There are six near-identical status mutation methods, each performing the same locked read-check-mutate-write sequence: + +- `_mark_task_running` +- `_mark_task_awaiting_approval` +- `_mark_task_completed` +- `_mark_task_failed` +- `_mark_task_timed_out` +- `_mark_task_killed` + +**Semantics that must be preserved exactly:** + +- terminal states must not be overwritten (early no-op); +- `updated_at` is set to now on every applied transition; +- `_mark_task_running()` sets `heartbeat_at = updated_at` and clears `failure_reason`; +- `completed` clears `failure_reason`; `completed`/`failed`/`timed_out`/`killed` set `finished_at = updated_at`; `running`/`awaiting_approval` do **not** set `finished_at`; +- `timed_out` sets both `interrupted` and `timed_out`; `killed` sets `interrupted`; +- telemetry fires **only** when the transition was actually applied (not on the terminal no-op), guarded by `started_at and finished_at`, with reason labels unchanged: `completed` → `success=True` (no reason); `failed` → `reason="error"`; `timed_out` → `reason="timeout"`; `killed` → `reason="killed"`; `running`/`awaiting_approval` → no telemetry. + +**Robust fix:** + +Extract one private helper that owns the locked, terminal-guarded write via the **public** `update_runtime()` API, and signals whether the transition was applied so telemetry stays in the callers: + +```python +def _transition_status( + self, + task_id: str, + *, + mutate: Callable[[TaskRuntime], None], +) -> TaskRuntime | None: + """Locked, terminal-guarded status write via the public store API. + + Stamps ``updated_at`` then applies ``mutate``. Returns the resulting + runtime when the transition was applied, or ``None`` when the task was + already terminal (no write performed, so callers skip telemetry). + """ + applied = False + + def _apply(runtime: TaskRuntime) -> bool: + nonlocal applied + if is_terminal_status(runtime.status): + return False + runtime.updated_at = time.time() + mutate(runtime) + applied = True + return True + + runtime = self._store.update_runtime(task_id, _apply) + return runtime if applied else None +``` + +Each mark-method becomes a thin mutator plus its own telemetry. Two representative cases: + +```python +def _mark_task_running(self, task_id: str) -> None: + def mutate(r: TaskRuntime) -> None: + r.status = "running" + r.heartbeat_at = r.updated_at + r.failure_reason = None + + self._transition_status(task_id, mutate=mutate) + +def _mark_task_completed(self, task_id: str) -> None: + def mutate(r: TaskRuntime) -> None: + r.status = "completed" + r.finished_at = r.updated_at + r.failure_reason = None + + runtime = self._transition_status(task_id, mutate=mutate) + if runtime and runtime.started_at and runtime.finished_at: + from pythinker_code.telemetry import track + + track( + "background_task_completed", + success=True, + duration_s=runtime.finished_at - runtime.started_at, + ) +``` + +`_mark_task_failed` / `_mark_task_timed_out` / `_mark_task_killed` follow the same shape, each setting its status, `finished_at`, flags, and `failure_reason` in `mutate`, then emitting telemetry with its own `reason` label guarded by `if runtime and runtime.started_at and runtime.finished_at`. `_mark_task_awaiting_approval` uses only `mutate` with no telemetry. + +This collapses six bodies to one shared critical section while keeping each method's distinct mutation and telemetry explicit. Ensure `Callable` is imported (`from collections.abc import Callable`) if it is not already. + +--- + +### 3. Manager still uses private store lock/write internals + +**Files:** + +- `src/pythinker_code/background/manager.py:598-626` (stale-task recovery) +- `src/pythinker_code/background/manager.py:821-912` (the six mark-methods) +- `src/pythinker_code/background/store.py:143-153` (`update_runtime`) + +**Severity:** Medium +**Category:** Architectural coupling + +`BackgroundTaskManager` calls private store internals directly — `self._store._runtime_lock(...)` and `self._store._write_runtime_unlocked(...)` — in both the mark-methods and the stale-recovery block. The store already exposes `update_runtime(task_id, update_fn)`, which performs the locked read-modify-write and returns the resulting runtime, and which the manager **already uses** at `manager.py:277` and `manager.py:659` (and the worker at `worker.py:276`). No `update_runtime_under_lock()` API is needed and none exists. + +**Robust fix:** + +- **Six mark-methods (`821-912`):** resolved for free by Finding 2 — routing `_transition_status` through `update_runtime()` removes every private `_runtime_lock` / `_write_runtime_unlocked` call in these methods. +- **Stale-recovery block (`598-626`):** this block reads *both* `read_runtime` and `read_control` under the same lock and branches on `fresh_control.kill_requested_at` (`manager.py:600,615`). `update_runtime`'s callback is only *passed* the runtime, but it is a closure over `self._store`, so it can call `self._store.read_control(view.spec.id)` itself — that read executes inside the same `_runtime_lock` and does not deadlock, because `read_runtime`/`read_control` are lock-free (`store.py` takes `_runtime_lock` only in `write_runtime`/`update_runtime`; the current block already calls `read_control` while holding the lock). So this path *can* migrate with **no new store API**. Two reasonable options: + 1. **Leave it as-is.** The block is a single, well-commented critical section (`manager.py:595-597` explains *why* the lock spans the read and the write). Holding `_runtime_lock` here is correct and the private access is localized. Most surgical, lowest-risk — the default recommendation. + 2. **Migrate via a closure that reads control in-callback.** Move the body into a `recover_stale(runtime) -> bool` closure passed to `update_runtime()`; the early-outs (terminal / not-yet-stale) become `return False`, replacing today's `continue`. The surrounding per-view loop is unchanged — each iteration calls `update_runtime(view.spec.id, recover_stale)`: + + ```python + def recover_stale(runtime: TaskRuntime) -> bool: + if is_terminal_status(runtime.status): + return False + progress = ( + runtime.heartbeat_at or runtime.started_at + or runtime.updated_at or view.spec.created_at + ) + if now - progress <= stale_after: + return False + control = self._store.read_control(view.spec.id) # inside the lock via the closure + heartbeat_missing = runtime.heartbeat_at is None + runtime.finished_at = now + runtime.updated_at = now + if control.kill_requested_at is not None: + runtime.status = "killed" + runtime.interrupted = True + runtime.failure_reason = control.kill_reason or "Killed during recovery" + else: + runtime.status = "lost" + runtime.failure_reason = ( + "Background worker never heartbeat after startup" + if heartbeat_missing + else "Background worker heartbeat expired" + ) + return True + + self._store.update_runtime(view.spec.id, recover_stale) + ``` + +Neither option needs a new lock-exposing or recovery-specific store method. Prefer (1) for minimal churn, or (2) if you want zero private-internal access from the manager. + +--- + +### 4. `build_recall_block()` accepts an unused `store_path` + +**File:** `src/pythinker_code/memory/recall.py:86-97,252-258` +**Severity:** Low +**Category:** Dead parameter / unnecessary private access + +`build_recall_block()` accepts `store_path`, then immediately discards it with `_ = store_path` (`recall.py:97`). The call site computes that value through `self._store._ensure_dir()` (`recall.py:252`) purely to feed the dead parameter. `build_recall_block` is called only at `recall.py:253` and three tests, so the signature is safe to change. + +**Robust fix:** + +- Remove the `store_path` parameter from `build_recall_block()` (`recall.py:92`) and delete the `_ = store_path` discard line (`recall.py:97`). +- At the call site, delete `store_root = await self._store._ensure_dir()` (`recall.py:252`) and the `store_path=str(store_root / "memory")` argument (`recall.py:258`). This also removes one private `_ensure_dir()` usage. +- Update the three call sites that pass the kwarg, dropping `store_path=...`: + - `tests/core/test_recall_provider.py:39` (`test_build_recall_block_includes_open_todos_and_facts`) + - `tests/core/test_recall_provider.py:52` (`test_build_recall_block_empty_when_nothing`) + - `tests/core/test_recall_provider.py:63` (`test_build_recall_block_open_todos_only_does_not_suggest_missing_files`) + +The recall output is unchanged — the parameter and its value were never used in the block. The only incidental difference is that recall no longer eagerly creates the store directory via `_ensure_dir()`; the recall read path (candidates are passed in already loaded) does not depend on that side effect. + +--- + +### 5. Memory consolidation calls a private store method + +**File:** `src/pythinker_code/memory/consolidation.py:35-37` +**Severity:** Low +**Category:** Architectural coupling + +`inbox_dir()` calls `ProjectMemoryStore._ensure_dir()` from outside `project_memory.py` and suppresses the private-usage warning (`consolidation.py:36`). After Finding 4 removes the `recall.py:252` usage, `consolidation.py:36` is the **only** external `_ensure_dir()` caller. + +**Robust fix:** + +- Add a public async alias on `ProjectMemoryStore` that delegates to the existing implementation: + + ```python + async def ensure_root(self) -> Path: + """Public entry point for ``_ensure_dir`` used by collaborators.""" + return await self._ensure_dir() + ``` + +- Update `consolidation.py:36` to `root = await store.ensure_root()` and drop the `# pyright: ignore[reportPrivateUsage]` suppression. + +Keep `_ensure_dir()` as the single source of truth; `ensure_root()` is only a public surface so collaborators don't reach into a private method. (Naming note: the background and notifications stores use a private `_ensure_root`; the public `ensure_root` here is intentional and reads cleanly as the published method.) + +--- + +## Optional Cleanups + +### A. Use `collections.Counter` for term frequency + +**File:** `src/pythinker_code/memory/retriever.py:76-78` +**Severity:** Low + +Replace the manual term-frequency loop: + +```python +tf: dict[str, int] = {} +for term in doc: + tf[term] = tf.get(term, 0) + 1 +``` + +with `tf = Counter(doc)` (`from collections import Counter`). This is a drop-in: the scoring loop guards every access with `if term not in tf: continue` before reading `tf[term]`, so `Counter`'s default-zero behavior changes nothing. Readability cleanup only — not a correctness issue. The adjacent document-frequency loop (`retriever.py:67-70`) could likewise use `df.update(set(doc))`, but leave it unless you are already touching that block. + +--- + +### B. OpenCode Go response parsing can be made clearer, but not stricter by accident + +**File:** `src/pythinker_code/auth/opencode_go.py:179-228` +**Severity:** Low/Medium + +The current implementation manually validates `/models` and `models.dev` payloads with `isinstance()` and casts. The behavior is intentionally tolerant: + +- malformed top-level payloads return an empty result (`_extract_model_ids` → `[]`, `_parse_models_dev_metadata` → `{}`); +- malformed list/dict entries are skipped (`continue`) while valid entries are kept; +- models.dev enrichment is best-effort and must not break login. + +**Robust fix: prefer leaving this as-is.** The current `isinstance`/`cast` code already encodes exactly the tolerance required, and a Pydantic rewrite risks silently regressing it for no functional gain. If clarity is the goal, add a short comment documenting the tolerance contract rather than rewriting the parser. + +If a Pydantic refactor is nonetheless mandated, it must preserve those semantics: validate entries **item-by-item** with `model_config = ConfigDict(extra="ignore")`, skipping individual `ValidationError`s, and never reject the whole response because one entry is malformed. Top-level shape mismatch must still yield empty results, and enrichment failure must never propagate into the login path. + +--- + +## Verification Performed + +Targeted validation command (re-run during this pass): + +```bash +uv run pytest \ + tests/background/test_manager.py::test_recover_agent_view_does_not_clobber_terminal_runtime_from_stale_view \ + tests/background/test_manager.py::test_mark_task_completed_is_lock_protected \ + tests/background/test_worker.py::test_worker_completes_successfully \ + tests/core/test_memory_phase_bcd.py::test_sqlite_retriever_falls_back_to_lexical \ + tests/core/test_recall_provider.py::test_build_recall_block_includes_open_todos_and_facts +``` + +Result: **5 passed** (re-confirmed 2026-05-30, `0.08s`). + +Note: after applying Finding 1 (delete the retriever seam) and Finding 4 (drop `store_path`), two of these tests change by design — `test_sqlite_retriever_falls_back_to_lexical` is deleted with the seam, and `test_build_recall_block_includes_open_todos_and_facts` drops its `store_path` kwarg. Re-run the full `tests/background` and `tests/core` suites after each fix to confirm no behavioral regressions. diff --git a/.pythinker/reports/deep-code-scan-findings.md b/.pythinker/reports/deep-code-scan-findings.md new file mode 100644 index 0000000..3308f05 --- /dev/null +++ b/.pythinker/reports/deep-code-scan-findings.md @@ -0,0 +1,426 @@ +# Deep Code Scan: Architectural Criticism & Bug Analysis + +**Date:** 2026-05-30 +**Scope:** Recent changes across auth, background tasks, memory, soul loop, file tools, and UI components +**Analyzed by:** 4 parallel agents (architecture scout, bug hunter, simplicity reviewer, overengineering scanner) + +--- + +## Executive Summary + +Found **1 critical bug** (race condition in background task recovery), **3 high-severity architectural issues** (over-engineering, duplication, fragility), and **5 medium-severity elegance violations**. The codebase shows signs of defensive over-engineering and speculative abstractions that could be simplified. + +--- + +## CRITICAL: The Known Bug + +### Race Condition in Background Task Recovery + +**Location:** `src/pythinker_code/background/manager.py:628-656` in `_recover_agent_view()` + +**The Problem:** The method reads the task runtime **without holding the lock**, then later writes a "recoverable" status **with the lock**. Between the read and write, the task could complete normally, causing the authoritative "completed" status to be overwritten with "recoverable". + +**Root Cause:** +```python +def _recover_agent_view(self, view: TaskView, *, now: float, live_agent_ids: set[str]) -> None: + runtime_status: TaskStatus = view.runtime.status # ← READ WITHOUT LOCK + if not is_terminal_status(runtime_status): + if view.spec.id in self._live_agent_tasks: + return + runtime = view.runtime.model_copy() + runtime.status = "recoverable" if agent_id is not None else "lost" + self._store.write_runtime(view.spec.id, runtime) # ← WRITE WITH LOCK +``` + +The safety check in `_write_runtime_unlocked` only prevents overwriting a terminal status with a non-terminal one: +```python +if is_terminal_status(current.status) and not is_terminal_status(runtime.status): + return +``` + +But both "completed" and "recoverable" are terminal, so the check passes and the overwrite happens. + +**Impact:** +- Task appears "recoverable" when it actually completed successfully +- User may unnecessarily resume a task that already finished +- Subagent status reconciliation may incorrectly mark the instance as "idle" instead of respecting the completed state + +**Reproduction:** +1. Start a background agent task +2. Task completes and writes `status = "completed"` to disk +3. Before `reconcile()` runs, the manager's `_live_agent_tasks` dict is cleared (e.g., process restart) +4. `recover()` is called, sees the task as "running" (stale view), marks it "recoverable" +5. The authoritative "completed" status is lost + +**The Fix:** +```python +def _recover_agent_view(self, view: TaskView, *, now: float, live_agent_ids: set[str]) -> None: + agent_id_raw = (view.spec.kind_payload or {}).get("agent_id") + agent_id = agent_id_raw if isinstance(agent_id_raw, str) else None + + # Re-read under lock to get authoritative current state + with self._store._runtime_lock(view.spec.id): + runtime = self._store.read_runtime(view.spec.id) + if is_terminal_status(runtime.status): + # Already terminal, just reconcile subagent status + self._reconcile_subagent_status(agent_id, runtime.status, live_agent_ids) + return + + if view.spec.id in self._live_agent_tasks: + return + + # Mark as recoverable/lost + runtime.finished_at = now + runtime.updated_at = now + runtime.status = "recoverable" if agent_id is not None else "lost" + runtime.failure_reason = ( + "In-process background agent is no longer running; resume the stored agent " + f"instance {agent_id} to continue." + if agent_id is not None + else "In-process background agent is no longer running" + ) + self._store._write_runtime_unlocked(view.spec.id, runtime) + + self._reconcile_subagent_status(agent_id, runtime.status, live_agent_ids) +``` + +--- + +## HIGH: Overengineering & YAGNI Violations + +### 1. Speculative Abstract Retriever Class + +**Location:** `src/pythinker_code/memory/retriever.py:49-52` + +**The Problem:** The `Retriever` abstract base class defines a single `retrieve` method, but only one implementation (`LexicalRetriever`) exists. This is speculative infrastructure - an abstraction with no second implementation to justify it. + +**The Code:** +```python +class Retriever(ABC): + @abstractmethod + async def retrieve(self, query: RecallQuery, budget_tokens: int) -> list[RankedBlock]: ... + +class LexicalRetriever(Retriever): + """Hand-rolled BM25 + recency decay + label/path boost. Stdlib only.""" + # ... only implementation +``` + +**The Fix:** Remove the abstract class and make `LexicalRetriever` a concrete class. If a second retriever is needed later, introduce the abstraction then. + +```python +class LexicalRetriever: + """Hand-rolled BM25 + recency decay + label/path boost. Stdlib only.""" + + async def retrieve(self, query: RecallQuery, budget_tokens: int) -> list[RankedBlock]: + # ... implementation +``` + +--- + +### 2. Duplicated Background Task State Management + +**Location:** `src/pythinker_code/background/manager.py:817-918` + +**The Problem:** Four nearly identical methods (`_mark_task_running`, `_mark_task_completed`, `_mark_task_failed`, `_mark_task_timed_out`, `_mark_task_killed`) share the same lock→read→check→write→telemetry skeleton. This violates DRY and creates maintenance burden. + +**The Code:** +```python +def _mark_task_completed(self, task_id: str) -> None: + with self._store._runtime_lock(task_id): + runtime = self._store.read_runtime(task_id) + if is_terminal_status(runtime.status): + return + runtime.status = "completed" + runtime.updated_at = time.time() + runtime.finished_at = runtime.updated_at + runtime.failure_reason = None + self._store._write_runtime_unlocked(task_id, runtime) + # telemetry... + +def _mark_task_failed(self, task_id: str, reason: str) -> None: + with self._store._runtime_lock(task_id): + runtime = self._store.read_runtime(task_id) + if is_terminal_status(runtime.status): + return + runtime.status = "failed" + runtime.updated_at = time.time() + runtime.finished_at = runtime.updated_at + runtime.failure_reason = reason + self._store._write_runtime_unlocked(task_id, runtime) + # telemetry... + +# ... 3 more similar methods +``` + +**The Fix:** Extract a single generic method: + +```python +def _mark_task_terminal( + self, + task_id: str, + status: TaskStatus, + *, + reason: str | None = None, + interrupted: bool = False, + timed_out: bool = False, +) -> None: + with self._store._runtime_lock(task_id): + runtime = self._store.read_runtime(task_id) + if is_terminal_status(runtime.status): + return + runtime.status = status + runtime.updated_at = time.time() + runtime.finished_at = runtime.updated_at + runtime.failure_reason = reason + runtime.interrupted = interrupted + runtime.timed_out = timed_out + self._store._write_runtime_unlocked(task_id, runtime) + + # Telemetry + if runtime.started_at and runtime.finished_at: + from pythinker_code.telemetry import track + duration = runtime.finished_at - runtime.started_at + track( + "background_task_completed", + success=(status == "completed"), + duration_s=duration, + reason="timeout" if timed_out else ("killed" if interrupted else ("error" if status == "failed" else None)), + ) + +# Then the specific methods become one-liners: +def _mark_task_completed(self, task_id: str) -> None: + self._mark_task_terminal(task_id, "completed") + +def _mark_task_failed(self, task_id: str, reason: str) -> None: + self._mark_task_terminal(task_id, "failed", reason=reason) + +def _mark_task_timed_out(self, task_id: str, reason: str) -> None: + self._mark_task_terminal(task_id, "failed", reason=reason, interrupted=True, timed_out=True) + +def _mark_task_killed(self, task_id: str, reason: str) -> None: + self._mark_task_terminal(task_id, "killed", reason=reason, interrupted=True) +``` + +--- + +### 3. Fragile Markdown Table Repair Heuristics + +**Location:** `src/pythinker_code/ui/shell/components/markdown.py` (400+ lines across multiple functions) + +**The Problem:** Complex heuristic logic for repairing malformed markdown tables is brittle and requires ongoing maintenance as LLM output patterns change. The code attempts to fix common table formatting errors but lacks clear contracts for what constitutes "valid" vs "repairable" input. + +**Evidence:** Architecture scout identified this as a "fragility risk" - changes to LLM output patterns could require ongoing maintenance. + +**The Fix:** Two options: + +**Option A (Simpler):** Accept that LLM-generated tables may be malformed and render them as-is, letting the user see the raw output. Document the limitation. + +**Option B (More robust):** Define a strict contract for table validation and repair, with clear test cases for each heuristic. Move the repair logic to a separate, well-tested module with explicit input/output examples. + +Recommend **Option A** for simplicity - the repair heuristics are solving a problem that may not be worth the complexity. + +--- + +## MEDIUM: Elegance & Simplicity Violations + +### 1. Unreachable Null Guard in Retriever + +**Location:** `src/pythinker_code/memory/retriever.py:66` + +**The Problem:** The `if n` guard is unreachable because the function already returns early on empty candidates at line 62. + +**The Code:** +```python +async def retrieve(self, query: RecallQuery, budget_tokens: int) -> list[RankedBlock]: + if not self._candidates or budget_tokens <= 0: + return [] + docs = [_tokenize(c.content + " " + c.title) for c in self._candidates] + n = len(docs) + avgdl = sum(len(d) for d in docs) / n if n else 0.0 # ← `if n` is unreachable +``` + +**The Fix:** +```python +avgdl = sum(len(d) for d in docs) / n +``` + +--- + +### 2. Manual Term-Frequency Dictionary Building + +**Location:** `src/pythinker_code/memory/retriever.py:76-78` + +**The Problem:** Manual loop to build a term-frequency dict when `collections.Counter` does this in one line. + +**The Code:** +```python +tf: dict[str, int] = {} +for term in doc: + tf[term] = tf.get(term, 0) + 1 +``` + +**The Fix:** +```python +from collections import Counter +tf = Counter(doc) +``` + +--- + +### 3. Dead Bool Return in Worker Finish Callback + +**Location:** `src/pythinker_code/background/worker.py:250-274` + +**The Problem:** The `finish_runtime` callback always returns `True`, but the return value is used to decide whether to write the runtime. Since it always returns `True`, the return is dead code. + +**The Code:** +```python +def finish_runtime(runtime: TaskRuntime) -> bool: + runtime.finished_at = time.time() + # ... mutations ... + return True # ← always True, never False + +store.update_runtime(task_id, finish_runtime) +``` + +**The Fix:** Either: +- Remove the return and change `update_runtime` to always write +- Or make the return meaningful (e.g., return False if the runtime is already terminal) + +--- + +### 4. Trivial Wrapper Method + +**Location:** `src/pythinker_code/background/manager.py:148-155` + +**The Problem:** `active_task_count()` is a trivial one-line wrapper around `_active_task_count()`. One of them is unnecessary. + +**The Code:** +```python +def _active_task_count(self) -> int: + return sum( + 1 for view in self._store.list_views() if not is_terminal_status(view.runtime.status) + ) + +def active_task_count(self) -> int: + """Return the number of non-terminal background tasks.""" + return self._active_task_count() +``` + +**The Fix:** Merge into a single public method: +```python +def active_task_count(self) -> int: + """Return the number of non-terminal background tasks.""" + return sum( + 1 for view in self._store.list_views() if not is_terminal_status(view.runtime.status) + ) +``` + +--- + +### 5. Heavy isinstance Guard Chains on Controlled JSON + +**Location:** `src/pythinker_code/auth/opencode_go.py:179-193, 196-228` + +**The Problem:** Extensive `isinstance` checks on JSON API responses that are controlled by the API contract. The code is defensively checking every level of the JSON structure when it could use try/except or Pydantic validation. + +**The Code:** +```python +def _extract_model_ids(data: object) -> list[str]: + if not isinstance(data, dict): + return [] + raw_items = cast(dict[str, Any], data).get("data") + if not isinstance(raw_items, list): + return [] + ids: list[str] = [] + for item in cast(list[Any], raw_items): + if not isinstance(item, dict): + continue + model_id = cast(dict[str, Any], item).get("id") + if isinstance(model_id, str) and model_id: + ids.append(model_id) + return ids +``` + +**The Fix:** Use Pydantic models to validate the API response structure: +```python +from pydantic import BaseModel + +class ModelsResponse(BaseModel): + data: list[ModelItem] + +class ModelItem(BaseModel): + id: str + +def _extract_model_ids(data: object) -> list[str]: + try: + response = ModelsResponse.model_validate(data) + return [item.id for item in response.data] + except (ValidationError, TypeError): + return [] +``` + +--- + +## Architectural Coherence Issues + +### Cross-Process Locking Complexity + +**Location:** `src/pythinker_code/background/store.py:116-137` + +**The Problem:** The background task system uses cross-process file locking (`fcntl.flock`) to coordinate between the manager and worker processes. This is necessary but adds significant complexity, and race conditions in this area could be difficult to debug. + +**Assessment:** This is **acceptable complexity** given the requirement for cross-process coordination, but the locking logic should be better documented and tested. + +**Recommendation:** Add integration tests that specifically exercise the locking behavior under concurrent access. + +--- + +## Positive Findings + +The following areas were flagged as **well-designed** despite initial suspicion: + +1. **API error classification** (`opencode_go.py:137-174`) - Necessary complexity for telemetry, properly exposed for testing +2. **StrReplaceFile tool** (`tools/file/replace.py`) - Clean validation logic, correct batch edit handling +3. **Background task store** (`background/store.py`) - Proper use of atomic writes, good fallback handling + +--- + +## Recommendations Summary + +### Immediate (Critical) +1. **Fix the race condition** in `_recover_agent_view` by holding the lock during the entire read-check-write sequence + +### Short-term (High Value) +2. **Remove the abstract Retriever class** - YAGNI violation with no second implementation +3. **Consolidate the `_mark_task_*` methods** - Reduce duplication and maintenance burden +4. **Simplify markdown table handling** - Either document limitations or add proper contracts/tests + +### Medium-term (Elegance) +5. **Remove unreachable code** (null guard in retriever) +6. **Use `collections.Counter`** for term-frequency counting +7. **Remove dead return values** (finish_runtime callback) +8. **Merge trivial wrapper methods** (active_task_count) +9. **Use Pydantic for API validation** instead of manual isinstance chains + +--- + +## Files Analyzed + +- `src/pythinker_code/auth/opencode_go.py` +- `src/pythinker_code/background/manager.py` +- `src/pythinker_code/background/store.py` +- `src/pythinker_code/background/worker.py` +- `src/pythinker_code/memory/consolidation.py` +- `src/pythinker_code/memory/retriever.py` +- `src/pythinker_code/soul/pythinkersoul.py` +- `src/pythinker_code/soul/toolset.py` +- `src/pythinker_code/tools/file/replace.py` +- `src/pythinker_code/ui/shell/components/markdown.py` +- `src/pythinker_code/ui/shell/components/report.py` + +--- + +**Report generated by:** Pythinker Deep Code Scan +**Agents:** architecture-scout, bug-hunter, simplicity-reviewer, overengineering-scanner +**Cross-validated against:** Live code reads and manual inspection diff --git a/CHANGELOG.md b/CHANGELOG.md index cfde48c..bda9e5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,21 @@ GitHub Releases page; `0.8.0` is the new starting line. ## Unreleased +- **Fresh releases surface in the startup update prompt immediately.** The pre-start + update prompt now revalidates a cached "already current" answer with a bounded + conditional request instead of waiting for the 24-hour background-check throttle. + Release promotion also waits for exact native assets, PyPI, and the Homebrew tap + before marking a version latest, so installed clients no longer resolve a release + before its install channel is ready. +- **Background agent recovery preserves terminal task state.** Recovery now re-reads + task runtime under the store update lock before marking orphaned agent work + recoverable, so a stale snapshot can no longer clobber a task that completed. +- **Project memory recall keeps the simple lexical path.** The unused SQLite FTS + retriever seam and dead recall path argument are removed; collaborators now use + a public memory-store root accessor instead of reaching into private methods. +- **Shell prompt echoes and background todo rows are clearer.** Transcript echoes now + show the resolved submitted text for pasted-content placeholders, and background + todo rows align continuation icons with the first row. - **StrReplaceFile now refuses ambiguous single replacements.** A non-`replace_all` edit now errors when `old` matches more than once instead of silently editing the first match. Add surrounding context to make the old string unique, or pass `replace_all=true` when every From 1faa86df8ea7124722e110b7f805b8ca60ec48aa Mon Sep 17 00:00:00 2001 From: mohamed-elkholy95 Date: Sat, 30 May 2026 23:09:31 -0400 Subject: [PATCH 8/8] test: update reliability regressions --- tests/core/test_default_agent.py | 3 ++- tests/telemetry/test_instrumentation.py | 25 +++++++++++++++----- tests/test_release_update_pipeline.py | 31 +++++++++++++++---------- 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/tests/core/test_default_agent.py b/tests/core/test_default_agent.py index 13c459a..2b8a9fd 100644 --- a/tests/core/test_default_agent.py +++ b/tests/core/test_default_agent.py @@ -97,7 +97,8 @@ async def test_default_agent(runtime: Runtime): - No "flexibility" or "configurability" that wasn't requested. - No error handling for impossible scenarios; validate at boundaries only. - If a 200-line draft could be 50 lines, rewrite it before showing it. -- Self-check: *"Would a senior engineer call this overcomplicated?"* If yes, simplify. +- Over-fragmentation is overcomplication too: don't scatter logic across many tiny files or extra abstraction layers to satisfy a design pattern. Match the codebase's existing granularity. +- Self-check: *"Would a senior engineer call this overcomplicated or over-engineered?"* If yes, simplify. **3. Goal-driven execution — define success criteria, then loop until verified.** - Transform vague tasks into verifiable goals before writing code: diff --git a/tests/telemetry/test_instrumentation.py b/tests/telemetry/test_instrumentation.py index b9057fa..ab43330 100644 --- a/tests/telemetry/test_instrumentation.py +++ b/tests/telemetry/test_instrumentation.py @@ -416,6 +416,22 @@ def test_queue_overflow_preserves_newest(self): # --------------------------------------------------------------------------- +def _mock_store_for(runtime: Any) -> MagicMock: + """A mock BackgroundTaskStore whose ``update_runtime`` runs the callback and + returns the (mutated) runtime, mirroring the real read-modify-write store so + the manager's ``_transition_status`` path is actually exercised.""" + store = MagicMock() + store.read_runtime.return_value = runtime + + def _update(task_id: str, update_fn: Any) -> Any: + rt = store.read_runtime(task_id) + update_fn(rt) + return rt + + store.update_runtime.side_effect = _update + return store + + class TestEventPropertyCorrectness: """Verify specific events carry the right property types and values.""" @@ -608,8 +624,7 @@ def test_background_task_no_event_without_start_time(self): from pythinker_code.background.models import TaskRuntime runtime = TaskRuntime(status="running", started_at=None) - mock_store = MagicMock() - mock_store.read_runtime.return_value = runtime + mock_store = _mock_store_for(runtime) manager = object.__new__(BackgroundTaskManager) manager._store = mock_store @@ -626,8 +641,7 @@ def test_mark_task_killed_emits_completed_event(self): runtime = TaskRuntime(status="running", started_at=1000.0) - mock_store = MagicMock() - mock_store.read_runtime.return_value = runtime + mock_store = _mock_store_for(runtime) manager = object.__new__(BackgroundTaskManager) manager._store = mock_store @@ -647,8 +661,7 @@ def test_mark_task_killed_no_event_without_start_time(self): from pythinker_code.background.models import TaskRuntime runtime = TaskRuntime(status="running", started_at=None) - mock_store = MagicMock() - mock_store.read_runtime.return_value = runtime + mock_store = _mock_store_for(runtime) manager = object.__new__(BackgroundTaskManager) manager._store = mock_store diff --git a/tests/test_release_update_pipeline.py b/tests/test_release_update_pipeline.py index 131c170..b2c281d 100644 --- a/tests/test_release_update_pipeline.py +++ b/tests/test_release_update_pipeline.py @@ -36,7 +36,7 @@ def test_release_is_promoted_only_after_update_assets_are_ready() -> None: # Promotion clears prerelease AND marks latest, and only after the wait. assert "-F prerelease=false" in promote_workflow assert "-f make_latest=true" in promote_workflow - assert promote_workflow.index("Wait for all release assets") < promote_workflow.index( + assert promote_workflow.index("Wait for install-channel readiness") < promote_workflow.index( "-F prerelease=false" ) @@ -67,15 +67,22 @@ def test_install_scripts_gate_on_asset_readiness() -> None: def test_release_asset_wait_covers_all_updater_channels() -> None: promote_workflow = (WORKFLOWS / "promote-release.yml").read_text() - for expected_asset_fragment in ( - "PythinkerSetup-", - "_amd64.deb", - "_arm64.deb", - ".x86_64.rpm", - ".aarch64.rpm", - "x86_64-unknown-linux-gnu.tar.gz", - "aarch64-unknown-linux-gnu.tar.gz", - "aarch64-apple-darwin.tar.gz", - "x86_64-apple-darwin.tar.gz", + for expected_readiness_marker in ( + "PythinkerSetup-${version}.exe.sha256", + "pythinker-code_${version}_amd64.deb.sha256", + "pythinker-code_${version}_arm64.deb.sha256", + "pythinker-code-${version}.x86_64.rpm.sha256", + "pythinker-code-${version}.aarch64.rpm.sha256", + "pythinker-${version}-x86_64-unknown-linux-gnu.tar.gz.sha256", + "pythinker-${version}-aarch64-unknown-linux-gnu.tar.gz.sha256", + "pythinker-${version}-aarch64-apple-darwin.tar.gz.sha256", + "pythinker-${version}-x86_64-apple-darwin.tar.gz.sha256", + "pythinker-${version}-x86_64-unknown-linux-gnu-onedir.tar.gz.sha256", + "pythinker-${version}-aarch64-unknown-linux-gnu-onedir.tar.gz.sha256", + "pythinker-${version}-aarch64-apple-darwin-onedir.tar.gz.sha256", + "pythinker-${version}-x86_64-apple-darwin-onedir.tar.gz.sha256", + "https://pypi.org/pypi/pythinker-code/${version}/json", + "raw.githubusercontent.com/TechMatrix-labs/homebrew-pythinker", + 'version \\"${version}\\"', ): - assert expected_asset_fragment in promote_workflow + assert expected_readiness_marker in promote_workflow