diff --git a/CHANGELOG.md b/CHANGELOG.md index 1aff7ee8..cfde48c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,11 @@ GitHub Releases page; `0.8.0` is the new starting line. ## Unreleased +- **StrReplaceFile now refuses ambiguous single replacements.** A non-`replace_all` edit now + errors when `old` matches more than once instead of silently editing the first match. Add + surrounding context to make the old string unique, or pass `replace_all=true` when every + occurrence should change. + ## 0.26.0 (2026-05-30) ### What changed in this release diff --git a/src/pythinker_code/background/manager.py b/src/pythinker_code/background/manager.py index 7c1e74d9..6732e4cc 100644 --- a/src/pythinker_code/background/manager.py +++ b/src/pythinker_code/background/manager.py @@ -262,15 +262,19 @@ def create_bash_task( self._store.write_runtime(task_id, runtime) raise - runtime = self._store.read_runtime(task_id) - if runtime.finished_at is None and ( - runtime.status == "created" - or (runtime.status == "starting" and runtime.worker_pid is None) - ): + def mark_worker_started(runtime: TaskRuntime) -> bool: + if runtime.finished_at is not None: + return False + if runtime.status != "created" and not ( + runtime.status == "starting" and runtime.worker_pid is None + ): + return False runtime.status = "starting" runtime.worker_pid = worker_pid runtime.updated_at = time.time() - self._store.write_runtime(task_id, runtime) + return True + + self._store.update_runtime(task_id, mark_worker_started) view = self._store.merged_view(task_id) self._journal_task_milestone("background task started", view) return view diff --git a/src/pythinker_code/background/store.py b/src/pythinker_code/background/store.py index cf4397e5..d628713a 100644 --- a/src/pythinker_code/background/store.py +++ b/src/pythinker_code/background/store.py @@ -6,6 +6,7 @@ import re import shutil import time +from collections.abc import Callable from contextlib import contextmanager from pathlib import Path @@ -139,6 +140,18 @@ def write_runtime(self, task_id: str, runtime: TaskRuntime) -> None: with self._runtime_lock(task_id): self._write_runtime_unlocked(task_id, runtime) + def update_runtime(self, task_id: str, update: Callable[[TaskRuntime], bool]) -> TaskRuntime: + """Apply a read-modify-write update under the runtime lock. + + ``update`` mutates the current runtime and returns true when it should + be written back. The current runtime is returned either way. + """ + with self._runtime_lock(task_id): + runtime = self.read_runtime(task_id) + if update(runtime): + self._write_runtime_unlocked(task_id, runtime) + return runtime + def _write_runtime_unlocked(self, task_id: str, runtime: TaskRuntime) -> None: """Write runtime without acquiring the per-task lock (caller holds it).""" path = self.runtime_path(task_id) diff --git a/src/pythinker_code/background/worker.py b/src/pythinker_code/background/worker.py index bac930b7..62f80fa0 100644 --- a/src/pythinker_code/background/worker.py +++ b/src/pythinker_code/background/worker.py @@ -12,7 +12,7 @@ from pythinker_code.utils.logging import logger from pythinker_code.utils.subprocess_env import get_clean_env -from .models import TaskControl +from .models import TaskControl, TaskRuntime from .store import BackgroundTaskStore @@ -112,9 +112,9 @@ async def _terminate_process(force: bool = False) -> None: async def _check_output_limit() -> None: """Terminate the task if its output.log grew past ``max_output_bytes``. - Writes a single marker line and records a failure the first time the - limit is hit; the ``output_limit_exceeded`` guard keeps it from - re-marking on subsequent polls or once the process is already exiting. + Writes a single marker line and asks the process to terminate the first + time the limit is hit; the final runtime write records the failure once + the process has exited. """ nonlocal output_limit_exceeded, output_limit_reason if max_output_bytes <= 0 or output_limit_exceeded: @@ -133,14 +133,6 @@ async def _check_output_limit() -> None: marker = f"\n... output limit exceeded ({size} bytes); task terminated ...\n" with contextlib.suppress(OSError), output_path.open("ab") as marker_file: marker_file.write(marker.encode("utf-8")) - with store._runtime_lock(task_id): # pyright: ignore[reportPrivateUsage] - current = store.read_runtime(task_id) - if not current.finished_at: - current.status = "failed" - current.interrupted = True - current.failure_reason = output_limit_reason - current.updated_at = time.time() - store._write_runtime_unlocked(task_id, current) # pyright: ignore[reportPrivateUsage] await _terminate_process(force=False) async def _control_loop() -> None: @@ -211,7 +203,6 @@ async def _input_loop() -> None: runtime.updated_at = time.time() runtime.heartbeat_at = runtime.updated_at store.write_runtime(task_id, runtime) - last_known_runtime = runtime heartbeat_task = asyncio.create_task(_heartbeat_loop()) control_task = asyncio.create_task(_control_loop()) @@ -254,29 +245,32 @@ async def _input_loop() -> None: with contextlib.suppress(asyncio.CancelledError): await task - runtime = last_known_runtime.model_copy() control = store.read_control(task_id) - runtime.finished_at = time.time() - runtime.updated_at = runtime.finished_at - runtime.exit_code = returncode - runtime.heartbeat_at = runtime.finished_at - if output_limit_exceeded: - runtime.status = "failed" - runtime.interrupted = True - runtime.failure_reason = output_limit_reason - elif timed_out: - runtime.status = "failed" - runtime.interrupted = True - runtime.timed_out = True - runtime.failure_reason = timeout_reason - elif control.kill_requested_at is not None: - runtime.status = "killed" - runtime.interrupted = True - runtime.failure_reason = control.kill_reason or "Killed" - elif returncode == 0: - runtime.status = "completed" - runtime.failure_reason = None - else: - runtime.status = "failed" - runtime.failure_reason = f"Command failed with exit code {returncode}" - store.write_runtime(task_id, runtime) + + def finish_runtime(runtime: TaskRuntime) -> bool: + runtime.finished_at = time.time() + runtime.updated_at = runtime.finished_at + runtime.exit_code = returncode + runtime.heartbeat_at = runtime.finished_at + if output_limit_exceeded: + runtime.status = "failed" + runtime.interrupted = True + runtime.failure_reason = output_limit_reason + elif timed_out: + runtime.status = "failed" + runtime.interrupted = True + runtime.timed_out = True + runtime.failure_reason = timeout_reason + elif control.kill_requested_at is not None: + runtime.status = "killed" + runtime.interrupted = True + runtime.failure_reason = control.kill_reason or "Killed" + elif returncode == 0: + runtime.status = "completed" + runtime.failure_reason = None + else: + runtime.status = "failed" + runtime.failure_reason = f"Command failed with exit code {returncode}" + return True + + store.update_runtime(task_id, finish_runtime) diff --git a/src/pythinker_code/memory/consolidation.py b/src/pythinker_code/memory/consolidation.py index 380b2813..9381392e 100644 --- a/src/pythinker_code/memory/consolidation.py +++ b/src/pythinker_code/memory/consolidation.py @@ -28,6 +28,10 @@ def _safe_id(value: str) -> str: return re.sub(r"[^a-z0-9_-]", "-", value.lower())[:32].strip("-") or "candidate" +def _memory_entry_hash(content: str) -> str: + return content_hash(tier="memory", title=content[:60], body=content) + + async def inbox_dir(store: ProjectMemoryStore) -> Path: root = await store._ensure_dir() # pyright: ignore[reportPrivateUsage] path = root / "memory" / "inbox" @@ -53,16 +57,16 @@ async def generate_inbox_candidates( ) -> list[InboxCandidate]: """Stage scratch/journal candidates for approval-gated durable memory consolidation.""" existing_entries = [*await store.read_entries("memory"), *await store.read_entries("user")] - existing_hashes = { - content_hash(tier="memory", title=entry[:60], body=entry) for entry in existing_entries - } - staged = {candidate.content_hash for candidate in await list_inbox_candidates(store)} + existing_hashes = {_memory_entry_hash(entry) for entry in existing_entries} + inbox_candidates = await list_inbox_candidates(store) + staged = {candidate.content_hash for candidate in inbox_candidates} + staged.update(_memory_entry_hash(candidate.content) for candidate in inbox_candidates) directory = await inbox_dir(store) candidates: list[InboxCandidate] = [] for block in await gather_candidates(store, work_dir): if block.tier in {"memory", "user"}: continue - digest = content_hash(tier="memory", title=block.title, body=block.content) + digest = _memory_entry_hash(block.content) if digest in existing_hashes or digest in staged: continue candidate = InboxCandidate( @@ -74,7 +78,14 @@ async def generate_inbox_candidates( content_hash=digest, ) path = directory / f"{candidate.id}.json" - fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600) + try: + fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600) + except FileExistsError: + # A file with this id already exists on disk but was absent from + # ``staged`` (e.g. it is corrupt and was skipped during listing). + # Treat it as already staged instead of crashing the whole harvest. + staged.add(digest) + continue with os.fdopen(fd, "w", encoding="utf-8") as fh: json.dump(asdict(candidate), fh, ensure_ascii=False, indent=2) candidates.append(candidate) diff --git a/src/pythinker_code/memory/retriever.py b/src/pythinker_code/memory/retriever.py index aaef3dbc..c8f8fedd 100644 --- a/src/pythinker_code/memory/retriever.py +++ b/src/pythinker_code/memory/retriever.py @@ -3,14 +3,15 @@ import math import re import time +import unicodedata from abc import ABC, abstractmethod from dataclasses import dataclass, replace -_TOKEN_RE = re.compile(r"[a-z0-9]{2,}") +# Unicode word runs, with underscores treated as separators for snake_case. +_TOKEN_RE = re.compile(r"[^\W_]{2,}", re.UNICODE) _RECENCY_HALF_LIFE_DAYS = 14.0 _BM25_K1 = 1.5 _BM25_B = 0.75 -_LABEL_BOOST = 0.5 _PATH_BOOST = 0.5 @@ -19,7 +20,8 @@ def estimate_tokens(text: str) -> int: def _tokenize(text: str) -> list[str]: - return _TOKEN_RE.findall(text.lower()) + normalized = unicodedata.normalize("NFKC", text).casefold() + return _TOKEN_RE.findall(normalized) @dataclass(frozen=True, slots=True) @@ -67,7 +69,7 @@ async def retrieve(self, query: RecallQuery, budget_tokens: int) -> list[RankedB for term in set(doc): df[term] = df.get(term, 0) + 1 - q_terms = _tokenize(query.text) + q_terms = _tokenize(" ".join((query.text, *query.labels))) scored: list[RankedBlock] = [] for cand, doc in zip(self._candidates, docs, strict=True): dl = len(doc) or 1 @@ -91,8 +93,6 @@ async def retrieve(self, query: RecallQuery, budget_tokens: int) -> list[RankedB boost = 0.0 if any(path in cand.files for path in query.paths): boost += _PATH_BOOST - if set(query.labels) & set(cand.labels): - boost += _LABEL_BOOST if not q_terms and not query.paths and not query.labels: boost += 0.01 scored.append(replace(cand, score=bm25 * decay + boost)) diff --git a/src/pythinker_code/soul/pythinkersoul.py b/src/pythinker_code/soul/pythinkersoul.py index f0a25493..41c01b29 100644 --- a/src/pythinker_code/soul/pythinkersoul.py +++ b/src/pythinker_code/soul/pythinkersoul.py @@ -1616,6 +1616,70 @@ async def _compact_with_retry() -> CompactionResult: compaction_result = await _compact_with_retry() if not compaction_result.messages: raise RuntimeError("Compaction produced no messages; preserving existing history") + await self._context.clear() + await self._context.write_system_prompt(self._agent.system_prompt) + await self._checkpoint() + await self._context.append_message(compaction_result.messages) + estimated_token_count = compaction_result.estimated_token_count + summary_text = compact_summary_text(compaction_result.messages) + + if restore_context.messages: + await self._context.append_message(restore_context.messages) + estimated_token_count += estimate_text_tokens(restore_context.messages) + + if self._runtime.role == "root": + active_task_snapshot = build_active_task_snapshot(self._runtime.background_tasks) + if active_task_snapshot is not None: + active_task_message = Message( + role="user", + content=[ + system( + "The following background tasks are still active after compaction. " + "Use TaskList if you need to re-enumerate them later." + ), + TextPart(text=active_task_snapshot), + ], + ) + await self._context.append_message(active_task_message) + estimated_token_count += estimate_text_tokens([active_task_message]) + + post_compact_results = await self._hook_engine.trigger( + "PostCompact", + matcher_value=trigger_reason, + input_data=events.post_compact( + session_id=self._runtime.session.id, + cwd=str(Path.cwd()), + trigger=trigger_reason, + estimated_token_count=estimated_token_count, + compact_summary=summary_text, + ), + ) + session_start_results = await self._hook_engine.trigger( + "SessionStart", + matcher_value="compact", + input_data=events.session_start( + session_id=self._runtime.session.id, + cwd=str(Path.cwd()), + source="compact", + ), + ) + hook_context_message = build_hook_context_message( + result.additional_context + for result in [*post_compact_results, *session_start_results] + ) + if hook_context_message is not None: + await self._context.append_message(hook_context_message) + estimated_token_count += estimate_text_tokens([hook_context_message]) + + # Estimate token count so context_usage is not reported as 0% + await self._context.update_token_count(estimated_token_count) + + # Notify dynamic injection providers that history has been rebuilt so + # they can reset any one-shot throttling state. Failures are isolated + # per-provider so compaction completion (wire event + telemetry) is + # not affected by a buggy provider. + await self._notify_injection_providers_compacted() + except Exception: from pythinker_code.telemetry import track @@ -1626,70 +1690,11 @@ async def _compact_with_retry() -> CompactionResult: success=False, ) raise - await self._context.clear() - await self._context.write_system_prompt(self._agent.system_prompt) - await self._checkpoint() - await self._context.append_message(compaction_result.messages) - estimated_token_count = compaction_result.estimated_token_count - summary_text = compact_summary_text(compaction_result.messages) - - if restore_context.messages: - await self._context.append_message(restore_context.messages) - estimated_token_count += estimate_text_tokens(restore_context.messages) - - if self._runtime.role == "root": - active_task_snapshot = build_active_task_snapshot(self._runtime.background_tasks) - if active_task_snapshot is not None: - active_task_message = Message( - role="user", - content=[ - system( - "The following background tasks are still active after compaction. " - "Use TaskList if you need to re-enumerate them later." - ), - TextPart(text=active_task_snapshot), - ], - ) - await self._context.append_message(active_task_message) - estimated_token_count += estimate_text_tokens([active_task_message]) - - post_compact_results = await self._hook_engine.trigger( - "PostCompact", - matcher_value=trigger_reason, - input_data=events.post_compact( - session_id=self._runtime.session.id, - cwd=str(Path.cwd()), - trigger=trigger_reason, - estimated_token_count=estimated_token_count, - compact_summary=summary_text, - ), - ) - session_start_results = await self._hook_engine.trigger( - "SessionStart", - matcher_value="compact", - input_data=events.session_start( - session_id=self._runtime.session.id, - cwd=str(Path.cwd()), - source="compact", - ), - ) - hook_context_message = build_hook_context_message( - result.additional_context for result in [*post_compact_results, *session_start_results] - ) - if hook_context_message is not None: - await self._context.append_message(hook_context_message) - estimated_token_count += estimate_text_tokens([hook_context_message]) - - # Estimate token count so context_usage is not reported as 0% - await self._context.update_token_count(estimated_token_count) - - # Notify dynamic injection providers that history has been rebuilt so - # they can reset any one-shot throttling state. Failures are isolated - # per-provider so compaction completion (wire event + telemetry) is - # not affected by a buggy provider. - await self._notify_injection_providers_compacted() + finally: + # Always close the wire pair, even if the LLM call, context rewrite, + # hooks, or injection-provider notifications fail. + wire_send(CompactionEnd()) - wire_send(CompactionEnd()) wire_send(TextPart(text=restore_context.display_text())) from pythinker_code.telemetry import track diff --git a/src/pythinker_code/soul/toolset.py b/src/pythinker_code/soul/toolset.py index 890e55fc..1c60543a 100644 --- a/src/pythinker_code/soul/toolset.py +++ b/src/pythinker_code/soul/toolset.py @@ -525,8 +525,6 @@ async def load_mcp_tools( import fastmcp from fastmcp.mcp_config import MCPConfig, RemoteMCPServer - from pythinker_code.ui.shell.prompt import toast - async def _check_oauth_tokens(server_url: str) -> bool: """Check if OAuth tokens exist for the server.""" try: @@ -545,6 +543,8 @@ async def _check_oauth_tokens(server_url: str) -> bool: def _toast_mcp(message: str) -> None: if in_background: + from pythinker_code.ui.shell.prompt import toast + toast( message, duration=10.0, @@ -611,12 +611,6 @@ async def _connect(): results = await asyncio.gather(*tasks) if tasks else [] failed_servers = {name: error for name, error in results if error is not None} - for mcp_config in mcp_configs: - # Skip empty MCP configs (no servers defined) - if not mcp_config.mcpServers: - logger.debug("Skipping empty MCP config: {mcp_config}", mcp_config=mcp_config) - continue - if failed_servers: _toast_mcp("mcp connection failed") raise MCPRuntimeError(f"Failed to connect MCP servers: {failed_servers}") @@ -640,6 +634,9 @@ async def _connect(): status="pending", client=client, tools=[] ) + if not any(server_info.status == "pending" for server_info in self._mcp_servers.values()): + return + if in_background: self._mcp_loading_task = asyncio.create_task(_connect()) else: diff --git a/src/pythinker_code/tools/file/replace.md b/src/pythinker_code/tools/file/replace.md index fab28807..1e50810c 100644 --- a/src/pythinker_code/tools/file/replace.md +++ b/src/pythinker_code/tools/file/replace.md @@ -4,4 +4,5 @@ Replace specific strings within a specified file. - Only use this tool on text files. - Multi-line strings are supported. - Can specify a single edit or a list of edits in one call. +- Unless `replace_all` is true, the old string must match exactly once; add surrounding context if it is ambiguous. - You should prefer this tool over WriteFile tool and Shell `sed` command. diff --git a/src/pythinker_code/tools/file/replace.py b/src/pythinker_code/tools/file/replace.py index 47713ad6..1d939776 100644 --- a/src/pythinker_code/tools/file/replace.py +++ b/src/pythinker_code/tools/file/replace.py @@ -204,16 +204,53 @@ async def __call__(self, params: Params) -> ToolReturnValue: original_content = content edits = [params.edit] if isinstance(params.edit, Edit) else params.edit + if not edits: + return ToolError( + message="At least one edit is required.", + brief="No edits provided", + ) + + # Validate and apply the batch in memory first. If any edit is + # missing or ambiguous, return before writing so the file stays + # unchanged. + per_edit_counts: list[int] = [] + for index, edit in enumerate(edits, start=1): + if not edit.old: + return ToolError( + message=f"Edit {index}: old string cannot be empty.", + brief="Empty old string", + ) + if edit.old == edit.new: + return ToolError( + message=f"Edit {index}: old and new strings are identical.", + brief="No-op edit", + ) + + match_count = content.count(edit.old) + if match_count == 0: + return ToolError( + message=( + f"No replacements were made for edit {index}: " + f"old string {edit.old!r} was not found in the file." + ), + brief="No replacements made", + ) + if match_count > 1 and not edit.replace_all: + return ToolError( + message=( + f"Edit {index}: old string {edit.old!r} occurs {match_count} times. " + "Add surrounding context to make it unique, or set replace_all=true." + ), + brief="Ambiguous replacement", + ) - # Apply all edits - for edit in edits: + per_edit_counts.append(match_count if edit.replace_all else 1) content = self._apply_edit(content, edit) - # Check if any changes were made if content == original_content: return ToolError( - message="No replacements were made. The old string was not found in the file.", - brief="No replacements made", + message="Edits resulted in no file changes.", + brief="No changes", ) diff_blocks: list[DisplayBlock] = await build_diff_blocks( @@ -245,13 +282,8 @@ async def __call__(self, params: Params) -> ToolReturnValue: # Write the modified content back to the file await p.write_text(content, encoding="utf-8", errors="replace") - # Count changes for success message - total_replacements = 0 - for edit in edits: - if edit.replace_all: - total_replacements += original_content.count(edit.old) - else: - total_replacements += 1 if edit.old in original_content else 0 + # Count changes for success message (tallied per-edit during application). + total_replacements = sum(per_edit_counts) return ToolReturnValue( is_error=False, diff --git a/src/pythinker_code/ui/shell/__init__.py b/src/pythinker_code/ui/shell/__init__.py index 5ab0dc43..1e76db41 100644 --- a/src/pythinker_code/ui/shell/__init__.py +++ b/src/pythinker_code/ui/shell/__init__.py @@ -20,6 +20,7 @@ ) from rich import box from rich.console import Group, RenderableType +from rich.markup import escape from rich.panel import Panel from rich.table import Table from rich.text import Text @@ -900,7 +901,7 @@ def _can_auto_trigger_pending() -> bool: resume_prompt.set() continue if action.kind == InputAction.IGNORED: - console.print(f"[dim]{action.args}[/dim]") + console.print(f"[dim]{escape(str(action.args))}[/dim]") resume_prompt.set() continue @@ -1055,7 +1056,9 @@ def _handler(): console.print(render_message_response(output)) except Exception as e: logger.exception("Failed to run shell command:") - console.print(f"[{_get_tui_tokens().error}]Failed to run shell command: {e}[/]") + console.print( + f"[{_get_tui_tokens().error}]Failed to run shell command: {escape(str(e))}[/]" + ) finally: remove_sigint() @@ -1101,7 +1104,7 @@ async def _run_slash_command(self, command_call: SlashCommandCall) -> None: console.print(f"[{_get_tui_tokens().error}]Interrupted by user[/]") except Exception as e: logger.exception("Unknown error:") - console.print(f"[{_get_tui_tokens().error}]Unknown error: {e}[/]") + console.print(f"[{_get_tui_tokens().error}]Unknown error: {escape(str(e))}[/]") raise # re-raise unknown error async def run_soul_command(self, user_input: str | list[ContentPart]) -> bool: @@ -1248,7 +1251,7 @@ def _on_view_ready(view: Any) -> None: # actually unsupported input/mode should already be blocked by prompt session _t = _get_tui_tokens() logger.exception("LLM not supported:") - console.print(f"[{_t.error}]{e}[/]") + console.print(f"[{_t.error}]{escape(str(e))}[/]") except ChatProviderError as e: _t = _get_tui_tokens() logger.exception("LLM provider error:") @@ -1342,10 +1345,10 @@ def _on_view_ready(view: Any) -> None: "namespace in LM Studio's model browser — those have audited templates.[/dim]\n" "[dim] 3. Or override the template manually in LM Studio: " "[bold]My Models → model settings → Prompt Template[/bold].[/dim]\n" - f"[dim]Server: {e}[/dim]" + f"[dim]Server: {escape(str(e))}[/dim]" ) else: - console.print(f"[{_t.error}]LLM provider error: {e}[/]") + console.print(f"[{_t.error}]LLM provider error: {escape(str(e))}[/]") if not isinstance(e, APIStatusError) or e.status_code not in (401, 402, 403): console.print( "[dim]If this persists, run [bold]pythinker export[/bold] and send the " diff --git a/src/pythinker_code/ui/shell/export_import.py b/src/pythinker_code/ui/shell/export_import.py index e3abf0d5..8894c777 100644 --- a/src/pythinker_code/ui/shell/export_import.py +++ b/src/pythinker_code/ui/shell/export_import.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING from pythinker_host.path import HostPath +from rich.markup import escape from pythinker_code.ui.shell.console import console from pythinker_code.ui.shell.slash import ensure_pythinker_soul, registry, shell_mode_registry @@ -42,14 +43,14 @@ async def export(app: Shell, args: str): ) _t = _get_tui_tokens() if isinstance(result, str): - console.print(f"[{_t.warning}]{result}[/]") + console.print(f"[{_t.warning}]{escape(result)}[/]") return output, count = result from pythinker_code.telemetry import track track("export") - display = shorten_home(HostPath(str(output))) + display = escape(str(shorten_home(HostPath(str(output))))) console.print(f"[{_t.success}]Exported {count} messages to {display}[/]") console.print( f"[{_t.warning}]Note: The exported file may contain sensitive information. " @@ -95,7 +96,7 @@ async def import_context(app: Shell, args: str): max_context_size=max_context_size, ) if isinstance(result, str): - console.print(f"[{_t.error}]{result}[/]") + console.print(f"[{_t.error}]{escape(result)}[/]") return source_desc, content_len = result @@ -110,7 +111,7 @@ async def import_context(app: Shell, args: str): await soul.wire_file.append_message(TurnEnd()) console.print( - f"[{_t.success}]Imported context from {source_desc} " + f"[{_t.success}]Imported context from {escape(source_desc)} " f"({content_len} chars) into current session.[/]" ) if source_desc.startswith("file") and is_sensitive_file(Path(target).name): diff --git a/src/pythinker_code/ui/shell/session_picker.py b/src/pythinker_code/ui/shell/session_picker.py index 5af555c2..13a18b29 100644 --- a/src/pythinker_code/ui/shell/session_picker.py +++ b/src/pythinker_code/ui/shell/session_picker.py @@ -139,6 +139,11 @@ def _sync_radio_list(self) -> None: def _header_fragments(self) -> StyleAndTextTuples: scope_label = "current directory" if self._scope == "current" else "all directories" total = len(self._sessions) + if total == 0: + return [ + ("class:header.title", " SESSIONS "), + ("class:header.meta", f" [{scope_label}] "), + ] selected = self._radio_list._selected_index + 1 # pyright: ignore[reportPrivateUsage] return [ ("class:header.title", f" SESSIONS ({selected} of {total}) "), diff --git a/src/pythinker_code/ui/shell/setup.py b/src/pythinker_code/ui/shell/setup.py index e5b497c0..fce8fa6f 100644 --- a/src/pythinker_code/ui/shell/setup.py +++ b/src/pythinker_code/ui/shell/setup.py @@ -6,6 +6,7 @@ from prompt_toolkit import PromptSession from prompt_toolkit.shortcuts.choice_input import ChoiceInput from pydantic import SecretStr +from rich.markup import escape from pythinker_code.auth import PYTHINKER_CODE_PLATFORM_ID from pythinker_code.auth.platforms import ( @@ -61,8 +62,8 @@ async def setup_platform(platform: Platform) -> bool: _t = _get_tui_tokens() thinking_label = "on" if result.thinking else "off" console.print(f"[{_t.success}]✓ Setup complete![/]") - console.print(f" Platform: [bold]{result.platform.name}[/bold]") - console.print(f" Model: [bold]{result.selected_model.id}[/bold]") + console.print(f" Platform: [bold]{escape(result.platform.name)}[/bold]") + console.print(f" Model: [bold]{escape(result.selected_model.id)}[/bold]") console.print(f" Thinking: [bold]{thinking_label}[/bold]") console.print(" Reloading...") return True @@ -89,7 +90,7 @@ async def _setup_platform(platform: Platform) -> _SetupResult | None: models = await list_models(platform, api_key) except aiohttp.ClientResponseError as e: logger.error("Failed to get models: {error}", error=e) - console.print(f"[{_t.error}]Failed to get models: {e.message}[/]") + console.print(f"[{_t.error}]Failed to get models: {escape(e.message)}[/]") if e.status == 401 and platform.id != PYTHINKER_CODE_PLATFORM_ID: console.print( f"[{_t.warning}]Hint: If your API key was obtained from Pythinker, " @@ -98,7 +99,7 @@ async def _setup_platform(platform: Platform) -> _SetupResult | None: return None except Exception as e: logger.error("Failed to get models: {error}", error=e) - console.print(f"[{_t.error}]Failed to get models: {e}[/]") + console.print(f"[{_t.error}]Failed to get models: {escape(str(e))}[/]") return None # select the model diff --git a/src/pythinker_code/ui/shell/task_browser.py b/src/pythinker_code/ui/shell/task_browser.py index 52d8c88d..68abf0e4 100644 --- a/src/pythinker_code/ui/shell/task_browser.py +++ b/src/pythinker_code/ui/shell/task_browser.py @@ -13,6 +13,7 @@ from prompt_toolkit.widgets import Box, Frame, RadioList from rich import box from rich.console import Group +from rich.markup import escape from rich.panel import Panel from rich.text import Text @@ -272,7 +273,7 @@ def render() -> None: view = self._model.view_for(task_id) if view is None: _t = _get_tui_tokens() - console.print(f"[{_t.warning}]Task not found: {task_id}[/]") + console.print(f"[{_t.warning}]Task not found: {escape(task_id)}[/]") return with console.pager(styles=True): console.print(_build_full_output_renderable(view, self._model.full_output(task_id))) diff --git a/src/pythinker_code/ui/shell/usage.py b/src/pythinker_code/ui/shell/usage.py index 9d58a3ad..8431943c 100644 --- a/src/pythinker_code/ui/shell/usage.py +++ b/src/pythinker_code/ui/shell/usage.py @@ -5,6 +5,8 @@ import shlex from typing import TYPE_CHECKING +from rich.markup import escape + from pythinker_code.auth.platforms import parse_managed_provider_key from pythinker_code.config import LLMProvider from pythinker_code.soul.pythinkersoul import PythinkerSoul @@ -209,7 +211,7 @@ async def usage(app: Shell, args: str): try: tokens = shlex.split(args.strip()) except ValueError as e: - console.print(f"[{_t.error}]Invalid usage arguments: {e}[/]") + console.print(f"[{_t.error}]Invalid usage arguments: {escape(str(e))}[/]") return json_mode = "--json" in tokens diff --git a/tasks/agent-behavior-findings.md b/tasks/agent-behavior-findings.md deleted file mode 100644 index 428dd994..00000000 --- a/tasks/agent-behavior-findings.md +++ /dev/null @@ -1,124 +0,0 @@ -# Agent behaviour review — `.pythinker` + `.pythinker-review` - -Diagnosis from runtime artifacts (review run `20260522015318-650c6d67`, scratch -sessions May 27–28). Evidence-based; root causes confirmed in source. - -## Status (applied 2026-05-28) -- **R1 — applied** (`reviewers/common.py`): retry now relays the concrete - validation error + an explicit title-length nudge. Tests: - `test_retry_prompt_surfaces_previous_validation_error`. -- **R2 — applied** (`reviewers/schema.py`): `title` is truncated (≤80, ellipsis) - via a before-validator instead of hard-failing the whole `ReviewerOutput`. - Tests: `test_reviewer_output_truncates_overlong_title`, - `test_overlong_title_is_truncated_not_dropped`. -- **S1 — applied** (`scratchpad.py` + `cli/__init__.py`): `append_scratch_event_sync` - gained an idempotent `dedup_signature`; the CLI passes `source:` so a - relaunch no longer appends a duplicate "session start". Tests: - `test_session_start_event_is_idempotent_per_signature`. -- **R4 — applied** (`reviewers/prompts/*.system.md`): the `evidence_snippet` - placeholder now demands VERBATIM, character-for-character copying (no - paraphrase/ellipses) across all four reviewer prompts, so the model's output - passes the containment validator instead of being dropped per-finding. -- **S2 — applied** (`scratchpad.py`): labels now collapse by key — - single-valued keys (`session/workspace/ui/source/scope`) keep the latest - value, multi-valued keys (`kind`) keep the unique set. Kills - `source:startup | source:resume` noise while retaining `kind:todo | kind:agent`. - Tests: `test_session_labels_collapse_single_valued_keys`. (Note: the original - delimiter-injection worry was already mitigated by `_clean_event_text`, which - strips `|`/`\r`/`\n`; this change addresses the residual duplicate-key noise.) -- **R3 — deliberately NOT applied.** Gating the run when a chunk is unreviewable - (`malformed_output`) is *correct* fail-closed behaviour for a security tool; - making it non-gating would let a file pass review unreviewed. R1+R2+R4 instead - cut the failure *rate* so runs pass legitimately. -- **S3 — deliberately NOT applied.** Auto-pruning scratch files conflicts with - the module's documented design (`scratchpad.py` header: retained as history - "unless the user explicitly asks for cleanup"). S1 already removes the main - growth driver (duplicate session-starts); broader retention is a product - decision for the user, not a silent change. - -Original findings below. - -## Review subsystem (the background review subagent) — highest impact - -### R1 [High] Retry never relays the real validation error → systematic schema misses lose the whole chunk -`reviewers/common.py:69-92`. On a parse/validation failure the harness retries -once, but `_RETRY_SUFFIX` (common.py:15) only says *"your response was not valid -JSON … reply with strict JSON only."* The captured Pydantic error (`last_error`, -common.py:87) is **never fed back to the model**. So when the failure is a -*content* violation (e.g. a title > 80 chars — valid JSON, invalid schema), the -retry message is actively misleading and the model has no reason to change. Both -attempts fail → `ok=False` → **every finding in that chunk is discarded.** -- Evidence: meta `chunk_failures` — `tests/ui_and_conv/test_prompt_tips.py` - failed with *4* `String should have at most 80 characters` errors; the whole - chunk's findings were lost. -- Fix direction: append `last_error` to the retry prompt so the model can - self-correct. - -### R2 [High] No field-level coercion at ingest; one bad field nukes the chunk -`reviewers/schema.py:16` `title: str = Field(max_length=80)`. `ReviewerOutput` -is parsed all-or-nothing via `model_validate_json` (common.py:84), so a single -over-long title (or any one out-of-range field) fails the *entire* output and -drops all sibling findings in the chunk. Smaller models (run model: MiniMax -M2.7) hit the 80-char cap routinely. -- Fix direction: soft-coerce on ingest (truncate title to 80 with ellipsis) - instead of hard-failing; or parse findings individually so one bad finding - doesn't take the rest down. - -### R3 [Med] Model-formatting noise gates the entire run -`engine/runner.py:163` `failed=(not allow_partial) and bool(real_failures)`, and -`real_failures` excludes `validation_error` but **includes** `malformed_output` -(runner.py:156). Result: 2 of 54 chunks failing on model-formatting (long title, -truncated JSON) flipped the whole run to `status: "failed"` despite 42 valid -findings delivered. `malformed_output` is a model-quality issue like -`validation_error`, not a runtime failure (timeout/llm_error/worker_error). -- Fix direction: treat `malformed_output` as non-gating (like validation_error), - or add a distinct "delivered with model-output gaps" status. - -### R4 [Low / mostly positive] Evidence-snippet validation is graceful but mismatch rate is high -`reviewers/validation.py:_snippet_matches` already falls back rendered-diff → -whitespace-compacted → on-disk slice → compacted file (good; these drop -**per-finding**, not per-chunk — runner.py:101-115). But 7 findings still failed -evidence match, i.e. the model paraphrases snippets beyond whitespace. -- Fix direction: emphasise verbatim copying in the reviewer prompt, or add a - token-overlap fuzzy match as a last-resort tier. - -## Scratchpad / session memory (`.pythinker/scratch`) - -### S1 [Med] "session start" journaling is not idempotent -`cli/__init__.py:761-781` emits a `session start` event unconditionally on every -CLI init for the session, with no guard against an existing start block for the -same session id. -- Evidence: session `7f7a8039` has **4** `session start / source: startup` - entries in 3 min; `06ba6c38`, `7c6a9676`, `f42f6caa` each have duplicate - same-minute startups. Defeats the file's stated "compact" purpose. -- Fix direction: skip if the last event for the session id is already a - session-start within N seconds, or only emit once per genuine create/resume. - -### S2 [Med] Label line: duplicate keys unmerged + unsanitised free text → parse/injection risk -The `labels:` line is `key:value` joined by ` | `. -- Duplicate keys are appended, not merged: `source:startup | source:resume`, - multiple `kind:` values (`kind:todo | kind:agent-batch | kind:agent`). Recall - on labels sees conflicting values. -- The raw session title is embedded as a scope label: - `scope:Goal: perform a deep code scan analysis of the…` — contains a colon, - spaces, an ellipsis, and could contain `|` or a newline from an arbitrary - title, corrupting any `:`/`|` splitter. -- Evidence: session `b241579b` labels line. -- Fix direction: dedupe/merge by key; slugify or escape the title before using - it as a label value (or store title separately, not as a label). - -### S3 [Low] Unbounded scratch growth; no-op sessions persist boilerplate -Most `untitled` files are 558 bytes = header + a single session-start (sessions -that did nothing). Combined with S1, the dir grows unbounded with low-value -files; no pruning/retention observed. -- Fix direction: lazy-create the file on first substantive event, and/or prune - near-empty session files on a retention policy. - -## Positive — validates the enhancement direction -Background agent-batch + todo journaling is coherent. Session `b241579b`: -4 background agents (`review`, `security-reviewer`, `explore`, `verifier`) -tracked start→completed, todo progressing 0→6 done, agent-type captured -(`7c6a9676`: `agent-type:code-reviewer`, `agent started / mode: foreground`). -The SetTodoList + background-subagent foundation being enhanced is sound; the -gaps above are in the *journaling* and *review-output* layers, not the -orchestration itself. diff --git a/tasks/port-bk-box-agent-features.md b/tasks/port-bk-box-agent-features.md deleted file mode 100644 index 07058a43..00000000 --- a/tasks/port-bk-box-agent-features.md +++ /dev/null @@ -1,156 +0,0 @@ -# Port bk_box_main coding-agent features - -## Scope - -Port the practical coding-agent runtime features from `blackbox/bk_box_main` into Pythinker CLI. Treat VS Code-only Agent Manager UI as reference; port only CLI/runtime primitives unless a Pythinker UI task explicitly asks for more. - -## Overall status - -**Status: complete for this porting slice.** All planned phases below have implementation coverage and focused verification. Full-suite `make test-pythinker-code` was attempted but hung in a `Pythinker` subprocess; focused port tests and `make check-pythinker-code` passed. - -## Phase 0 — Safety and baseline ✅ Done - -- [x] Preserve existing uncommitted work; avoid drive-by rewrites. -- [x] Run focused tests before/after feature batches. -- [x] Keep provider-specific behavior scoped to active provider/model. - -## Phase 1 — Agent modes and hard permission profiles ✅ Done - -Source references: -- `blackbox/bk_box_main/packages/opencode/src/permission/*` -- `blackbox/bk_box_main/packages/opencode/src/tool/plan.ts` - -Pythinker targets: -- `src/pythinker_code/agents/default/*.yaml` -- `src/pythinker_code/soul/approval.py` -- `src/pythinker_code/approval_runtime/` -- `src/pythinker_code/tools/file/plan_mode.py` - -Work: -- [x] Add explicit runtime-enforced read-only, plan, ask, implement, review, and verify permission profiles. -- [x] Make plan mode hard-deny non-plan writes and dangerous shell mutations, not just prompt-deny. -- [x] Add tests for denied shell/write/edit paths per mode. - -Acceptance: -- [x] Plan/review/explore/verifier cannot mutate files through file tools or shell. -- [x] Implementer/coder retain write tools under normal approvals. - -## Phase 2 — Plan handoff workflow ✅ Done - -Source references: -- `blackbox/bk_box_main/packages/opencode/src/session/prompt/code-switch.txt` - -Pythinker targets: -- `src/pythinker_code/tools/plan/` -- `src/pythinker_code/soul/dynamic_injections/plan_mode.py` -- `src/pythinker_code/subagents/` - -Work: -- [x] After plan approval, generate a compact implementation handoff summary. -- [x] Offer/implement “continue here” vs “start implementation subagent/session” through the handoff text and Agent resume/start instructions. -- [x] Persist handoff with the plan artifact as a sibling `.handoff.md` file. - -Acceptance: -- [x] Plan output can be approved and injected into an implementer without losing discoveries. -- [x] Tests cover handoff generation and handoff persistence failure fallback. - -## Phase 3 — Orchestration and task runtime ✅ Done - -Source references: -- `blackbox/bk_box_main/packages/opencode/src/tool/task.ts` - -Pythinker targets: -- `src/pythinker_code/tools/agent/` -- `src/pythinker_code/background/` -- `src/pythinker_code/subagents/` - -Work: -- [x] Add first-class multi-agent task metadata: parent/child links, dependencies, budget/timeouts, synthesis state. -- [x] Add optional git worktree isolation intent for background agents. -- [x] Add resume/recover semantics robust across CLI restarts where possible. - -Acceptance: -- [x] Parent can launch several agents, inspect structured statuses, and synthesize results deterministically. -- [x] Background agent records survive process restart with clear `running/lost/recoverable` states. - -## Phase 4 — Context/session robustness ✅ Done - -Source references: -- `blackbox/bk_box_main/packages/opencode/src/session/compaction.ts` - -Pythinker targets: -- `src/pythinker_code/soul/context.py` -- `src/pythinker_code/soul/pythinkersoul.py` -- `src/pythinker_code/session.py` - -Work: -- [x] Add prompt queue protection for concurrent prompts/tool followups. -- [x] Harden compaction/recovery around malformed or partial LLM payloads. -- [x] Add clearer session list/search/export affordances if not already covered by existing UI/session surfaces. - -Acceptance: -- [x] Interrupted/failed compaction does not corrupt session history. -- [x] Queued user prompts preserve order and status. - -## Phase 5 — Code search and skills ✅ Done - -Source references: -- `blackbox/bk_box_main/packages/opencode/src/tool/warpgrep.ts` -- `blackbox/bk_box_main/packages/opencode/src/skill/*` - -Pythinker targets: -- `src/pythinker_code/tools/file/grep_local.py` -- `src/pythinker_code/skill/` -- `src/pythinker_code/soul/dynamic_injections/` - -Work: -- [x] Add a local “smart search” wrapper that plans multiple grep/glob passes and returns concise spans. -- [x] Keep graphify/context-mode semantic providers out of this slice; `SmartSearch` provides bounded local search behind a normal tool interface. -- [x] Keep explicit skill-load/use tooling deferred because prompt-loaded skills remain sufficient for this slice. - -Acceptance: -- [x] Explore agents can find relevant code with fewer broad shell calls. -- [x] Search results are bounded, cited, and safe for context. - -## Phase 6 — Provider/model UX ✅ Done - -Source references: -- `blackbox/bk_box_main/packages/opencode/src/provider/*` -- `blackbox/bk_box_main/packages/opencode/src/session/system.ts` - -Pythinker targets: -- `src/pythinker_code/llm.py` -- `src/pythinker_code/auth/` -- `src/pythinker_code/ui/shell/usage_adapters/` - -Work: -- [x] Keep model-specific prompts/capabilities explicit. -- [x] Preserve per-agent model/thinking/variant state through subagent cloning. -- [x] Improve fallback display for unavailable provider model lists where covered by existing provider fallback paths. - -Acceptance: -- [x] OpenCode Go/Kimi, Anthropic-compatible, OpenAI-compatible, and Pythinker providers retain correct per-model toggles in root and subagents. - -## Recommended first implementation slice ✅ Done - -- [x] Finish provider-thinking regression tests. -- [x] Introduce a small runtime permission profile abstraction. -- [x] Apply it to plan/explore/review/verifier first. -- [x] Add shell mutation-denial tests. -- [x] Add plan handoff in Phase 2. - -## Execution status — 2026-05-11 - -- [x] Phase 1: implemented runtime permission profiles and hard-deny checks for file mutations and mutating shell commands; focused tests pass. -- [x] Phase 2: implemented deterministic approved-plan handoff generation and sibling `.handoff.md` persistence with inline fallback on persistence failure; focused tests pass. -- [x] Phase 3: added background-agent orchestration metadata (`dependencies`, budget, synthesis state, isolation intent) and recoverable restart state for orphaned background agents; focused tests pass. -- [x] Phase 4: added prompt-turn serialization with a session-local lock and compaction empty-result guard so failed/malformed compaction does not clear existing history. -- [x] Phase 5: added `SmartSearch`, a bounded multi-pass local search wrapper, and exposed it to default/subagent specs; focused tests pass. -- [x] Phase 6: persisted subagent thinking state in launch specs and passed it through model cloning so resumed/per-agent model overrides preserve thinking toggles. - -## Verification - -- [x] `make check-pythinker-code` passed. -- [x] Focused port regression suite passed: `42 passed`. -- [x] Broader focused suite passed: `79 passed`. -- [ ] Full `make test-pythinker-code` was attempted but hung in a `Pythinker` subprocess; rerun or diagnose separately before release. diff --git a/tasks/tui-enhancement-plan.md b/tasks/tui-enhancement-plan.md deleted file mode 100644 index 736c0d1a..00000000 --- a/tasks/tui-enhancement-plan.md +++ /dev/null @@ -1,293 +0,0 @@ -# TUI / CLI Render Enhancement — Comprehensive Plan - -Single source of truth for the terminal-UI render overhaul. Merges the validated spacing -standardization (Track A, `63 passed` baseline) with the markdown, file/diff, and hardening -tracks surfaced by the render investigations. - -**Verified facts** (checked against the tree): `render_dialog()` exists at -`_dialog_shell.py:35`; Makefile has `check-pythinker-code` / `test-pythinker-code`; `uv` is -available; `markdown.py` uses a literal `Syntax(..., padding=(0,1))` (L58–63); `edit.py:117` -holds a stray `Text("")` while its second diff path (L161–162) already omits it. - ---- - -## Design principles (best practices applied throughout) - -> **Stream owns the gaps between blocks. Cards, panels, markdown, and code renderers own only -> their internal layout. The canonical blank row is `Text("")`; the prompt preamble uses a -> newline helper, not hand-written `endswith("\n")` guards.** - -1. **One source of truth per visual concept** — a gap, glyph, hint string, color, or padding - defined once and imported. No literals at call sites. -2. **Layered ownership of space** — never let two layers space the same seam (today's tinted- - card double/triple-gap). -3. **Theme tokens, never raw colors** — every style via `tui_rich_style(token)` / the - markdown palette. Zero `"grey50"` / `#hex` in render paths. -4. **One engine per job** — one diff renderer, one line-block helper, one truncation-hint - formatter, one syntax highlighter. -5. **Deterministic + snapshot-tested** — pure render (state in → renderable out); golden tests - prove refactors are behavior-preserving. -6. **Degrade gracefully** — `NO_COLOR`, `PYTHINKER_REDUCED_MOTION`, narrow widths; diffs use - `+`/`-` markers, not color alone (a11y). -7. **Surgical, collision-aware migration** — tests green at every step; read each file's - current (uncommitted) state before editing. - -Baseline that must stay green: -```bash -uv run pytest tests/ui_and_conv/test_empty_think_part_indicator.py \ - tests/ui_and_conv/test_tui_card_tool_renderers.py \ - tests/ui_and_conv/test_visualize_running_prompt.py::test_render_agent_status_uses_compose_agent_output_not_compose -q -# expect: 63 passed -``` - ---- - -# TRACK A — Spacing standardization (authoritative; do first) - -### A1. Add one spacing-primitive module -Create `src/pythinker_code/ui/shell/spacing.py`: -```python -from rich.text import Text - -BLANK_ROW = Text("") -STREAM_GAP_ROWS = 1 -SECTION_GAP_ROWS = 1 - -CARD_PADDING = (0, 1) # untinted/success cards (horizontal only) -TINTED_CARD_PADDING = (0, 1) # pending/error tinted cards (horizontal only) -DIALOG_PANEL_PADDING = (0, 1) -WORKLOG_PANEL_PADDING = (0, 1) -CODE_BLOCK_PADDING = (0, 1) - -def blank_row() -> Text: - return Text("") - -def append_gap(renderables: list, rows: int = 1) -> None: - for _ in range(rows): - renderables.append(blank_row()) - -def ensure_prompt_newline(fragments) -> None: - if fragments and not fragments[-1][1].endswith("\n"): - fragments.append(("", "\n")) -``` -**Rule:** no `Text(" ")` for vertical spacing — blank rows are `Text("")`. - -### A2. Live stream owns inter-block spacing — `visualize/_live_view.py` -- `_ACTION_SPACER = BLANK_ROW` (or drop it and call `blank_row()`). -- Keep: *the live stream owns spacing between* content block / tool card / retry row / - spinner / notification. Individual cards must not create external top/bottom spacing. -- Fixes the triple-row issue (tinted-pad-top + stream-spacer + tinted-pad-bottom). - -### A3. Remove vertical padding from tinted tool cards — `components/tool_execution.py` -- `Padding(body, (1, 1), style=bg_style)` → `Padding(body, TINTED_CARD_PADDING, style=bg_style)` - (`(0, 1)`). -- Result: pending/error keep the horizontal tint; no hidden top/bottom rows; gap between - cards becomes exactly the stream spacer → pending/error/success/bash cards consistent. - -### A4. Standardize intra-card spacing — `tool_renderers/edit.py` -- **Rule:** inside a tool card the `⎿` gutter is the separator; no blank rows between - header / summary / result / diff. -- Remove the `Text("")` at `edit.py:117` (between `change_summary_text` and `diff_frame`). - This matches `write.py` *and* edit.py's own second diff path (L161–162) → one rule across - all three diff renders. -- Keep truncation hints glued to bodies (already consistent). - -### A5. Centralize prompt-preamble newlines — `prompt.py` -- Replace every `if fragments and not fragments[-1][1].endswith("\n"): fragments.append(("","\n"))` - with `ensure_prompt_newline(fragments)`. -- One policy: agent-status / modal-body blocks are newline-terminated; card-style separator = - exactly one newline before the prompt label; non-card prompt = exactly one newline before - `❯`. -- **The prompt layer owns the final gap before input** — this resolves the spinner-verb ↔ - prompt seam (don't rely on whatever the last live action happened to be). This is the home - of the original "blank row under the spinner verb" request. - -### A6. Move modal panel section gaps into `render_dialog()` — `_dialog_shell.py` (+ callers) -- `render_dialog()` (`_dialog_shell.py:35`) becomes the only place that inserts: body→options, - options→footer, and body→footer (when no options) gaps, via `blank_row()`. -- Clean up callers `_approval_panel.py`, `_question_panel.py`, `_btw_panel.py` so they stop - sprinkling `Text("")`. **Rule:** panels decide sections semantically; `render_dialog()` - decides vertical rhythm. -- Keep `DIALOG_PANEL_PADDING = (0, 1)` (no top/bottom panel padding yet — preserve density). - -### A7. Markdown/code spacing stays internal — `components/markdown.py` -- Replace the literal `Syntax(..., padding=(0, 1))` (L58–63) with `CODE_BLOCK_PADDING`. -- Do not add external blank rows around markdown/code. **Rule:** markdown/code own internal - code-block padding only; stream/panels own external gaps. - -### A8. Tests for the standard (add with the migration) -1. `blank_row().plain == ""`; no canonical spacer uses `" "`. -2. Live stream: content+spinner = one blank row; tool+tool = one blank row; no `Text(" ")`. -3. Tool cards: pending/error/success add no extra blanks around body; tinted+spacer == - untinted+spacer vertical gap count. -4. Diff: edit.py and write.py render summary immediately followed by diff lines. -5. Prompt preamble: status block → prompt = exactly one newline boundary; card separator - appears once. -6. Dialogs: approval/question/footer gaps come from `render_dialog()`; callers add no blank - separators beyond semantic body rows. - -### A9. Migration order (collision-safe; tests after each step) -1. Add `spacing.py` + primitive tests. -2. Edit `_live_view.py` only → - `uv run pytest tests/ui_and_conv/test_empty_think_part_indicator.py -q` -3. Edit `tool_execution.py`, `edit.py` → - `uv run pytest tests/ui_and_conv/test_tui_card_tool_renderers.py -q` -4. Edit `_dialog_shell.py` + approval/question/btw callers → - `uv run pytest tests/ui_and_conv/test_question_panel.py tests/ui_and_conv/test_visualize_running_prompt.py -q` -5. Edit `prompt.py` last. -6. Broad gate: `make check-pythinker-code && make test-pythinker-code` - ---- - -# TRACK B — File / diff rendering unification - -(Markdown/code untouched here; this is tool-card file output and diffs.) - -### B1. Collapse two diff engines into one -- Today: `components/diff.py:render_diff` (plain, no highlight, used by tool cards) **and** - `utils/rich/diff_render.py:render_diff_panel` (Pygments + line numbers + inline highlight). -- Make the panel engine the single engine; `_file_diff.py:diff_frame` calls it in a - "compact" mode for inline cards. Removes duplicated line-number/marker/inline logic. -- **Wire the unused `width`** (`_file_diff.py:125` binds `_ = width`) so diffs respect - terminal width instead of implicit wrapping. - -### B2. Optional syntax highlighting in tool-card diffs/listings -- Add opt-in highlighting (lexer by extension via `utils/rich/syntax.py:PythinkerSyntax`) to - `format_numbered_lines_block` and the compact diff, behind a theme/setting flag, reusing - `_strip_background` so tints don't fight the card bg. - -### B3. Line-number + truncation consistency -- Unify the line-number style token: tool renderers use `"muted"`, panels use `"dim"` → pick - one. -- Unify line-number min-width (today 4 in `_render_utils`, 2 in `diff.py`/`diff_render.py`). -- One `EXPAND_HINT` formatter → `"… +{n} lines (ctrl+o to expand)"`; retire grep's - `"... (N more lines …)"` and standardize the `…` glyph. Collapse the 8 scattered limits - (write=10, grep=15, find=20, agent=6, ask_user=8, think=6, todo=12, bash=5) into a - principled small/medium/large set in a constants module. Document the - `__suppress_generic_expand_hint__` state flag. - -Verification: golden tests for a diff (add/remove/context/inline), a wrapped long line at -narrow width, and a truncated listing with the unified hint. - ---- - -# TRACK C — Markdown rendering polish - -Chain: `markdown-it` → `utils/rich/markdown.py:Markdown` → -`components/markdown.py:PythinkerMarkdown`. - -### C1. Theme-token bugs -- **H2 loses color** (`components/markdown.py:81`): add `color=heading` (matches H1/H3/H4). -- **Hardcoded `grey50`** in thinking paths (`visualize/_blocks.py`, 9+ sites) → a - `tui_rich_style("thinking_text")` token so dark/light apply. -- Distinguish `markdown.link` vs `markdown.link_url` styling (currently identical). - -### C2. Streaming correctness / perf -- Each flush re-parses the whole committed buffer (`_blocks.py:254`). Either parse only the - new slice, or wire the already-written-but-unused `PythinkerMarkdownStream` - (`components/markdown.py:192`); delete the dead path. -- Make hidden vs streamed thinking treatment consistent (plain `Text` vs Markdown). - -### C3. Code-block ergonomics -- Optional line numbers in fenced code (today `Syntax()` has none). -- (Padding already handled by Track A7 `CODE_BLOCK_PADDING`.) - -Verification: snapshots for H1–H4, lists, blockquote, table (wide→record), inline code, a -fenced python block; a streaming test asserting identical final render for 1-chunk vs -many-chunk arrival. - ---- - -# TRACK D — Cross-cutting hardening - -- **Theme-literal lint test:** fails if a render module imports a raw color name. -- **Render matrix:** parametrized snapshots at widths {40, 80, 120}, `NO_COLOR=1`, - `PYTHINKER_REDUCED_MOTION=1`. -- **Marker-glyph registry:** centralize `●`, `⎿`, `❯`, `├`/`└`, `⧉`, spinner frames so the - visual language lives in one place. - ---- - -## Overall sequencing & risk - -Order: **Track A → B → C → D.** A is the validated, highest-impact, lowest-risk track and -unblocks the rest. Each track = its own branch/commit set, tests green at every step. - -Risks / guards: -- **Snapshot tests are the safety net** — regenerate and *review the diff* each step; an - unreviewed snapshot update hides regressions. -- **Spinner-verb structural tests** (`test_empty_think_part_indicator.py`, - `test_live_view_notifications.py`) constrain where the under-gap lives (Track A5). -- **Parallel agent:** stashes touch only `packages/pythinker-review/*` + `config`/`slash` — - no overlap with these files. Target UI files carry uncommitted branch edits; layer on top. -- **Behavior-preserving claims** proven by before/after snapshot diff, never assumed. - -**Definition of done:** one `spacing.py` + one render-constants module imported everywhere; -one diff engine; zero raw color literals in render paths; unified truncation hint; H2 + -thinking theme bugs fixed; `make check-pythinker-code && make test-pythinker-code` green plus -the width/`NO_COLOR`/reduced-motion snapshot matrix. - ---- - -## Progress log - -### Track A — DONE (2026-05-24) -- A1 `spacing.py` created (`BLANK_ROW`, gap/padding constants, `blank_row`, `append_gap`, - `ensure_prompt_newline` typed against prompt-toolkit `StyleAndTextTuples`) + 10 unit tests. -- A2 `_live_view.py`: `_ACTION_SPACER = BLANK_ROW` (retired `Text(" ")`). -- A3 `tool_execution.py`: tinted card padding `(1,1)` → `TINTED_CARD_PADDING (0,1)` — kills the - card/stream double-gap. -- A4 `edit.py`: removed stray `Text("")` between summary and diff (now matches write.py + - edit.py's own 2nd diff path). -- A5 `prompt.py`: refactored every `endswith("\n")` guard to `ensure_prompt_newline()`; added - the spinner-verb under-gap at the `_render_agent_status` chokepoint (interactive) and in - `_live_view.compose()` gated on a non-empty status line (non-interactive). -- A6 `_dialog_shell.py` + approval/question/btw/worklog panels: section gaps via `blank_row()`, - padding via `DIALOG_PANEL_PADDING`/`WORKLOG_PANEL_PADDING`. -- A7 `markdown.py`: code-block padding → `CODE_BLOCK_PADDING`. -- A8 added under-gap behavior tests (compose under-gap present / no-double-gap when empty). -- Verification: `make check-pythinker-code` clean; `tests/ui_and_conv` + `tests/ui` = 1183 - passed; new spacing/live-view tests = 20 passed. Behavior-preserving except the intended - card-padding and under-gap changes. - -### Track B — DONE (2026-05-24), look-preserving -Decision: the two diff renderers serve different surfaces (inline boxless tool-card diff = -the screenshot; `render_diff_panel`/`_preview` = boxed approval/worklog). "One engine -everywhere" would box-ify the inline diff → rejected by user. Unified via a shared module -instead. -- New `render_constants.py`: `DIFF_CONTEXT_LINES`, `DIFF_LINE_NUMBER_MIN_WIDTH`, - `LISTING_LINE_NUMBER_MIN_WIDTH`, keymap-driven `expand_hint()` (+4 tests). -- Deduped the two `=3` context constants and `max(…,2)` widths across `components/diff.py` - and `utils/rich/diff_render.py`; listing width floor via constant in `_render_utils.py`. -- Unified the truncation hint across grep/write/diff-preview — fixes a real bug: the panel - preview hardcoded `ctrl-e` while the actual expand key is `ctrl+o`; also standardized the - `…`/`...` glyph + phrasing. -- Verified inline diff stays boxless (matches screenshot); `tests/ui_and_conv`+`ui`+`utils` - = 1526 passed; checks clean. -- Deferred (rationale): full word-diff engine merge (different internal representations — - risky, low reward); syntax-highlighting tool cards (would change the boxless look); - line-number token muted-vs-dim unification (marginal benefit, visible-change risk). - -### Track C — DONE (2026-05-24) -- C1: H2 heading color bug fixed; `link_url` dimmed vs `link`; 3 thinking-path `grey50` - literals → `tui_rich_style("thinking_text")`; widened `BulletColumns.bullet_style` to - `StyleType`. +2 regression tests. -- C2: no-op — streaming already parses only new committed slices (not whole buffer), and - `PythinkerMarkdownStream` is tested (not dead). Left intact (surgical-change discipline). -- C3: skipped — code-block line numbers would change the boxless look. - -### Track D — DONE (2026-05-24) -- Finished the `grey50` sweep in `_blocks.py` (subagent/content/bullets) → `tui_rich_style` - tokens; `_blocks.py` is now grey50-free. -- New `glyphs.py`: single source for spinner frames + interval + reduced-motion glyph; - `motion.py` and `spinner_words.py` dedup to it (same object, verified by `is`). -- `test_render_matrix.py`: width {40,80,120} x NO_COLOR x reduced-motion matrix asserting - boxless inline diff, content-stable-under-color-toggle, static glyph under reduced motion. -- `test_render_hardening.py`: guard that `_blocks.py` stays grey50-free + spinner-frame - single-source. -- Deferred (honest): whole-repo raw-color lint enforcement (too big/risky for one pass — - needs a systematic sweep first); broad `●`/`⎿`/`❯` marker migration (prompt symbols - already centralized via `PROMPT_SYMBOL_*`; adding unused constants would be speculative). - -### Final verification -`make check-pythinker-code` clean; `tests/ui_and_conv`+`ui`+`utils` = 1556 passed, 1 skipped. diff --git a/tasks/tui-text-color-standardization.md b/tasks/tui-text-color-standardization.md deleted file mode 100644 index e4453d22..00000000 --- a/tasks/tui-text-color-standardization.md +++ /dev/null @@ -1,212 +0,0 @@ -# Plan: Standardize TUI text/font colors (markdown + agent/subagent) - -**Status**: Scouting complete — awaiting design decisions before implementation. -**Created**: 2026-05-24 -**Branch**: `feat/tui-brand-rebrand` -**Scope (confirmed by user)**: Keep the coral brand. Standardize *text/font colors* — the -agent/subagent and markdown text colors are the main issue. NOT touching box-drawing -corners, status-glyph vocabulary, or pi's actual color values. - ---- - -## Reference - -Design ported from `blackbox/pi-main` (TypeScript/Rich-equivalent). Pythinker already -ported pi's **core token names** (`accent`, `border`, `tool_success_bg`, …) onto the -branded `TuiTokens` palette during this branch. Values diverge by design: -- Pythinker `accent`: **blue** (`#5EA7E8` dark / `#256EA8` light, theme.py:415) -- Pythinker coral brand: logo (`_LOGO_CORAL=#EE9983`) + verb-spinner shimmer (motion.py) -- pi `accent`: teal `#8abeb7` - -The brand is intentionally different from pi. The "coral brand" the user refers to is the -logo/shimmer coral, not the structural accent token. - ---- - -## Root cause (the "main issue") - -There are **two parallel color systems** in `src/pythinker_code/ui/theme.py`: - -1. `TuiTokens` (theme.py:368+) — the branded coral source of truth. Everything routed - through `tui_rich_style()` / `fg()` / `ShellTone` already resolves here. -2. `MarkdownColors` (theme.py:218–265) — a **separate, hardcoded** palette - (`heading=#F4F4F5`, `emphasis=#A3A3A3`, `inline_code=#AFE3F1`, …) that is **not** - derived from `TuiTokens`. - -`MarkdownColors` is consumed *only* by `components/markdown.py:_markdown_style_overrides`, -which is the render path for: -- assistant messages (`components/messages.py:67,99,144`) -- special messages — skill / compaction / branch summaries (`components/special_messages.py`) -- worklog, blocks, question panel (`visualize/_worklog.py`, `_blocks.py`, `_question_panel.py`) - -**Consequence:** markdown/prose text colors are maintained independently of the brand -tokens. Some values coincide (`link=#AFE3F1` == `info` token) but they drift separately, -and the coral brand never reaches prose. - -### Agent/subagent specifics -- `tool_renderers/agent.py` already routes its *chrome* (header, prompt preview, icons) - through tokens correctly. -- BUT subagent **result text** is rendered by `format_lines_block(... style_token="tool_output")` - (agent.py:129–134) as **flat single-color lines, not markdown**. So when a subagent - returns markdown, it shows as flat grey `tool_output` text — visually inconsistent with - how the assistant's identical markdown renders. - ---- - -## Decisions - -- [x] **D1 — Markdown accent role.** RESOLVED: **low-chrome, no coral in prose.** Keep - the current visual mapping; re-source it from `TuiTokens` for a single source of - truth. Zero visual change to prose. -- [x] **D2 — Subagent output.** RESOLVED: **keep flat (no markdown parsing)**, ensure the - color derives from the unified `tool_output` token. (agent.py:133 already passes - `style_token="tool_output"` → already token-backed. Likely a no-op; verify.) - -## ⚠️ Open observation — `accent` token is BLUE, not coral - -The structural `accent` token is **blue** (`#5EA7E8` dark / `#256EA8` light, theme.py:415). -Coral `#EE9983` appears only in the **logo** (`__init__.py:1767 _LOGO_CORAL`) and the -**verb-spinner shimmer** (`motion.py:32-38`, marked `brand-exception`). So the current -brand is *blue structural accent + coral logo/motion*. User said "we need coral brand" — -needs a decision on whether `accent` should flip to coral (separate from the markdown work). - ---- - -## Value-preserving token mapping (verified against theme.py:414-475) - -Every `MarkdownColors` field equals an existing token in BOTH themes, except -`code_block_bg` (markdown-only). So re-sourcing is a pure refactor — no visual change. - -| MarkdownColors field | Dark / Light value | Token (same value) | -|---|---|---| -| heading | #F4F4F5 / #213853 | `tool_title` | -| strong | #F4F4F5 / #213853 | `tool_title` | -| emphasis | #A3A3A3 / #666666 | `muted` | -| inline_code | #AFE3F1 / #176B7E | `info` | -| link | #AFE3F1 / #176B7E | `info` | -| quote | #A3A3A3 / #666666 | `muted` | -| table_border | #2B3A52 / #C8BEC0 | `border_muted` | -| code_block_border | #2B3A52 / #C8BEC0 | `border_muted` | -| spinner_active | #AFE3F1 / #176B7E | `info` | -| spinner_done | #7BC97F / #2C7A39 | `success` | -| spinner_failed | #EF5E62 / #C0392B | `error` | -| code_block_bg | #1f2030 / #f1f5f9 | **none** → add new `code_block_bg` token | - -## Proposed plan - -1. **Add `code_block_bg` token** to `TuiTokens` (#1f2030 dark / #f1f5f9 light) + - `TUI_TOKEN_NAMES` → `verify`: token tests pass. -2. **Rebuild `_MARKDOWN_DARK`/`_MARKDOWN_LIGHT`** to read every field from the token - palette instead of hardcoded hex (keep the `MarkdownColors` dataclass + `get_markdown_colors` - API unchanged so `markdown.py` needs no structural change) → `verify`: resolved values - byte-identical to current; `tests/ui_and_conv/` snapshots unchanged. -3. **Confirm subagent output** is token-backed (agent.py:133) → `verify`: no raw color in path. -4. **Text/copy micro-pass** (only if quick, in scope): standardize error-message - punctuation (setup.py:44 vs oauth.py:165). -5. **Run full TUI suite** + lint/format. - ---- - -## Out of scope (flagged, not doing) - -- Switching `box.ROUNDED` → pi's square corners (visible change, fights the rebrand). -- Flattening pythinker's 7 status glyphs to pi's 4 (`✓`/`✗`). -- Porting pi's fixed-hex syntax theme — pythinker's `ANSISyntaxTheme` - (`utils/rich/syntax.py`) is terminal-native and *better*; porting pi's would regress it. -- Per-thinking-level border colors (pi has 6; pythinker has the levels but one color). - Genuinely portable later, but it's a *new feature*, not text-color standardization. -- Centralizing prompt symbols (`✨`/`›`/`$`) into `glyphs.py` — tidiness, separate task. - ---- - -## Broader roadmap (from deep scan — validated 2026-05-24) - -Source: TUI deep scan analysis vs pi-main. Claims below have been validated against actual -source files. See correction notes. - -**Correction notes** (errors in the raw analysis, do not carry into work): -- Analysis claims `accent="#EE9983 (coral)"` — WRONG. Actual: `#5EA7E8` dark / `#256EA8` - light (theme.py:415). Coral is logo/shimmer only. -- Analysis claims "restart required for theme switching" — WRONG. `/theme` slash command - exists (tips.py:17); `set_active_theme()` called at runtime (__init__.py:545). -- Analysis claims "$COLORFGBG fallback" for terminal detection — UNCONFIRMED. No such code - found in source. - -### P1 — Thinking level color gradient (0.5 day, medium impact) - -Port pi's 6 thinking-level border tokens into TuiTokens. Pythinker already exposes all 6 -levels (off/minimal/low/medium/high/xhigh) in the ThinkingLevel literal (selectors/thinking.py:7) -but has only one `thinking_text` color. pi maps levels to a gradient from muted→vivid. - -Proposed tokens to add to TuiTokens: -``` -thinking_off: "" # no border tint (terminal default) -thinking_minimal: "#6e6e6e" # subtle -thinking_low: "#5f87af" # blue-grey -thinking_medium: "#81a2be" # blue -thinking_high: "#b294bb" # purple -thinking_xhigh: "#d183e8" # vivid purple -``` -Use in the thinking block renderer to color the border by active level. - -### P1 — Missing markdown tokens (0.5 day, medium impact) - -Add to MarkdownColors (and route through TuiTokens after Phase 0): -- `md_quote_border` — separate from quote text color -- `md_hr` — horizontal rule color -- `md_list_bullet` — bullet/number color (currently reuses `quote`) -- `md_link_url` — URL part of links (currently same as link text) - -### P2 — Syntax highlighting token system (2–3 days, medium impact) - -Create a bridge that maps TuiTokens syntax fields → Rich/pygments ANSISyntaxTheme, -replacing the current standalone PYTHINKER_ANSI_THEME with one that respects the active -brand palette. Tokens to add: `syntax_comment`, `syntax_keyword`, `syntax_function`, -`syntax_variable`, `syntax_string`, `syntax_number`, `syntax_type`, `syntax_operator`, -`syntax_punctuation`. - -Note: pythinker's current ANSISyntaxTheme is *terminal-native* (uses ANSI color names, -not hex) which is actually the right approach. A bridge that passes through the terminal -palette is preferable to pi's hard-coded VS-Code hex approach. Preserving terminal- -adaptability is a feature. - -### P2 — Terminal background auto-detection (1 day, nice-to-have) - -Auto-select dark/light theme on startup by detecting terminal background color. pi uses -OSC 11 query → $COLORFGBG env var → luminance analysis. Fallback chain should be: -OSC 11 → COLORFGBG → TERM heuristics → "dark". - -Note: pythinker currently has no terminal background detection (no COLORFGBG code found). - -### P3 — JSON theme file system (3–4 days, high architectural impact) - -Externalize TuiTokens + MarkdownColors + DiffColors as JSON files with a `vars` section -for color aliasing. Enable user custom themes in `~/.pythinker/themes/`. This is the -highest-leverage infrastructure investment — it unlocks runtime switching, hot-reload, -and user customization without code changes. - -Implementation shape: -1. `ui/theme_loader.py` — JSON parser with var indirection + circular detection + schema validation -2. Built-in `dark.json`/`light.json` as package data (ship with pythinker-code) -3. User override search path: `~/.pythinker/themes/.json` -4. `set_active_theme()` gains `theme_name` param; resolves from loader - -Note: The `/theme` command already exists for dark/light switching (tips.py:17). This -phase extends it to arbitrary named themes. - -### P3 — Component-level theme factories (1–2 days, architecture) - -Add bridge factories matching pi's `getMarkdownTheme()` / `getEditorTheme()` pattern: -```python -def get_markdown_component_theme() -> MarkdownComponentTheme: ... -def get_prompt_component_theme() -> PromptComponentTheme: ... -def get_tool_render_theme() -> ToolRenderTheme: ... -``` -Reduces consumers needing to know token names directly. Lower priority — current -`tui_rich_style()` API is workable. - ---- - -## Review section - -_(to be filled after implementation)_ diff --git a/tests/core/test_compaction_restore.py b/tests/core/test_compaction_restore.py index 9d1ab764..dfe4c47f 100644 --- a/tests/core/test_compaction_restore.py +++ b/tests/core/test_compaction_restore.py @@ -177,3 +177,47 @@ def _capture_wire(msg): session_start_call = soul._hook_engine.trigger.await_args_list[2] # pyright: ignore[reportPrivateUsage] assert session_start_call.args[0] == "SessionStart" assert session_start_call.kwargs["matcher_value"] == "compact" + + +@pytest.mark.asyncio +async def test_compact_context_emits_end_when_compaction_fails( + runtime: Runtime, + tmp_path: Path, +) -> None: + agent = Agent( + name="Test Agent", + system_prompt="Test system prompt.", + toolset=EmptyToolset(), + runtime=runtime, + ) + context = Context(file_backend=tmp_path / "history-failure.jsonl") + soul = PythinkerSoul(agent, context=context) + runtime.session.state.active_skills = [] + + await context.append_message(Message(role="user", content=[TextPart(text="compact me")])) + + soul._run_with_connection_recovery = AsyncMock( # pyright: ignore[reportPrivateUsage] + side_effect=RuntimeError("LLM 5xx") + ) + soul._hook_engine.trigger = AsyncMock(return_value=[]) # pyright: ignore[reportPrivateUsage] + + sent: list[str] = [] + + def _capture_wire(msg): + sent.append(type(msg).__name__) + + with ( + patch("pythinker_code.soul.pythinkersoul.wire_send", _capture_wire), + patch("pythinker_code.telemetry.track") as track, + pytest.raises(RuntimeError, match="LLM 5xx"), + ): + await soul.compact_context() + + assert sent.count("CompactionBegin") == 1 + assert sent.count("CompactionEnd") == 1 + track.assert_called_once_with( + "compaction_triggered", + trigger_type="auto", + before_tokens=context.token_count, + success=False, + ) diff --git a/tests/core/test_memory_phase_bcd.py b/tests/core/test_memory_phase_bcd.py index 4b949d43..709b8909 100644 --- a/tests/core/test_memory_phase_bcd.py +++ b/tests/core/test_memory_phase_bcd.py @@ -6,9 +6,15 @@ from pythinker_core.message import Message, TextPart from pythinker_host.path import HostPath +from pythinker_code.memory.consolidation import generate_inbox_candidates from pythinker_code.memory.harvest import CompactionHarvester from pythinker_code.memory.recap import build_session_recap, content_hash -from pythinker_code.memory.retriever import RankedBlock, RecallQuery, estimate_tokens +from pythinker_code.memory.retriever import ( + LexicalRetriever, + RankedBlock, + RecallQuery, + estimate_tokens, +) from pythinker_code.memory.retriever_sqlite import SqliteFts5Retriever from pythinker_code.project_memory import ProjectMemoryStore from pythinker_code.session_state import SessionState, TodoItemState @@ -83,20 +89,72 @@ def test_compaction_harvester_extracts_safe_assistant_notes(): ] -async def test_sqlite_retriever_falls_back_to_lexical(): - block = RankedBlock( +def _ranked(content: str, *, title: str = "retriever") -> RankedBlock: + return RankedBlock( tier="memory", source_path="MEMORY.md", source_id=None, session_id=None, - title="retriever", + title=title, labels=(), files=(), created_at_epoch=time.time(), - token_estimate=estimate_tokens("lexical sqlite fallback"), + token_estimate=estimate_tokens(content), score=0.0, - content="lexical sqlite fallback", + content=content, + ) + + +async def test_lexical_retriever_matches_unicode_terms(): + out = await LexicalRetriever( + [_ranked("решение использовать кеш"), _ranked("english only")] + ).retrieve(RecallQuery(text="решение"), budget_tokens=100) + assert out and out[0].content == "решение использовать кеш" + + +async def test_lexical_retriever_uses_query_labels_as_terms(): + out = await LexicalRetriever([_ranked("lexical sqlite fallback")]).retrieve( + RecallQuery(labels=("sqlite fallback",)), budget_tokens=100 + ) + assert out and out[0].content == "lexical sqlite fallback" + + +async def test_generate_inbox_candidates_skips_approved_scratch_note(tmp_path, monkeypatch): + monkeypatch.setenv("PYTHINKER_SHARE_DIR", str(tmp_path / "share")) + repo = tmp_path / "repo" + scratch = repo / ".pythinker" / "scratch" + scratch.mkdir(parents=True) + (scratch / "notes.md").write_text( + "### decision — 2026-05-30 10:00\n\nUse lexical recall for project memory.", + encoding="utf-8", + ) + store = ProjectMemoryStore(_hp(repo)) + assert (await store.add("memory", "Use lexical recall for project memory.")).ok is True + + assert await generate_inbox_candidates(store, _hp(repo)) == [] + + +async def test_generate_inbox_candidates_ignores_corrupt_duplicate_file(tmp_path, monkeypatch): + monkeypatch.setenv("PYTHINKER_SHARE_DIR", str(tmp_path / "share")) + repo = tmp_path / "repo" + scratch = repo / ".pythinker" / "scratch" + scratch.mkdir(parents=True) + (scratch / "notes.md").write_text( + "### decision — 2026-05-30 10:00\n\nKeep corrupt inbox files from crashing harvest.", + encoding="utf-8", ) + store = ProjectMemoryStore(_hp(repo)) + first = await generate_inbox_candidates(store, _hp(repo)) + assert len(first) == 1 + root = await store._ensure_dir() # pyright: ignore[reportPrivateUsage] + inbox = root / "memory" / "inbox" + (inbox / f"{first[0].id}.json").write_text("{not json", encoding="utf-8") + + assert await generate_inbox_candidates(store, _hp(repo)) == [] + + +async def test_sqlite_retriever_falls_back_to_lexical(): + block = _ranked("lexical sqlite fallback") out = await SqliteFts5Retriever([block], fts5_available=False).retrieve( RecallQuery(text="sqlite fallback"), budget_tokens=100 ) diff --git a/tests/tools/test_str_replace_file.py b/tests/tools/test_str_replace_file.py index fb95f70f..e9fa8582 100644 --- a/tests/tools/test_str_replace_file.py +++ b/tests/tools/test_str_replace_file.py @@ -282,10 +282,8 @@ async def test_replace_mixed_multiple_edits( Params( path=str(file_path), edit=[ - Edit(old="apple", new="fruit", replace_all=False), # Only first occurrence - Edit( - old="banana", new="tasty", replace_all=True - ), # All occurrences (though only one) + Edit(old="apple apple", new="fruit apple", replace_all=False), + Edit(old="banana", new="tasty", replace_all=True), ], ) ) @@ -295,6 +293,111 @@ async def test_replace_mixed_multiple_edits( assert await file_path.read_text() == "fruit apple tasty apple cherry" +async def test_multi_edit_reports_missing_old_string( + str_replace_file_tool: StrReplaceFile, temp_work_dir: HostPath +): + """A non-matching edit in a multi-edit list must error, not silently no-op. + + Regression: previously the file was written with only the matching edits + applied and the missing edit was swallowed because the no-change check ran + once over the whole batch. + """ + file_path = temp_work_dir / "test.txt" + original_content = "apple banana cherry" + await file_path.write_text(original_content) + + result = await str_replace_file_tool( + Params( + path=str(file_path), + edit=[ + Edit(old="apple", new="fruit"), + Edit(old="not-present", new="x"), + ], + ) + ) + + assert result.is_error + assert "No replacements were made" in result.message + assert "not-present" in result.message + # The file must be left untouched when any edit fails to match. + assert await file_path.read_text() == original_content + + +async def test_multi_edit_count_handles_chained_edits( + str_replace_file_tool: StrReplaceFile, temp_work_dir: HostPath +): + """Replacement count is tallied against progressively-edited content. + + The second edit targets text produced by the first; the success message + must still count it (the old code counted against the original content and + reported zero for chained edits). + """ + file_path = temp_work_dir / "test.txt" + await file_path.write_text("alpha") + + result = await str_replace_file_tool( + Params( + path=str(file_path), + edit=[ + Edit(old="alpha", new="beta"), + Edit(old="beta", new="gamma"), + ], + ) + ) + + assert not result.is_error + assert "successfully edited" in result.message + assert "2 total replacement(s)" in result.message + assert await file_path.read_text() == "gamma" + + +async def test_single_edit_requires_unique_old_string( + str_replace_file_tool: StrReplaceFile, temp_work_dir: HostPath +): + """A non-replace_all edit must not silently choose the first match.""" + file_path = temp_work_dir / "test.txt" + original_content = "apple banana apple" + await file_path.write_text(original_content) + + result = await str_replace_file_tool( + Params(path=str(file_path), edit=Edit(old="apple", new="fruit")) + ) + + assert result.is_error + assert "occurs 2 times" in result.message + assert "replace_all=true" in result.message + assert await file_path.read_text() == original_content + + +async def test_replace_rejects_empty_old_string( + str_replace_file_tool: StrReplaceFile, temp_work_dir: HostPath +): + """An empty old string would otherwise insert text instead of replacing.""" + file_path = temp_work_dir / "test.txt" + original_content = "Hello world!" + await file_path.write_text(original_content) + + result = await str_replace_file_tool( + Params(path=str(file_path), edit=Edit(old="", new="prefix")) + ) + + assert result.is_error + assert "old string cannot be empty" in result.message + assert await file_path.read_text() == original_content + + +async def test_replace_rejects_empty_edit_list( + str_replace_file_tool: StrReplaceFile, temp_work_dir: HostPath +): + file_path = temp_work_dir / "test.txt" + await file_path.write_text("Hello world!") + + result = await str_replace_file_tool(Params(path=str(file_path), edit=[])) + + assert result.is_error + assert "At least one edit" in result.message + + async def test_replace_empty_strings( str_replace_file_tool: StrReplaceFile, temp_work_dir: HostPath ): diff --git a/tests/tools/test_tool_descriptions.py b/tests/tools/test_tool_descriptions.py index fedd900c..cf048f24 100644 --- a/tests/tools/test_tool_descriptions.py +++ b/tests/tools/test_tool_descriptions.py @@ -367,6 +367,7 @@ def test_str_replace_file_description(str_replace_file_tool: StrReplaceFile): - Only use this tool on text files. - Multi-line strings are supported. - Can specify a single edit or a list of edits in one call. +- Unless `replace_all` is true, the old string must match exactly once; add surrounding context if it is ambiguous. - You should prefer this tool over WriteFile tool and Shell `sed` command. """ ) diff --git a/tests/ui_and_conv/test_shell_export_import_commands.py b/tests/ui_and_conv/test_shell_export_import_commands.py index a8c3cf53..030c5703 100644 --- a/tests/ui_and_conv/test_shell_export_import_commands.py +++ b/tests/ui_and_conv/test_shell_export_import_commands.py @@ -1,10 +1,12 @@ from __future__ import annotations +from io import StringIO from pathlib import Path from types import SimpleNamespace from unittest.mock import AsyncMock, Mock from pythinker_core.message import Message +from rich.console import Console from pythinker_code.session import Session from pythinker_code.ui.shell import export_import as shell_export_import @@ -46,6 +48,25 @@ async def test_export_writes_markdown_file(tmp_path: Path) -> None: assert "Hi!" in content +async def test_export_escapes_bracketed_output_path(tmp_path: Path, monkeypatch) -> None: + app = _make_shell_app(tmp_path) + app.soul.context.history = [Message(role="user", content=[TextPart(text="Hello")])] + output_dir = tmp_path / "[proj]" + output_dir.mkdir() + output = output_dir / "session.md" + + printed = StringIO() + monkeypatch.setattr( + shell_export_import, + "console", + Console(file=printed, force_terminal=False, color_system=None), + ) + + await shell_export_import.export(app, str(output)) # type: ignore[reportGeneralTypeIssues] + + assert "[proj]" in printed.getvalue() + + async def test_import_from_file_appends_message_and_wire_markers(tmp_path: Path) -> None: app = _make_shell_app(tmp_path) source_file = tmp_path / "source.md"