diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1aff7ee8..cfde48c7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,11 @@ GitHub Releases page; `0.8.0` is the new starting line.
 
 ## Unreleased
 
+- **StrReplaceFile now refuses ambiguous single replacements.** A non-`replace_all` edit now
+  errors when `old` matches more than once instead of silently editing the first match. Add
+  surrounding context to make the old string unique, or pass `replace_all=true` when every
+  occurrence should change.
+
 ## 0.26.0 (2026-05-30)
 
 ### What changed in this release
diff --git a/src/pythinker_code/background/manager.py b/src/pythinker_code/background/manager.py
index 7c1e74d9..6732e4cc 100644
--- a/src/pythinker_code/background/manager.py
+++ b/src/pythinker_code/background/manager.py
@@ -262,15 +262,19 @@ def create_bash_task(
             self._store.write_runtime(task_id, runtime)
             raise
 
-        runtime = self._store.read_runtime(task_id)
-        if runtime.finished_at is None and (
-            runtime.status == "created"
-            or (runtime.status == "starting" and runtime.worker_pid is None)
-        ):
+        def mark_worker_started(runtime: TaskRuntime) -> bool:
+            if runtime.finished_at is not None:
+                return False
+            if runtime.status != "created" and not (
+                runtime.status == "starting" and runtime.worker_pid is None
+            ):
+                return False
             runtime.status = "starting"
             runtime.worker_pid = worker_pid
             runtime.updated_at = time.time()
-            self._store.write_runtime(task_id, runtime)
+            return True
+
+        self._store.update_runtime(task_id, mark_worker_started)
         view = self._store.merged_view(task_id)
         self._journal_task_milestone("background task started", view)
         return view
diff --git a/src/pythinker_code/background/store.py b/src/pythinker_code/background/store.py
index cf4397e5..d628713a 100644
--- a/src/pythinker_code/background/store.py
+++ b/src/pythinker_code/background/store.py
@@ -6,6 +6,7 @@
 import re
 import shutil
 import time
+from collections.abc import Callable
 from contextlib import contextmanager
 from pathlib import Path
 
@@ -139,6 +140,18 @@ def write_runtime(self, task_id: str, runtime: TaskRuntime) -> None:
         with self._runtime_lock(task_id):
             self._write_runtime_unlocked(task_id, runtime)
 
+    def update_runtime(self, task_id: str, update: Callable[[TaskRuntime], bool]) -> TaskRuntime:
+        """Apply a read-modify-write update under the runtime lock.
+
+        ``update`` mutates the current runtime and returns true when it should
+        be written back. The current runtime is returned either way.
+        """
+        with self._runtime_lock(task_id):
+            runtime = self.read_runtime(task_id)
+            if update(runtime):
+                self._write_runtime_unlocked(task_id, runtime)
+            return runtime
+
     def _write_runtime_unlocked(self, task_id: str, runtime: TaskRuntime) -> None:
         """Write runtime without acquiring the per-task lock (caller holds it)."""
         path = self.runtime_path(task_id)
diff --git a/src/pythinker_code/background/worker.py b/src/pythinker_code/background/worker.py
index bac930b7..62f80fa0 100644
--- a/src/pythinker_code/background/worker.py
+++ b/src/pythinker_code/background/worker.py
@@ -12,7 +12,7 @@
 from pythinker_code.utils.logging import logger
 from pythinker_code.utils.subprocess_env import get_clean_env
 
-from .models import TaskControl
+from .models import TaskControl, TaskRuntime
 from .store import BackgroundTaskStore
 
 
@@ -112,9 +112,9 @@ async def _terminate_process(force: bool = False) -> None:
     async def _check_output_limit() -> None:
         """Terminate the task if its output.log grew past ``max_output_bytes``.
 
-        Writes a single marker line and records a failure the first time the
-        limit is hit; the ``output_limit_exceeded`` guard keeps it from
-        re-marking on subsequent polls or once the process is already exiting.
+        Writes a single marker line and asks the process to terminate the first
+        time the limit is hit; the final runtime write records the failure once
+        the process has exited.
         """
         nonlocal output_limit_exceeded, output_limit_reason
         if max_output_bytes <= 0 or output_limit_exceeded:
@@ -133,14 +133,6 @@ async def _check_output_limit() -> None:
         marker = f"\n... output limit exceeded ({size} bytes); task terminated ...\n"
         with contextlib.suppress(OSError), output_path.open("ab") as marker_file:
             marker_file.write(marker.encode("utf-8"))
-        with store._runtime_lock(task_id):  # pyright: ignore[reportPrivateUsage]
-            current = store.read_runtime(task_id)
-            if not current.finished_at:
-                current.status = "failed"
-                current.interrupted = True
-                current.failure_reason = output_limit_reason
-                current.updated_at = time.time()
-                store._write_runtime_unlocked(task_id, current)  # pyright: ignore[reportPrivateUsage]
         await _terminate_process(force=False)
 
     async def _control_loop() -> None:
@@ -211,7 +203,6 @@ async def _input_loop() -> None:
             runtime.updated_at = time.time()
             runtime.heartbeat_at = runtime.updated_at
             store.write_runtime(task_id, runtime)
-            last_known_runtime = runtime
 
             heartbeat_task = asyncio.create_task(_heartbeat_loop())
             control_task = asyncio.create_task(_control_loop())
@@ -254,29 +245,32 @@ async def _input_loop() -> None:
                 with contextlib.suppress(asyncio.CancelledError):
                     await task
 
-    runtime = last_known_runtime.model_copy()
     control = store.read_control(task_id)
-    runtime.finished_at = time.time()
-    runtime.updated_at = runtime.finished_at
-    runtime.exit_code = returncode
-    runtime.heartbeat_at = runtime.finished_at
-    if output_limit_exceeded:
-        runtime.status = "failed"
-        runtime.interrupted = True
-        runtime.failure_reason = output_limit_reason
-    elif timed_out:
-        runtime.status = "failed"
-        runtime.interrupted = True
-        runtime.timed_out = True
-        runtime.failure_reason = timeout_reason
-    elif control.kill_requested_at is not None:
-        runtime.status = "killed"
-        runtime.interrupted = True
-        runtime.failure_reason = control.kill_reason or "Killed"
-    elif returncode == 0:
-        runtime.status = "completed"
-        runtime.failure_reason = None
-    else:
-        runtime.status = "failed"
-        runtime.failure_reason = f"Command failed with exit code {returncode}"
-    store.write_runtime(task_id, runtime)
+
+    def finish_runtime(runtime: TaskRuntime) -> bool:
+        runtime.finished_at = time.time()
+        runtime.updated_at = runtime.finished_at
+        runtime.exit_code = returncode
+        runtime.heartbeat_at = runtime.finished_at
+        if output_limit_exceeded:
+            runtime.status = "failed"
+            runtime.interrupted = True
+            runtime.failure_reason = output_limit_reason
+        elif timed_out:
+            runtime.status = "failed"
+            runtime.interrupted = True
+            runtime.timed_out = True
+            runtime.failure_reason = timeout_reason
+        elif control.kill_requested_at is not None:
+            runtime.status = "killed"
+            runtime.interrupted = True
+            runtime.failure_reason = control.kill_reason or "Killed"
+        elif returncode == 0:
+            runtime.status = "completed"
+            runtime.failure_reason = None
+        else:
+            runtime.status = "failed"
+            runtime.failure_reason = f"Command failed with exit code {returncode}"
+        return True
+
+    store.update_runtime(task_id, finish_runtime)
diff --git a/src/pythinker_code/memory/consolidation.py b/src/pythinker_code/memory/consolidation.py
index 380b2813..9381392e 100644
--- a/src/pythinker_code/memory/consolidation.py
+++ b/src/pythinker_code/memory/consolidation.py
@@ -28,6 +28,10 @@ def _safe_id(value: str) -> str:
     return re.sub(r"[^a-z0-9_-]", "-", value.lower())[:32].strip("-") or "candidate"
 
 
+def _memory_entry_hash(content: str) -> str:
+    return content_hash(tier="memory", title=content[:60], body=content)
+
+
 async def inbox_dir(store: ProjectMemoryStore) -> Path:
     root = await store._ensure_dir()  # pyright: ignore[reportPrivateUsage]
     path = root / "memory" / "inbox"
@@ -53,16 +57,16 @@ async def generate_inbox_candidates(
 ) -> list[InboxCandidate]:
     """Stage scratch/journal candidates for approval-gated durable memory consolidation."""
     existing_entries = [*await store.read_entries("memory"), *await store.read_entries("user")]
-    existing_hashes = {
-        content_hash(tier="memory", title=entry[:60], body=entry) for entry in existing_entries
-    }
-    staged = {candidate.content_hash for candidate in await list_inbox_candidates(store)}
+    existing_hashes = {_memory_entry_hash(entry) for entry in existing_entries}
+    inbox_candidates = await list_inbox_candidates(store)
+    staged = {candidate.content_hash for candidate in inbox_candidates}
+    staged.update(_memory_entry_hash(candidate.content) for candidate in inbox_candidates)
     directory = await inbox_dir(store)
     candidates: list[InboxCandidate] = []
     for block in await gather_candidates(store, work_dir):
         if block.tier in {"memory", "user"}:
             continue
-        digest = content_hash(tier="memory", title=block.title, body=block.content)
+        digest = _memory_entry_hash(block.content)
         if digest in existing_hashes or digest in staged:
             continue
         candidate = InboxCandidate(
@@ -74,7 +78,14 @@ async def generate_inbox_candidates(
             content_hash=digest,
         )
         path = directory / f"{candidate.id}.json"
-        fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600)
+        try:
+            fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600)
+        except FileExistsError:
+            # A file with this id already exists on disk but was absent from
+            # ``staged`` (e.g. it is corrupt and was skipped during listing).
+            # Treat it as already staged instead of crashing the whole harvest.
+            staged.add(digest)
+            continue
         with os.fdopen(fd, "w", encoding="utf-8") as fh:
             json.dump(asdict(candidate), fh, ensure_ascii=False, indent=2)
         candidates.append(candidate)
diff --git a/src/pythinker_code/memory/retriever.py b/src/pythinker_code/memory/retriever.py
index aaef3dbc..c8f8fedd 100644
--- a/src/pythinker_code/memory/retriever.py
+++ b/src/pythinker_code/memory/retriever.py
@@ -3,14 +3,15 @@
 import math
 import re
 import time
+import unicodedata
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, replace
 
-_TOKEN_RE = re.compile(r"[a-z0-9]{2,}")
+# Unicode word runs, with underscores treated as separators for snake_case.
+_TOKEN_RE = re.compile(r"[^\W_]{2,}", re.UNICODE)
 _RECENCY_HALF_LIFE_DAYS = 14.0
 _BM25_K1 = 1.5
 _BM25_B = 0.75
-_LABEL_BOOST = 0.5
 _PATH_BOOST = 0.5
 
 
@@ -19,7 +20,8 @@ def estimate_tokens(text: str) -> int:
 
 
 def _tokenize(text: str) -> list[str]:
-    return _TOKEN_RE.findall(text.lower())
+    normalized = unicodedata.normalize("NFKC", text).casefold()
+    return _TOKEN_RE.findall(normalized)
 
 
 @dataclass(frozen=True, slots=True)
@@ -67,7 +69,7 @@ async def retrieve(self, query: RecallQuery, budget_tokens: int) -> list[RankedB
             for term in set(doc):
                 df[term] = df.get(term, 0) + 1
 
-        q_terms = _tokenize(query.text)
+        q_terms = _tokenize(" ".join((query.text, *query.labels)))
         scored: list[RankedBlock] = []
         for cand, doc in zip(self._candidates, docs, strict=True):
             dl = len(doc) or 1
@@ -91,8 +93,6 @@ async def retrieve(self, query: RecallQuery, budget_tokens: int) -> list[RankedB
             boost = 0.0
             if any(path in cand.files for path in query.paths):
                 boost += _PATH_BOOST
-            if set(query.labels) & set(cand.labels):
-                boost += _LABEL_BOOST
             if not q_terms and not query.paths and not query.labels:
                 boost += 0.01
             scored.append(replace(cand, score=bm25 * decay + boost))
diff --git a/src/pythinker_code/soul/pythinkersoul.py b/src/pythinker_code/soul/pythinkersoul.py
index f0a25493..41c01b29 100644
--- a/src/pythinker_code/soul/pythinkersoul.py
+++ b/src/pythinker_code/soul/pythinkersoul.py
@@ -1616,6 +1616,70 @@ async def _compact_with_retry() -> CompactionResult:
             compaction_result = await _compact_with_retry()
             if not compaction_result.messages:
                 raise RuntimeError("Compaction produced no messages; preserving existing history")
+            await self._context.clear()
+            await self._context.write_system_prompt(self._agent.system_prompt)
+            await self._checkpoint()
+            await self._context.append_message(compaction_result.messages)
+            estimated_token_count = compaction_result.estimated_token_count
+            summary_text = compact_summary_text(compaction_result.messages)
+
+            if restore_context.messages:
+                await self._context.append_message(restore_context.messages)
+                estimated_token_count += estimate_text_tokens(restore_context.messages)
+
+            if self._runtime.role == "root":
+                active_task_snapshot = build_active_task_snapshot(self._runtime.background_tasks)
+                if active_task_snapshot is not None:
+                    active_task_message = Message(
+                        role="user",
+                        content=[
+                            system(
+                                "The following background tasks are still active after compaction. "
+                                "Use TaskList if you need to re-enumerate them later."
+                            ),
+                            TextPart(text=active_task_snapshot),
+                        ],
+                    )
+                    await self._context.append_message(active_task_message)
+                    estimated_token_count += estimate_text_tokens([active_task_message])
+
+            post_compact_results = await self._hook_engine.trigger(
+                "PostCompact",
+                matcher_value=trigger_reason,
+                input_data=events.post_compact(
+                    session_id=self._runtime.session.id,
+                    cwd=str(Path.cwd()),
+                    trigger=trigger_reason,
+                    estimated_token_count=estimated_token_count,
+                    compact_summary=summary_text,
+                ),
+            )
+            session_start_results = await self._hook_engine.trigger(
+                "SessionStart",
+                matcher_value="compact",
+                input_data=events.session_start(
+                    session_id=self._runtime.session.id,
+                    cwd=str(Path.cwd()),
+                    source="compact",
+                ),
+            )
+            hook_context_message = build_hook_context_message(
+                result.additional_context
+                for result in [*post_compact_results, *session_start_results]
+            )
+            if hook_context_message is not None:
+                await self._context.append_message(hook_context_message)
+                estimated_token_count += estimate_text_tokens([hook_context_message])
+
+            # Estimate token count so context_usage is not reported as 0%
+            await self._context.update_token_count(estimated_token_count)
+
+            # Notify dynamic injection providers that history has been rebuilt so
+            # they can reset any one-shot throttling state. Failures are isolated
+            # per-provider so compaction completion (wire event + telemetry) is
+            # not affected by a buggy provider.
+            await self._notify_injection_providers_compacted()
+
         except Exception:
             from pythinker_code.telemetry import track
 
@@ -1626,70 +1690,11 @@ async def _compact_with_retry() -> CompactionResult:
                 success=False,
             )
             raise
-        await self._context.clear()
-        await self._context.write_system_prompt(self._agent.system_prompt)
-        await self._checkpoint()
-        await self._context.append_message(compaction_result.messages)
-        estimated_token_count = compaction_result.estimated_token_count
-        summary_text = compact_summary_text(compaction_result.messages)
-
-        if restore_context.messages:
-            await self._context.append_message(restore_context.messages)
-            estimated_token_count += estimate_text_tokens(restore_context.messages)
-
-        if self._runtime.role == "root":
-            active_task_snapshot = build_active_task_snapshot(self._runtime.background_tasks)
-            if active_task_snapshot is not None:
-                active_task_message = Message(
-                    role="user",
-                    content=[
-                        system(
-                            "The following background tasks are still active after compaction. "
-                            "Use TaskList if you need to re-enumerate them later."
-                        ),
-                        TextPart(text=active_task_snapshot),
-                    ],
-                )
-                await self._context.append_message(active_task_message)
-                estimated_token_count += estimate_text_tokens([active_task_message])
-
-        post_compact_results = await self._hook_engine.trigger(
-            "PostCompact",
-            matcher_value=trigger_reason,
-            input_data=events.post_compact(
-                session_id=self._runtime.session.id,
-                cwd=str(Path.cwd()),
-                trigger=trigger_reason,
-                estimated_token_count=estimated_token_count,
-                compact_summary=summary_text,
-            ),
-        )
-        session_start_results = await self._hook_engine.trigger(
-            "SessionStart",
-            matcher_value="compact",
-            input_data=events.session_start(
-                session_id=self._runtime.session.id,
-                cwd=str(Path.cwd()),
-                source="compact",
-            ),
-        )
-        hook_context_message = build_hook_context_message(
-            result.additional_context for result in [*post_compact_results, *session_start_results]
-        )
-        if hook_context_message is not None:
-            await self._context.append_message(hook_context_message)
-            estimated_token_count += estimate_text_tokens([hook_context_message])
-
-        # Estimate token count so context_usage is not reported as 0%
-        await self._context.update_token_count(estimated_token_count)
-
-        # Notify dynamic injection providers that history has been rebuilt so
-        # they can reset any one-shot throttling state. Failures are isolated
-        # per-provider so compaction completion (wire event + telemetry) is
-        # not affected by a buggy provider.
-        await self._notify_injection_providers_compacted()
+        finally:
+            # Always close the wire pair, even if the LLM call, context rewrite,
+            # hooks, or injection-provider notifications fail.
+            wire_send(CompactionEnd())
 
-        wire_send(CompactionEnd())
         wire_send(TextPart(text=restore_context.display_text()))
 
         from pythinker_code.telemetry import track
diff --git a/src/pythinker_code/soul/toolset.py b/src/pythinker_code/soul/toolset.py
index 890e55fc..1c60543a 100644
--- a/src/pythinker_code/soul/toolset.py
+++ b/src/pythinker_code/soul/toolset.py
@@ -525,8 +525,6 @@ async def load_mcp_tools(
         import fastmcp
         from fastmcp.mcp_config import MCPConfig, RemoteMCPServer
 
-        from pythinker_code.ui.shell.prompt import toast
-
         async def _check_oauth_tokens(server_url: str) -> bool:
             """Check if OAuth tokens exist for the server."""
             try:
@@ -545,6 +543,8 @@ async def _check_oauth_tokens(server_url: str) -> bool:
 
         def _toast_mcp(message: str) -> None:
             if in_background:
+                from pythinker_code.ui.shell.prompt import toast
+
                 toast(
                     message,
                     duration=10.0,
@@ -611,12 +611,6 @@ async def _connect():
             results = await asyncio.gather(*tasks) if tasks else []
             failed_servers = {name: error for name, error in results if error is not None}
 
-            for mcp_config in mcp_configs:
-                # Skip empty MCP configs (no servers defined)
-                if not mcp_config.mcpServers:
-                    logger.debug("Skipping empty MCP config: {mcp_config}", mcp_config=mcp_config)
-                    continue
-
             if failed_servers:
                 _toast_mcp("mcp connection failed")
                 raise MCPRuntimeError(f"Failed to connect MCP servers: {failed_servers}")
@@ -640,6 +634,9 @@ async def _connect():
                     status="pending", client=client, tools=[]
                 )
 
+        if not any(server_info.status == "pending" for server_info in self._mcp_servers.values()):
+            return
+
         if in_background:
             self._mcp_loading_task = asyncio.create_task(_connect())
         else:
diff --git a/src/pythinker_code/tools/file/replace.md b/src/pythinker_code/tools/file/replace.md
index fab28807..1e50810c 100644
--- a/src/pythinker_code/tools/file/replace.md
+++ b/src/pythinker_code/tools/file/replace.md
@@ -4,4 +4,5 @@ Replace specific strings within a specified file.
 - Only use this tool on text files.
 - Multi-line strings are supported.
 - Can specify a single edit or a list of edits in one call.
+- Unless `replace_all` is true, the old string must match exactly once; add surrounding context if it is ambiguous.
 - You should prefer this tool over WriteFile tool and Shell `sed` command.
diff --git a/src/pythinker_code/tools/file/replace.py b/src/pythinker_code/tools/file/replace.py
index 47713ad6..1d939776 100644
--- a/src/pythinker_code/tools/file/replace.py
+++ b/src/pythinker_code/tools/file/replace.py
@@ -204,16 +204,53 @@ async def __call__(self, params: Params) -> ToolReturnValue:
 
             original_content = content
             edits = [params.edit] if isinstance(params.edit, Edit) else params.edit
+            if not edits:
+                return ToolError(
+                    message="At least one edit is required.",
+                    brief="No edits provided",
+                )
+
+            # Validate and apply the batch in memory first. If any edit is
+            # missing or ambiguous, return before writing so the file stays
+            # unchanged.
+            per_edit_counts: list[int] = []
+            for index, edit in enumerate(edits, start=1):
+                if not edit.old:
+                    return ToolError(
+                        message=f"Edit {index}: old string cannot be empty.",
+                        brief="Empty old string",
+                    )
+                if edit.old == edit.new:
+                    return ToolError(
+                        message=f"Edit {index}: old and new strings are identical.",
+                        brief="No-op edit",
+                    )
+
+                match_count = content.count(edit.old)
+                if match_count == 0:
+                    return ToolError(
+                        message=(
+                            f"No replacements were made for edit {index}: "
+                            f"old string {edit.old!r} was not found in the file."
+                        ),
+                        brief="No replacements made",
+                    )
+                if match_count > 1 and not edit.replace_all:
+                    return ToolError(
+                        message=(
+                            f"Edit {index}: old string {edit.old!r} occurs {match_count} times. "
+                            "Add surrounding context to make it unique, or set replace_all=true."
+                        ),
+                        brief="Ambiguous replacement",
+                    )
 
-            # Apply all edits
-            for edit in edits:
+                per_edit_counts.append(match_count if edit.replace_all else 1)
                 content = self._apply_edit(content, edit)
 
-            # Check if any changes were made
             if content == original_content:
                 return ToolError(
-                    message="No replacements were made. The old string was not found in the file.",
-                    brief="No replacements made",
+                    message="Edits resulted in no file changes.",
+                    brief="No changes",
                 )
 
             diff_blocks: list[DisplayBlock] = await build_diff_blocks(
@@ -245,13 +282,8 @@ async def __call__(self, params: Params) -> ToolReturnValue:
             # Write the modified content back to the file
             await p.write_text(content, encoding="utf-8", errors="replace")
 
-            # Count changes for success message
-            total_replacements = 0
-            for edit in edits:
-                if edit.replace_all:
-                    total_replacements += original_content.count(edit.old)
-                else:
-                    total_replacements += 1 if edit.old in original_content else 0
+            # Count changes for success message (tallied per-edit during application).
+            total_replacements = sum(per_edit_counts)
 
             return ToolReturnValue(
                 is_error=False,
diff --git a/src/pythinker_code/ui/shell/__init__.py b/src/pythinker_code/ui/shell/__init__.py
index 5ab0dc43..1e76db41 100644
--- a/src/pythinker_code/ui/shell/__init__.py
+++ b/src/pythinker_code/ui/shell/__init__.py
@@ -20,6 +20,7 @@
 )
 from rich import box
 from rich.console import Group, RenderableType
+from rich.markup import escape
 from rich.panel import Panel
 from rich.table import Table
 from rich.text import Text
@@ -900,7 +901,7 @@ def _can_auto_trigger_pending() -> bool:
                         resume_prompt.set()
                         continue
                     if action.kind == InputAction.IGNORED:
-                        console.print(f"[dim]{action.args}[/dim]")
+                        console.print(f"[dim]{escape(str(action.args))}[/dim]")
                         resume_prompt.set()
                         continue
 
@@ -1055,7 +1056,9 @@ def _handler():
                 console.print(render_message_response(output))
         except Exception as e:
             logger.exception("Failed to run shell command:")
-            console.print(f"[{_get_tui_tokens().error}]Failed to run shell command: {e}[/]")
+            console.print(
+                f"[{_get_tui_tokens().error}]Failed to run shell command: {escape(str(e))}[/]"
+            )
         finally:
             remove_sigint()
 
@@ -1101,7 +1104,7 @@ async def _run_slash_command(self, command_call: SlashCommandCall) -> None:
             console.print(f"[{_get_tui_tokens().error}]Interrupted by user[/]")
         except Exception as e:
             logger.exception("Unknown error:")
-            console.print(f"[{_get_tui_tokens().error}]Unknown error: {e}[/]")
+            console.print(f"[{_get_tui_tokens().error}]Unknown error: {escape(str(e))}[/]")
             raise  # re-raise unknown error
 
     async def run_soul_command(self, user_input: str | list[ContentPart]) -> bool:
@@ -1248,7 +1251,7 @@ def _on_view_ready(view: Any) -> None:
             # actually unsupported input/mode should already be blocked by prompt session
             _t = _get_tui_tokens()
             logger.exception("LLM not supported:")
-            console.print(f"[{_t.error}]{e}[/]")
+            console.print(f"[{_t.error}]{escape(str(e))}[/]")
         except ChatProviderError as e:
             _t = _get_tui_tokens()
             logger.exception("LLM provider error:")
@@ -1342,10 +1345,10 @@ def _on_view_ready(view: Any) -> None:
                     "namespace in LM Studio's model browser — those have audited templates.[/dim]\n"
                     "[dim]  3. Or override the template manually in LM Studio: "
                     "[bold]My Models → model settings → Prompt Template[/bold].[/dim]\n"
-                    f"[dim]Server: {e}[/dim]"
+                    f"[dim]Server: {escape(str(e))}[/dim]"
                 )
             else:
-                console.print(f"[{_t.error}]LLM provider error: {e}[/]")
+                console.print(f"[{_t.error}]LLM provider error: {escape(str(e))}[/]")
             if not isinstance(e, APIStatusError) or e.status_code not in (401, 402, 403):
                 console.print(
                     "[dim]If this persists, run [bold]pythinker export[/bold] and send the "
diff --git a/src/pythinker_code/ui/shell/export_import.py b/src/pythinker_code/ui/shell/export_import.py
index e3abf0d5..8894c777 100644
--- a/src/pythinker_code/ui/shell/export_import.py
+++ b/src/pythinker_code/ui/shell/export_import.py
@@ -4,6 +4,7 @@
 from typing import TYPE_CHECKING
 
 from pythinker_host.path import HostPath
+from rich.markup import escape
 
 from pythinker_code.ui.shell.console import console
 from pythinker_code.ui.shell.slash import ensure_pythinker_soul, registry, shell_mode_registry
@@ -42,14 +43,14 @@ async def export(app: Shell, args: str):
     )
     _t = _get_tui_tokens()
     if isinstance(result, str):
-        console.print(f"[{_t.warning}]{result}[/]")
+        console.print(f"[{_t.warning}]{escape(result)}[/]")
         return
 
     output, count = result
     from pythinker_code.telemetry import track
 
     track("export")
-    display = shorten_home(HostPath(str(output)))
+    display = escape(str(shorten_home(HostPath(str(output)))))
     console.print(f"[{_t.success}]Exported {count} messages to {display}[/]")
     console.print(
         f"[{_t.warning}]Note: The exported file may contain sensitive information. "
@@ -95,7 +96,7 @@ async def import_context(app: Shell, args: str):
         max_context_size=max_context_size,
     )
     if isinstance(result, str):
-        console.print(f"[{_t.error}]{result}[/]")
+        console.print(f"[{_t.error}]{escape(result)}[/]")
         return
 
     source_desc, content_len = result
@@ -110,7 +111,7 @@ async def import_context(app: Shell, args: str):
     await soul.wire_file.append_message(TurnEnd())
 
     console.print(
-        f"[{_t.success}]Imported context from {source_desc} "
+        f"[{_t.success}]Imported context from {escape(source_desc)} "
         f"({content_len} chars) into current session.[/]"
     )
     if source_desc.startswith("file") and is_sensitive_file(Path(target).name):
diff --git a/src/pythinker_code/ui/shell/session_picker.py b/src/pythinker_code/ui/shell/session_picker.py
index 5af555c2..13a18b29 100644
--- a/src/pythinker_code/ui/shell/session_picker.py
+++ b/src/pythinker_code/ui/shell/session_picker.py
@@ -139,6 +139,11 @@ def _sync_radio_list(self) -> None:
     def _header_fragments(self) -> StyleAndTextTuples:
         scope_label = "current directory" if self._scope == "current" else "all directories"
         total = len(self._sessions)
+        if total == 0:
+            return [
+                ("class:header.title", " SESSIONS "),
+                ("class:header.meta", f" [{scope_label}] "),
+            ]
         selected = self._radio_list._selected_index + 1  # pyright: ignore[reportPrivateUsage]
         return [
             ("class:header.title", f" SESSIONS ({selected} of {total}) "),
diff --git a/src/pythinker_code/ui/shell/setup.py b/src/pythinker_code/ui/shell/setup.py
index e5b497c0..fce8fa6f 100644
--- a/src/pythinker_code/ui/shell/setup.py
+++ b/src/pythinker_code/ui/shell/setup.py
@@ -6,6 +6,7 @@
 from prompt_toolkit import PromptSession
 from prompt_toolkit.shortcuts.choice_input import ChoiceInput
 from pydantic import SecretStr
+from rich.markup import escape
 
 from pythinker_code.auth import PYTHINKER_CODE_PLATFORM_ID
 from pythinker_code.auth.platforms import (
@@ -61,8 +62,8 @@ async def setup_platform(platform: Platform) -> bool:
     _t = _get_tui_tokens()
     thinking_label = "on" if result.thinking else "off"
     console.print(f"[{_t.success}]✓ Setup complete![/]")
-    console.print(f"  Platform: [bold]{result.platform.name}[/bold]")
-    console.print(f"  Model:    [bold]{result.selected_model.id}[/bold]")
+    console.print(f"  Platform: [bold]{escape(result.platform.name)}[/bold]")
+    console.print(f"  Model:    [bold]{escape(result.selected_model.id)}[/bold]")
     console.print(f"  Thinking: [bold]{thinking_label}[/bold]")
     console.print("  Reloading...")
     return True
@@ -89,7 +90,7 @@ async def _setup_platform(platform: Platform) -> _SetupResult | None:
             models = await list_models(platform, api_key)
     except aiohttp.ClientResponseError as e:
         logger.error("Failed to get models: {error}", error=e)
-        console.print(f"[{_t.error}]Failed to get models: {e.message}[/]")
+        console.print(f"[{_t.error}]Failed to get models: {escape(e.message)}[/]")
         if e.status == 401 and platform.id != PYTHINKER_CODE_PLATFORM_ID:
             console.print(
                 f"[{_t.warning}]Hint: If your API key was obtained from Pythinker, "
@@ -98,7 +99,7 @@ async def _setup_platform(platform: Platform) -> _SetupResult | None:
         return None
     except Exception as e:
         logger.error("Failed to get models: {error}", error=e)
-        console.print(f"[{_t.error}]Failed to get models: {e}[/]")
+        console.print(f"[{_t.error}]Failed to get models: {escape(str(e))}[/]")
         return None
 
     # select the model
diff --git a/src/pythinker_code/ui/shell/task_browser.py b/src/pythinker_code/ui/shell/task_browser.py
index 52d8c88d..68abf0e4 100644
--- a/src/pythinker_code/ui/shell/task_browser.py
+++ b/src/pythinker_code/ui/shell/task_browser.py
@@ -13,6 +13,7 @@
 from prompt_toolkit.widgets import Box, Frame, RadioList
 from rich import box
 from rich.console import Group
+from rich.markup import escape
 from rich.panel import Panel
 from rich.text import Text
 
@@ -272,7 +273,7 @@ def render() -> None:
             view = self._model.view_for(task_id)
             if view is None:
                 _t = _get_tui_tokens()
-                console.print(f"[{_t.warning}]Task not found: {task_id}[/]")
+                console.print(f"[{_t.warning}]Task not found: {escape(task_id)}[/]")
                 return
             with console.pager(styles=True):
                 console.print(_build_full_output_renderable(view, self._model.full_output(task_id)))
diff --git a/src/pythinker_code/ui/shell/usage.py b/src/pythinker_code/ui/shell/usage.py
index 9d58a3ad..8431943c 100644
--- a/src/pythinker_code/ui/shell/usage.py
+++ b/src/pythinker_code/ui/shell/usage.py
@@ -5,6 +5,8 @@
 import shlex
 from typing import TYPE_CHECKING
 
+from rich.markup import escape
+
 from pythinker_code.auth.platforms import parse_managed_provider_key
 from pythinker_code.config import LLMProvider
 from pythinker_code.soul.pythinkersoul import PythinkerSoul
@@ -209,7 +211,7 @@ async def usage(app: Shell, args: str):
     try:
         tokens = shlex.split(args.strip())
     except ValueError as e:
-        console.print(f"[{_t.error}]Invalid usage arguments: {e}[/]")
+        console.print(f"[{_t.error}]Invalid usage arguments: {escape(str(e))}[/]")
         return
 
     json_mode = "--json" in tokens
diff --git a/tasks/agent-behavior-findings.md b/tasks/agent-behavior-findings.md
deleted file mode 100644
index 428dd994..00000000
--- a/tasks/agent-behavior-findings.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# Agent behaviour review — `.pythinker` + `.pythinker-review`
-
-Diagnosis from runtime artifacts (review run `20260522015318-650c6d67`, scratch
-sessions May 27–28). Evidence-based; root causes confirmed in source.
-
-## Status (applied 2026-05-28)
-- **R1 — applied** (`reviewers/common.py`): retry now relays the concrete
-  validation error + an explicit title-length nudge. Tests:
-  `test_retry_prompt_surfaces_previous_validation_error`.
-- **R2 — applied** (`reviewers/schema.py`): `title` is truncated (≤80, ellipsis)
-  via a before-validator instead of hard-failing the whole `ReviewerOutput`.
-  Tests: `test_reviewer_output_truncates_overlong_title`,
-  `test_overlong_title_is_truncated_not_dropped`.
-- **S1 — applied** (`scratchpad.py` + `cli/__init__.py`): `append_scratch_event_sync`
-  gained an idempotent `dedup_signature`; the CLI passes `source:<src>` so a
-  relaunch no longer appends a duplicate "session start". Tests:
-  `test_session_start_event_is_idempotent_per_signature`.
-- **R4 — applied** (`reviewers/prompts/*.system.md`): the `evidence_snippet`
-  placeholder now demands VERBATIM, character-for-character copying (no
-  paraphrase/ellipses) across all four reviewer prompts, so the model's output
-  passes the containment validator instead of being dropped per-finding.
-- **S2 — applied** (`scratchpad.py`): labels now collapse by key —
-  single-valued keys (`session/workspace/ui/source/scope`) keep the latest
-  value, multi-valued keys (`kind`) keep the unique set. Kills
-  `source:startup | source:resume` noise while retaining `kind:todo | kind:agent`.
-  Tests: `test_session_labels_collapse_single_valued_keys`. (Note: the original
-  delimiter-injection worry was already mitigated by `_clean_event_text`, which
-  strips `|`/`\r`/`\n`; this change addresses the residual duplicate-key noise.)
-- **R3 — deliberately NOT applied.** Gating the run when a chunk is unreviewable
-  (`malformed_output`) is *correct* fail-closed behaviour for a security tool;
-  making it non-gating would let a file pass review unreviewed. R1+R2+R4 instead
-  cut the failure *rate* so runs pass legitimately.
-- **S3 — deliberately NOT applied.** Auto-pruning scratch files conflicts with
-  the module's documented design (`scratchpad.py` header: retained as history
-  "unless the user explicitly asks for cleanup"). S1 already removes the main
-  growth driver (duplicate session-starts); broader retention is a product
-  decision for the user, not a silent change.
-
-Original findings below.
-
-## Review subsystem (the background review subagent) — highest impact
-
-### R1 [High] Retry never relays the real validation error → systematic schema misses lose the whole chunk
-`reviewers/common.py:69-92`. On a parse/validation failure the harness retries
-once, but `_RETRY_SUFFIX` (common.py:15) only says *"your response was not valid
-JSON … reply with strict JSON only."* The captured Pydantic error (`last_error`,
-common.py:87) is **never fed back to the model**. So when the failure is a
-*content* violation (e.g. a title > 80 chars — valid JSON, invalid schema), the
-retry message is actively misleading and the model has no reason to change. Both
-attempts fail → `ok=False` → **every finding in that chunk is discarded.**
-- Evidence: meta `chunk_failures` — `tests/ui_and_conv/test_prompt_tips.py`
-  failed with *4* `String should have at most 80 characters` errors; the whole
-  chunk's findings were lost.
-- Fix direction: append `last_error` to the retry prompt so the model can
-  self-correct.
-
-### R2 [High] No field-level coercion at ingest; one bad field nukes the chunk
-`reviewers/schema.py:16` `title: str = Field(max_length=80)`. `ReviewerOutput`
-is parsed all-or-nothing via `model_validate_json` (common.py:84), so a single
-over-long title (or any one out-of-range field) fails the *entire* output and
-drops all sibling findings in the chunk. Smaller models (run model: MiniMax
-M2.7) hit the 80-char cap routinely.
-- Fix direction: soft-coerce on ingest (truncate title to 80 with ellipsis)
-  instead of hard-failing; or parse findings individually so one bad finding
-  doesn't take the rest down.
-
-### R3 [Med] Model-formatting noise gates the entire run
-`engine/runner.py:163` `failed=(not allow_partial) and bool(real_failures)`, and
-`real_failures` excludes `validation_error` but **includes** `malformed_output`
-(runner.py:156). Result: 2 of 54 chunks failing on model-formatting (long title,
-truncated JSON) flipped the whole run to `status: "failed"` despite 42 valid
-findings delivered. `malformed_output` is a model-quality issue like
-`validation_error`, not a runtime failure (timeout/llm_error/worker_error).
-- Fix direction: treat `malformed_output` as non-gating (like validation_error),
-  or add a distinct "delivered with model-output gaps" status.
-
-### R4 [Low / mostly positive] Evidence-snippet validation is graceful but mismatch rate is high
-`reviewers/validation.py:_snippet_matches` already falls back rendered-diff →
-whitespace-compacted → on-disk slice → compacted file (good; these drop
-**per-finding**, not per-chunk — runner.py:101-115). But 7 findings still failed
-evidence match, i.e. the model paraphrases snippets beyond whitespace.
-- Fix direction: emphasise verbatim copying in the reviewer prompt, or add a
-  token-overlap fuzzy match as a last-resort tier.
-
-## Scratchpad / session memory (`.pythinker/scratch`)
-
-### S1 [Med] "session start" journaling is not idempotent
-`cli/__init__.py:761-781` emits a `session start` event unconditionally on every
-CLI init for the session, with no guard against an existing start block for the
-same session id.
-- Evidence: session `7f7a8039` has **4** `session start / source: startup`
-  entries in 3 min; `06ba6c38`, `7c6a9676`, `f42f6caa` each have duplicate
-  same-minute startups. Defeats the file's stated "compact" purpose.
-- Fix direction: skip if the last event for the session id is already a
-  session-start within N seconds, or only emit once per genuine create/resume.
-
-### S2 [Med] Label line: duplicate keys unmerged + unsanitised free text → parse/injection risk
-The `labels:` line is `key:value` joined by ` | `.
-- Duplicate keys are appended, not merged: `source:startup | source:resume`,
-  multiple `kind:` values (`kind:todo | kind:agent-batch | kind:agent`). Recall
-  on labels sees conflicting values.
-- The raw session title is embedded as a scope label:
-  `scope:Goal: perform a deep code scan analysis of the…` — contains a colon,
-  spaces, an ellipsis, and could contain `|` or a newline from an arbitrary
-  title, corrupting any `:`/`|` splitter.
-- Evidence: session `b241579b` labels line.
-- Fix direction: dedupe/merge by key; slugify or escape the title before using
-  it as a label value (or store title separately, not as a label).
-
-### S3 [Low] Unbounded scratch growth; no-op sessions persist boilerplate
-Most `untitled` files are 558 bytes = header + a single session-start (sessions
-that did nothing). Combined with S1, the dir grows unbounded with low-value
-files; no pruning/retention observed.
-- Fix direction: lazy-create the file on first substantive event, and/or prune
-  near-empty session files on a retention policy.
-
-## Positive — validates the enhancement direction
-Background agent-batch + todo journaling is coherent. Session `b241579b`:
-4 background agents (`review`, `security-reviewer`, `explore`, `verifier`)
-tracked start→completed, todo progressing 0→6 done, agent-type captured
-(`7c6a9676`: `agent-type:code-reviewer`, `agent started / mode: foreground`).
-The SetTodoList + background-subagent foundation being enhanced is sound; the
-gaps above are in the *journaling* and *review-output* layers, not the
-orchestration itself.
diff --git a/tasks/port-bk-box-agent-features.md b/tasks/port-bk-box-agent-features.md
deleted file mode 100644
index 07058a43..00000000
--- a/tasks/port-bk-box-agent-features.md
+++ /dev/null
@@ -1,156 +0,0 @@
-# Port bk_box_main coding-agent features
-
-## Scope
-
-Port the practical coding-agent runtime features from `blackbox/bk_box_main` into Pythinker CLI. Treat VS Code-only Agent Manager UI as reference; port only CLI/runtime primitives unless a Pythinker UI task explicitly asks for more.
-
-## Overall status
-
-**Status: complete for this porting slice.** All planned phases below have implementation coverage and focused verification. Full-suite `make test-pythinker-code` was attempted but hung in a `Pythinker` subprocess; focused port tests and `make check-pythinker-code` passed.
-
-## Phase 0 — Safety and baseline ✅ Done
-
-- [x] Preserve existing uncommitted work; avoid drive-by rewrites.
-- [x] Run focused tests before/after feature batches.
-- [x] Keep provider-specific behavior scoped to active provider/model.
-
-## Phase 1 — Agent modes and hard permission profiles ✅ Done
-
-Source references:
-- `blackbox/bk_box_main/packages/opencode/src/permission/*`
-- `blackbox/bk_box_main/packages/opencode/src/tool/plan.ts`
-
-Pythinker targets:
-- `src/pythinker_code/agents/default/*.yaml`
-- `src/pythinker_code/soul/approval.py`
-- `src/pythinker_code/approval_runtime/`
-- `src/pythinker_code/tools/file/plan_mode.py`
-
-Work:
-- [x] Add explicit runtime-enforced read-only, plan, ask, implement, review, and verify permission profiles.
-- [x] Make plan mode hard-deny non-plan writes and dangerous shell mutations, not just prompt-deny.
-- [x] Add tests for denied shell/write/edit paths per mode.
-
-Acceptance:
-- [x] Plan/review/explore/verifier cannot mutate files through file tools or shell.
-- [x] Implementer/coder retain write tools under normal approvals.
-
-## Phase 2 — Plan handoff workflow ✅ Done
-
-Source references:
-- `blackbox/bk_box_main/packages/opencode/src/session/prompt/code-switch.txt`
-
-Pythinker targets:
-- `src/pythinker_code/tools/plan/`
-- `src/pythinker_code/soul/dynamic_injections/plan_mode.py`
-- `src/pythinker_code/subagents/`
-
-Work:
-- [x] After plan approval, generate a compact implementation handoff summary.
-- [x] Offer/implement “continue here” vs “start implementation subagent/session” through the handoff text and Agent resume/start instructions.
-- [x] Persist handoff with the plan artifact as a sibling `.handoff.md` file.
-
-Acceptance:
-- [x] Plan output can be approved and injected into an implementer without losing discoveries.
-- [x] Tests cover handoff generation and handoff persistence failure fallback.
-
-## Phase 3 — Orchestration and task runtime ✅ Done
-
-Source references:
-- `blackbox/bk_box_main/packages/opencode/src/tool/task.ts`
-
-Pythinker targets:
-- `src/pythinker_code/tools/agent/`
-- `src/pythinker_code/background/`
-- `src/pythinker_code/subagents/`
-
-Work:
-- [x] Add first-class multi-agent task metadata: parent/child links, dependencies, budget/timeouts, synthesis state.
-- [x] Add optional git worktree isolation intent for background agents.
-- [x] Add resume/recover semantics robust across CLI restarts where possible.
-
-Acceptance:
-- [x] Parent can launch several agents, inspect structured statuses, and synthesize results deterministically.
-- [x] Background agent records survive process restart with clear `running/lost/recoverable` states.
-
-## Phase 4 — Context/session robustness ✅ Done
-
-Source references:
-- `blackbox/bk_box_main/packages/opencode/src/session/compaction.ts`
-
-Pythinker targets:
-- `src/pythinker_code/soul/context.py`
-- `src/pythinker_code/soul/pythinkersoul.py`
-- `src/pythinker_code/session.py`
-
-Work:
-- [x] Add prompt queue protection for concurrent prompts/tool followups.
-- [x] Harden compaction/recovery around malformed or partial LLM payloads.
-- [x] Add clearer session list/search/export affordances if not already covered by existing UI/session surfaces.
-
-Acceptance:
-- [x] Interrupted/failed compaction does not corrupt session history.
-- [x] Queued user prompts preserve order and status.
-
-## Phase 5 — Code search and skills ✅ Done
-
-Source references:
-- `blackbox/bk_box_main/packages/opencode/src/tool/warpgrep.ts`
-- `blackbox/bk_box_main/packages/opencode/src/skill/*`
-
-Pythinker targets:
-- `src/pythinker_code/tools/file/grep_local.py`
-- `src/pythinker_code/skill/`
-- `src/pythinker_code/soul/dynamic_injections/`
-
-Work:
-- [x] Add a local “smart search” wrapper that plans multiple grep/glob passes and returns concise spans.
-- [x] Keep graphify/context-mode semantic providers out of this slice; `SmartSearch` provides bounded local search behind a normal tool interface.
-- [x] Keep explicit skill-load/use tooling deferred because prompt-loaded skills remain sufficient for this slice.
-
-Acceptance:
-- [x] Explore agents can find relevant code with fewer broad shell calls.
-- [x] Search results are bounded, cited, and safe for context.
-
-## Phase 6 — Provider/model UX ✅ Done
-
-Source references:
-- `blackbox/bk_box_main/packages/opencode/src/provider/*`
-- `blackbox/bk_box_main/packages/opencode/src/session/system.ts`
-
-Pythinker targets:
-- `src/pythinker_code/llm.py`
-- `src/pythinker_code/auth/`
-- `src/pythinker_code/ui/shell/usage_adapters/`
-
-Work:
-- [x] Keep model-specific prompts/capabilities explicit.
-- [x] Preserve per-agent model/thinking/variant state through subagent cloning.
-- [x] Improve fallback display for unavailable provider model lists where covered by existing provider fallback paths.
-
-Acceptance:
-- [x] OpenCode Go/Kimi, Anthropic-compatible, OpenAI-compatible, and Pythinker providers retain correct per-model toggles in root and subagents.
-
-## Recommended first implementation slice ✅ Done
-
-- [x] Finish provider-thinking regression tests.
-- [x] Introduce a small runtime permission profile abstraction.
-- [x] Apply it to plan/explore/review/verifier first.
-- [x] Add shell mutation-denial tests.
-- [x] Add plan handoff in Phase 2.
-
-## Execution status — 2026-05-11
-
-- [x] Phase 1: implemented runtime permission profiles and hard-deny checks for file mutations and mutating shell commands; focused tests pass.
-- [x] Phase 2: implemented deterministic approved-plan handoff generation and sibling `.handoff.md` persistence with inline fallback on persistence failure; focused tests pass.
-- [x] Phase 3: added background-agent orchestration metadata (`dependencies`, budget, synthesis state, isolation intent) and recoverable restart state for orphaned background agents; focused tests pass.
-- [x] Phase 4: added prompt-turn serialization with a session-local lock and compaction empty-result guard so failed/malformed compaction does not clear existing history.
-- [x] Phase 5: added `SmartSearch`, a bounded multi-pass local search wrapper, and exposed it to default/subagent specs; focused tests pass.
-- [x] Phase 6: persisted subagent thinking state in launch specs and passed it through model cloning so resumed/per-agent model overrides preserve thinking toggles.
-
-## Verification
-
-- [x] `make check-pythinker-code` passed.
-- [x] Focused port regression suite passed: `42 passed`.
-- [x] Broader focused suite passed: `79 passed`.
-- [ ] Full `make test-pythinker-code` was attempted but hung in a `Pythinker` subprocess; rerun or diagnose separately before release.
diff --git a/tasks/tui-enhancement-plan.md b/tasks/tui-enhancement-plan.md
deleted file mode 100644
index 736c0d1a..00000000
--- a/tasks/tui-enhancement-plan.md
+++ /dev/null
@@ -1,293 +0,0 @@
-# TUI / CLI Render Enhancement — Comprehensive Plan
-
-Single source of truth for the terminal-UI render overhaul. Merges the validated spacing
-standardization (Track A, `63 passed` baseline) with the markdown, file/diff, and hardening
-tracks surfaced by the render investigations.
-
-**Verified facts** (checked against the tree): `render_dialog()` exists at
-`_dialog_shell.py:35`; Makefile has `check-pythinker-code` / `test-pythinker-code`; `uv` is
-available; `markdown.py` uses a literal `Syntax(..., padding=(0,1))` (L58–63); `edit.py:117`
-holds a stray `Text("")` while its second diff path (L161–162) already omits it.
-
----
-
-## Design principles (best practices applied throughout)
-
-> **Stream owns the gaps between blocks. Cards, panels, markdown, and code renderers own only
-> their internal layout. The canonical blank row is `Text("")`; the prompt preamble uses a
-> newline helper, not hand-written `endswith("\n")` guards.**
-
-1. **One source of truth per visual concept** — a gap, glyph, hint string, color, or padding
-   defined once and imported. No literals at call sites.
-2. **Layered ownership of space** — never let two layers space the same seam (today's tinted-
-   card double/triple-gap).
-3. **Theme tokens, never raw colors** — every style via `tui_rich_style(token)` / the
-   markdown palette. Zero `"grey50"` / `#hex` in render paths.
-4. **One engine per job** — one diff renderer, one line-block helper, one truncation-hint
-   formatter, one syntax highlighter.
-5. **Deterministic + snapshot-tested** — pure render (state in → renderable out); golden tests
-   prove refactors are behavior-preserving.
-6. **Degrade gracefully** — `NO_COLOR`, `PYTHINKER_REDUCED_MOTION`, narrow widths; diffs use
-   `+`/`-` markers, not color alone (a11y).
-7. **Surgical, collision-aware migration** — tests green at every step; read each file's
-   current (uncommitted) state before editing.
-
-Baseline that must stay green:
-```bash
-uv run pytest tests/ui_and_conv/test_empty_think_part_indicator.py \
-  tests/ui_and_conv/test_tui_card_tool_renderers.py \
-  tests/ui_and_conv/test_visualize_running_prompt.py::test_render_agent_status_uses_compose_agent_output_not_compose -q
-# expect: 63 passed
-```
-
----
-
-# TRACK A — Spacing standardization (authoritative; do first)
-
-### A1. Add one spacing-primitive module
-Create `src/pythinker_code/ui/shell/spacing.py`:
-```python
-from rich.text import Text
-
-BLANK_ROW = Text("")
-STREAM_GAP_ROWS = 1
-SECTION_GAP_ROWS = 1
-
-CARD_PADDING = (0, 1)          # untinted/success cards (horizontal only)
-TINTED_CARD_PADDING = (0, 1)   # pending/error tinted cards (horizontal only)
-DIALOG_PANEL_PADDING = (0, 1)
-WORKLOG_PANEL_PADDING = (0, 1)
-CODE_BLOCK_PADDING = (0, 1)
-
-def blank_row() -> Text:
-    return Text("")
-
-def append_gap(renderables: list, rows: int = 1) -> None:
-    for _ in range(rows):
-        renderables.append(blank_row())
-
-def ensure_prompt_newline(fragments) -> None:
-    if fragments and not fragments[-1][1].endswith("\n"):
-        fragments.append(("", "\n"))
-```
-**Rule:** no `Text(" ")` for vertical spacing — blank rows are `Text("")`.
-
-### A2. Live stream owns inter-block spacing — `visualize/_live_view.py`
-- `_ACTION_SPACER = BLANK_ROW` (or drop it and call `blank_row()`).
-- Keep: *the live stream owns spacing between* content block / tool card / retry row /
-  spinner / notification. Individual cards must not create external top/bottom spacing.
-- Fixes the triple-row issue (tinted-pad-top + stream-spacer + tinted-pad-bottom).
-
-### A3. Remove vertical padding from tinted tool cards — `components/tool_execution.py`
-- `Padding(body, (1, 1), style=bg_style)` → `Padding(body, TINTED_CARD_PADDING, style=bg_style)`
-  (`(0, 1)`).
-- Result: pending/error keep the horizontal tint; no hidden top/bottom rows; gap between
-  cards becomes exactly the stream spacer → pending/error/success/bash cards consistent.
-
-### A4. Standardize intra-card spacing — `tool_renderers/edit.py`
-- **Rule:** inside a tool card the `⎿` gutter is the separator; no blank rows between
-  header / summary / result / diff.
-- Remove the `Text("")` at `edit.py:117` (between `change_summary_text` and `diff_frame`).
-  This matches `write.py` *and* edit.py's own second diff path (L161–162) → one rule across
-  all three diff renders.
-- Keep truncation hints glued to bodies (already consistent).
-
-### A5. Centralize prompt-preamble newlines — `prompt.py`
-- Replace every `if fragments and not fragments[-1][1].endswith("\n"): fragments.append(("","\n"))`
-  with `ensure_prompt_newline(fragments)`.
-- One policy: agent-status / modal-body blocks are newline-terminated; card-style separator =
-  exactly one newline before the prompt label; non-card prompt = exactly one newline before
-  `❯`.
-- **The prompt layer owns the final gap before input** — this resolves the spinner-verb ↔
-  prompt seam (don't rely on whatever the last live action happened to be). This is the home
-  of the original "blank row under the spinner verb" request.
-
-### A6. Move modal panel section gaps into `render_dialog()` — `_dialog_shell.py` (+ callers)
-- `render_dialog()` (`_dialog_shell.py:35`) becomes the only place that inserts: body→options,
-  options→footer, and body→footer (when no options) gaps, via `blank_row()`.
-- Clean up callers `_approval_panel.py`, `_question_panel.py`, `_btw_panel.py` so they stop
-  sprinkling `Text("")`. **Rule:** panels decide sections semantically; `render_dialog()`
-  decides vertical rhythm.
-- Keep `DIALOG_PANEL_PADDING = (0, 1)` (no top/bottom panel padding yet — preserve density).
-
-### A7. Markdown/code spacing stays internal — `components/markdown.py`
-- Replace the literal `Syntax(..., padding=(0, 1))` (L58–63) with `CODE_BLOCK_PADDING`.
-- Do not add external blank rows around markdown/code. **Rule:** markdown/code own internal
-  code-block padding only; stream/panels own external gaps.
-
-### A8. Tests for the standard (add with the migration)
-1. `blank_row().plain == ""`; no canonical spacer uses `" "`.
-2. Live stream: content+spinner = one blank row; tool+tool = one blank row; no `Text(" ")`.
-3. Tool cards: pending/error/success add no extra blanks around body; tinted+spacer == 
-   untinted+spacer vertical gap count.
-4. Diff: edit.py and write.py render summary immediately followed by diff lines.
-5. Prompt preamble: status block → prompt = exactly one newline boundary; card separator
-   appears once.
-6. Dialogs: approval/question/footer gaps come from `render_dialog()`; callers add no blank
-   separators beyond semantic body rows.
-
-### A9. Migration order (collision-safe; tests after each step)
-1. Add `spacing.py` + primitive tests.
-2. Edit `_live_view.py` only →
-   `uv run pytest tests/ui_and_conv/test_empty_think_part_indicator.py -q`
-3. Edit `tool_execution.py`, `edit.py` →
-   `uv run pytest tests/ui_and_conv/test_tui_card_tool_renderers.py -q`
-4. Edit `_dialog_shell.py` + approval/question/btw callers →
-   `uv run pytest tests/ui_and_conv/test_question_panel.py tests/ui_and_conv/test_visualize_running_prompt.py -q`
-5. Edit `prompt.py` last.
-6. Broad gate: `make check-pythinker-code && make test-pythinker-code`
-
----
-
-# TRACK B — File / diff rendering unification
-
-(Markdown/code untouched here; this is tool-card file output and diffs.)
-
-### B1. Collapse two diff engines into one
-- Today: `components/diff.py:render_diff` (plain, no highlight, used by tool cards) **and**
-  `utils/rich/diff_render.py:render_diff_panel` (Pygments + line numbers + inline highlight).
-- Make the panel engine the single engine; `_file_diff.py:diff_frame` calls it in a
-  "compact" mode for inline cards. Removes duplicated line-number/marker/inline logic.
-- **Wire the unused `width`** (`_file_diff.py:125` binds `_ = width`) so diffs respect
-  terminal width instead of implicit wrapping.
-
-### B2. Optional syntax highlighting in tool-card diffs/listings
-- Add opt-in highlighting (lexer by extension via `utils/rich/syntax.py:PythinkerSyntax`) to
-  `format_numbered_lines_block` and the compact diff, behind a theme/setting flag, reusing
-  `_strip_background` so tints don't fight the card bg.
-
-### B3. Line-number + truncation consistency
-- Unify the line-number style token: tool renderers use `"muted"`, panels use `"dim"` → pick
-  one.
-- Unify line-number min-width (today 4 in `_render_utils`, 2 in `diff.py`/`diff_render.py`).
-- One `EXPAND_HINT` formatter → `"… +{n} lines (ctrl+o to expand)"`; retire grep's
-  `"... (N more lines …)"` and standardize the `…` glyph. Collapse the 8 scattered limits
-  (write=10, grep=15, find=20, agent=6, ask_user=8, think=6, todo=12, bash=5) into a
-  principled small/medium/large set in a constants module. Document the
-  `__suppress_generic_expand_hint__` state flag.
-
-Verification: golden tests for a diff (add/remove/context/inline), a wrapped long line at
-narrow width, and a truncated listing with the unified hint.
-
----
-
-# TRACK C — Markdown rendering polish
-
-Chain: `markdown-it` → `utils/rich/markdown.py:Markdown` →
-`components/markdown.py:PythinkerMarkdown`.
-
-### C1. Theme-token bugs
-- **H2 loses color** (`components/markdown.py:81`): add `color=heading` (matches H1/H3/H4).
-- **Hardcoded `grey50`** in thinking paths (`visualize/_blocks.py`, 9+ sites) → a
-  `tui_rich_style("thinking_text")` token so dark/light apply.
-- Distinguish `markdown.link` vs `markdown.link_url` styling (currently identical).
-
-### C2. Streaming correctness / perf
-- Each flush re-parses the whole committed buffer (`_blocks.py:254`). Either parse only the
-  new slice, or wire the already-written-but-unused `PythinkerMarkdownStream`
-  (`components/markdown.py:192`); delete the dead path.
-- Make hidden vs streamed thinking treatment consistent (plain `Text` vs Markdown).
-
-### C3. Code-block ergonomics
-- Optional line numbers in fenced code (today `Syntax()` has none).
-- (Padding already handled by Track A7 `CODE_BLOCK_PADDING`.)
-
-Verification: snapshots for H1–H4, lists, blockquote, table (wide→record), inline code, a
-fenced python block; a streaming test asserting identical final render for 1-chunk vs
-many-chunk arrival.
-
----
-
-# TRACK D — Cross-cutting hardening
-
-- **Theme-literal lint test:** fails if a render module imports a raw color name.
-- **Render matrix:** parametrized snapshots at widths {40, 80, 120}, `NO_COLOR=1`,
-  `PYTHINKER_REDUCED_MOTION=1`.
-- **Marker-glyph registry:** centralize `●`, `⎿`, `❯`, `├`/`└`, `⧉`, spinner frames so the
-  visual language lives in one place.
-
----
-
-## Overall sequencing & risk
-
-Order: **Track A → B → C → D.** A is the validated, highest-impact, lowest-risk track and
-unblocks the rest. Each track = its own branch/commit set, tests green at every step.
-
-Risks / guards:
-- **Snapshot tests are the safety net** — regenerate and *review the diff* each step; an
-  unreviewed snapshot update hides regressions.
-- **Spinner-verb structural tests** (`test_empty_think_part_indicator.py`,
-  `test_live_view_notifications.py`) constrain where the under-gap lives (Track A5).
-- **Parallel agent:** stashes touch only `packages/pythinker-review/*` + `config`/`slash` —
-  no overlap with these files. Target UI files carry uncommitted branch edits; layer on top.
-- **Behavior-preserving claims** proven by before/after snapshot diff, never assumed.
-
-**Definition of done:** one `spacing.py` + one render-constants module imported everywhere;
-one diff engine; zero raw color literals in render paths; unified truncation hint; H2 +
-thinking theme bugs fixed; `make check-pythinker-code && make test-pythinker-code` green plus
-the width/`NO_COLOR`/reduced-motion snapshot matrix.
-
----
-
-## Progress log
-
-### Track A — DONE (2026-05-24)
-- A1 `spacing.py` created (`BLANK_ROW`, gap/padding constants, `blank_row`, `append_gap`,
-  `ensure_prompt_newline` typed against prompt-toolkit `StyleAndTextTuples`) + 10 unit tests.
-- A2 `_live_view.py`: `_ACTION_SPACER = BLANK_ROW` (retired `Text(" ")`).
-- A3 `tool_execution.py`: tinted card padding `(1,1)` → `TINTED_CARD_PADDING (0,1)` — kills the
-  card/stream double-gap.
-- A4 `edit.py`: removed stray `Text("")` between summary and diff (now matches write.py +
-  edit.py's own 2nd diff path).
-- A5 `prompt.py`: refactored every `endswith("\n")` guard to `ensure_prompt_newline()`; added
-  the spinner-verb under-gap at the `_render_agent_status` chokepoint (interactive) and in
-  `_live_view.compose()` gated on a non-empty status line (non-interactive).
-- A6 `_dialog_shell.py` + approval/question/btw/worklog panels: section gaps via `blank_row()`,
-  padding via `DIALOG_PANEL_PADDING`/`WORKLOG_PANEL_PADDING`.
-- A7 `markdown.py`: code-block padding → `CODE_BLOCK_PADDING`.
-- A8 added under-gap behavior tests (compose under-gap present / no-double-gap when empty).
-- Verification: `make check-pythinker-code` clean; `tests/ui_and_conv` + `tests/ui` = 1183
-  passed; new spacing/live-view tests = 20 passed. Behavior-preserving except the intended
-  card-padding and under-gap changes.
-
-### Track B — DONE (2026-05-24), look-preserving
-Decision: the two diff renderers serve different surfaces (inline boxless tool-card diff =
-the screenshot; `render_diff_panel`/`_preview` = boxed approval/worklog). "One engine
-everywhere" would box-ify the inline diff → rejected by user. Unified via a shared module
-instead.
-- New `render_constants.py`: `DIFF_CONTEXT_LINES`, `DIFF_LINE_NUMBER_MIN_WIDTH`,
-  `LISTING_LINE_NUMBER_MIN_WIDTH`, keymap-driven `expand_hint()` (+4 tests).
-- Deduped the two `=3` context constants and `max(…,2)` widths across `components/diff.py`
-  and `utils/rich/diff_render.py`; listing width floor via constant in `_render_utils.py`.
-- Unified the truncation hint across grep/write/diff-preview — fixes a real bug: the panel
-  preview hardcoded `ctrl-e` while the actual expand key is `ctrl+o`; also standardized the
-  `…`/`...` glyph + phrasing.
-- Verified inline diff stays boxless (matches screenshot); `tests/ui_and_conv`+`ui`+`utils`
-  = 1526 passed; checks clean.
-- Deferred (rationale): full word-diff engine merge (different internal representations —
-  risky, low reward); syntax-highlighting tool cards (would change the boxless look);
-  line-number token muted-vs-dim unification (marginal benefit, visible-change risk).
-
-### Track C — DONE (2026-05-24)
-- C1: H2 heading color bug fixed; `link_url` dimmed vs `link`; 3 thinking-path `grey50`
-  literals → `tui_rich_style("thinking_text")`; widened `BulletColumns.bullet_style` to
-  `StyleType`. +2 regression tests.
-- C2: no-op — streaming already parses only new committed slices (not whole buffer), and
-  `PythinkerMarkdownStream` is tested (not dead). Left intact (surgical-change discipline).
-- C3: skipped — code-block line numbers would change the boxless look.
-
-### Track D — DONE (2026-05-24)
-- Finished the `grey50` sweep in `_blocks.py` (subagent/content/bullets) → `tui_rich_style`
-  tokens; `_blocks.py` is now grey50-free.
-- New `glyphs.py`: single source for spinner frames + interval + reduced-motion glyph;
-  `motion.py` and `spinner_words.py` dedup to it (same object, verified by `is`).
-- `test_render_matrix.py`: width {40,80,120} x NO_COLOR x reduced-motion matrix asserting
-  boxless inline diff, content-stable-under-color-toggle, static glyph under reduced motion.
-- `test_render_hardening.py`: guard that `_blocks.py` stays grey50-free + spinner-frame
-  single-source.
-- Deferred (honest): whole-repo raw-color lint enforcement (too big/risky for one pass —
-  needs a systematic sweep first); broad `●`/`⎿`/`❯` marker migration (prompt symbols
-  already centralized via `PROMPT_SYMBOL_*`; adding unused constants would be speculative).
-
-### Final verification
-`make check-pythinker-code` clean; `tests/ui_and_conv`+`ui`+`utils` = 1556 passed, 1 skipped.
diff --git a/tasks/tui-text-color-standardization.md b/tasks/tui-text-color-standardization.md
deleted file mode 100644
index e4453d22..00000000
--- a/tasks/tui-text-color-standardization.md
+++ /dev/null
@@ -1,212 +0,0 @@
-# Plan: Standardize TUI text/font colors (markdown + agent/subagent)
-
-**Status**: Scouting complete — awaiting design decisions before implementation.
-**Created**: 2026-05-24
-**Branch**: `feat/tui-brand-rebrand`
-**Scope (confirmed by user)**: Keep the coral brand. Standardize *text/font colors* — the
-agent/subagent and markdown text colors are the main issue. NOT touching box-drawing
-corners, status-glyph vocabulary, or pi's actual color values.
-
----
-
-## Reference
-
-Design ported from `blackbox/pi-main` (TypeScript/Rich-equivalent). Pythinker already
-ported pi's **core token names** (`accent`, `border`, `tool_success_bg`, …) onto the
-branded `TuiTokens` palette during this branch. Values diverge by design:
-- Pythinker `accent`: **blue** (`#5EA7E8` dark / `#256EA8` light, theme.py:415)
-- Pythinker coral brand: logo (`_LOGO_CORAL=#EE9983`) + verb-spinner shimmer (motion.py)
-- pi `accent`: teal `#8abeb7`
-
-The brand is intentionally different from pi. The "coral brand" the user refers to is the
-logo/shimmer coral, not the structural accent token.
-
----
-
-## Root cause (the "main issue")
-
-There are **two parallel color systems** in `src/pythinker_code/ui/theme.py`:
-
-1. `TuiTokens` (theme.py:368+) — the branded coral source of truth. Everything routed
-   through `tui_rich_style()` / `fg()` / `ShellTone` already resolves here.
-2. `MarkdownColors` (theme.py:218–265) — a **separate, hardcoded** palette
-   (`heading=#F4F4F5`, `emphasis=#A3A3A3`, `inline_code=#AFE3F1`, …) that is **not**
-   derived from `TuiTokens`.
-
-`MarkdownColors` is consumed *only* by `components/markdown.py:_markdown_style_overrides`,
-which is the render path for:
-- assistant messages (`components/messages.py:67,99,144`)
-- special messages — skill / compaction / branch summaries (`components/special_messages.py`)
-- worklog, blocks, question panel (`visualize/_worklog.py`, `_blocks.py`, `_question_panel.py`)
-
-**Consequence:** markdown/prose text colors are maintained independently of the brand
-tokens. Some values coincide (`link=#AFE3F1` == `info` token) but they drift separately,
-and the coral brand never reaches prose.
-
-### Agent/subagent specifics
-- `tool_renderers/agent.py` already routes its *chrome* (header, prompt preview, icons)
-  through tokens correctly.
-- BUT subagent **result text** is rendered by `format_lines_block(... style_token="tool_output")`
-  (agent.py:129–134) as **flat single-color lines, not markdown**. So when a subagent
-  returns markdown, it shows as flat grey `tool_output` text — visually inconsistent with
-  how the assistant's identical markdown renders.
-
----
-
-## Decisions
-
-- [x] **D1 — Markdown accent role.** RESOLVED: **low-chrome, no coral in prose.** Keep
-      the current visual mapping; re-source it from `TuiTokens` for a single source of
-      truth. Zero visual change to prose.
-- [x] **D2 — Subagent output.** RESOLVED: **keep flat (no markdown parsing)**, ensure the
-      color derives from the unified `tool_output` token. (agent.py:133 already passes
-      `style_token="tool_output"` → already token-backed. Likely a no-op; verify.)
-
-## ⚠️ Open observation — `accent` token is BLUE, not coral
-
-The structural `accent` token is **blue** (`#5EA7E8` dark / `#256EA8` light, theme.py:415).
-Coral `#EE9983` appears only in the **logo** (`__init__.py:1767 _LOGO_CORAL`) and the
-**verb-spinner shimmer** (`motion.py:32-38`, marked `brand-exception`). So the current
-brand is *blue structural accent + coral logo/motion*. User said "we need coral brand" —
-needs a decision on whether `accent` should flip to coral (separate from the markdown work).
-
----
-
-## Value-preserving token mapping (verified against theme.py:414-475)
-
-Every `MarkdownColors` field equals an existing token in BOTH themes, except
-`code_block_bg` (markdown-only). So re-sourcing is a pure refactor — no visual change.
-
-| MarkdownColors field | Dark / Light value | Token (same value) |
-|---|---|---|
-| heading | #F4F4F5 / #213853 | `tool_title` |
-| strong | #F4F4F5 / #213853 | `tool_title` |
-| emphasis | #A3A3A3 / #666666 | `muted` |
-| inline_code | #AFE3F1 / #176B7E | `info` |
-| link | #AFE3F1 / #176B7E | `info` |
-| quote | #A3A3A3 / #666666 | `muted` |
-| table_border | #2B3A52 / #C8BEC0 | `border_muted` |
-| code_block_border | #2B3A52 / #C8BEC0 | `border_muted` |
-| spinner_active | #AFE3F1 / #176B7E | `info` |
-| spinner_done | #7BC97F / #2C7A39 | `success` |
-| spinner_failed | #EF5E62 / #C0392B | `error` |
-| code_block_bg | #1f2030 / #f1f5f9 | **none** → add new `code_block_bg` token |
-
-## Proposed plan
-
-1. **Add `code_block_bg` token** to `TuiTokens` (#1f2030 dark / #f1f5f9 light) +
-   `TUI_TOKEN_NAMES` → `verify`: token tests pass.
-2. **Rebuild `_MARKDOWN_DARK`/`_MARKDOWN_LIGHT`** to read every field from the token
-   palette instead of hardcoded hex (keep the `MarkdownColors` dataclass + `get_markdown_colors`
-   API unchanged so `markdown.py` needs no structural change) → `verify`: resolved values
-   byte-identical to current; `tests/ui_and_conv/` snapshots unchanged.
-3. **Confirm subagent output** is token-backed (agent.py:133) → `verify`: no raw color in path.
-4. **Text/copy micro-pass** (only if quick, in scope): standardize error-message
-   punctuation (setup.py:44 vs oauth.py:165).
-5. **Run full TUI suite** + lint/format.
-
----
-
-## Out of scope (flagged, not doing)
-
-- Switching `box.ROUNDED` → pi's square corners (visible change, fights the rebrand).
-- Flattening pythinker's 7 status glyphs to pi's 4 (`✓`/`✗`).
-- Porting pi's fixed-hex syntax theme — pythinker's `ANSISyntaxTheme`
-  (`utils/rich/syntax.py`) is terminal-native and *better*; porting pi's would regress it.
-- Per-thinking-level border colors (pi has 6; pythinker has the levels but one color).
-  Genuinely portable later, but it's a *new feature*, not text-color standardization.
-- Centralizing prompt symbols (`✨`/`›`/`$`) into `glyphs.py` — tidiness, separate task.
-
----
-
-## Broader roadmap (from deep scan — validated 2026-05-24)
-
-Source: TUI deep scan analysis vs pi-main. Claims below have been validated against actual
-source files. See correction notes.
-
-**Correction notes** (errors in the raw analysis, do not carry into work):
-- Analysis claims `accent="#EE9983 (coral)"` — WRONG. Actual: `#5EA7E8` dark / `#256EA8`
-  light (theme.py:415). Coral is logo/shimmer only.
-- Analysis claims "restart required for theme switching" — WRONG. `/theme` slash command
-  exists (tips.py:17); `set_active_theme()` called at runtime (__init__.py:545).
-- Analysis claims "$COLORFGBG fallback" for terminal detection — UNCONFIRMED. No such code
-  found in source.
-
-### P1 — Thinking level color gradient (0.5 day, medium impact)
-
-Port pi's 6 thinking-level border tokens into TuiTokens. Pythinker already exposes all 6
-levels (off/minimal/low/medium/high/xhigh) in the ThinkingLevel literal (selectors/thinking.py:7)
-but has only one `thinking_text` color. pi maps levels to a gradient from muted→vivid.
-
-Proposed tokens to add to TuiTokens:
-```
-thinking_off:     ""          # no border tint (terminal default)
-thinking_minimal: "#6e6e6e"   # subtle
-thinking_low:     "#5f87af"   # blue-grey
-thinking_medium:  "#81a2be"   # blue
-thinking_high:    "#b294bb"   # purple
-thinking_xhigh:   "#d183e8"   # vivid purple
-```
-Use in the thinking block renderer to color the border by active level.
-
-### P1 — Missing markdown tokens (0.5 day, medium impact)
-
-Add to MarkdownColors (and route through TuiTokens after Phase 0):
-- `md_quote_border` — separate from quote text color
-- `md_hr` — horizontal rule color
-- `md_list_bullet` — bullet/number color (currently reuses `quote`)
-- `md_link_url` — URL part of links (currently same as link text)
-
-### P2 — Syntax highlighting token system (2–3 days, medium impact)
-
-Create a bridge that maps TuiTokens syntax fields → Rich/pygments ANSISyntaxTheme,
-replacing the current standalone PYTHINKER_ANSI_THEME with one that respects the active
-brand palette. Tokens to add: `syntax_comment`, `syntax_keyword`, `syntax_function`,
-`syntax_variable`, `syntax_string`, `syntax_number`, `syntax_type`, `syntax_operator`,
-`syntax_punctuation`.
-
-Note: pythinker's current ANSISyntaxTheme is *terminal-native* (uses ANSI color names,
-not hex) which is actually the right approach. A bridge that passes through the terminal
-palette is preferable to pi's hard-coded VS-Code hex approach. Preserving terminal-
-adaptability is a feature.
-
-### P2 — Terminal background auto-detection (1 day, nice-to-have)
-
-Auto-select dark/light theme on startup by detecting terminal background color. pi uses
-OSC 11 query → $COLORFGBG env var → luminance analysis. Fallback chain should be:
-OSC 11 → COLORFGBG → TERM heuristics → "dark".
-
-Note: pythinker currently has no terminal background detection (no COLORFGBG code found).
-
-### P3 — JSON theme file system (3–4 days, high architectural impact)
-
-Externalize TuiTokens + MarkdownColors + DiffColors as JSON files with a `vars` section
-for color aliasing. Enable user custom themes in `~/.pythinker/themes/`. This is the
-highest-leverage infrastructure investment — it unlocks runtime switching, hot-reload,
-and user customization without code changes.
-
-Implementation shape:
-1. `ui/theme_loader.py` — JSON parser with var indirection + circular detection + schema validation
-2. Built-in `dark.json`/`light.json` as package data (ship with pythinker-code)
-3. User override search path: `~/.pythinker/themes/<name>.json`
-4. `set_active_theme()` gains `theme_name` param; resolves from loader
-
-Note: The `/theme` command already exists for dark/light switching (tips.py:17). This
-phase extends it to arbitrary named themes.
-
-### P3 — Component-level theme factories (1–2 days, architecture)
-
-Add bridge factories matching pi's `getMarkdownTheme()` / `getEditorTheme()` pattern:
-```python
-def get_markdown_component_theme() -> MarkdownComponentTheme: ...
-def get_prompt_component_theme() -> PromptComponentTheme: ...
-def get_tool_render_theme() -> ToolRenderTheme: ...
-```
-Reduces consumers needing to know token names directly. Lower priority — current
-`tui_rich_style()` API is workable.
-
----
-
-## Review section
-
-_(to be filled after implementation)_
diff --git a/tests/core/test_compaction_restore.py b/tests/core/test_compaction_restore.py
index 9d1ab764..dfe4c47f 100644
--- a/tests/core/test_compaction_restore.py
+++ b/tests/core/test_compaction_restore.py
@@ -177,3 +177,47 @@ def _capture_wire(msg):
     session_start_call = soul._hook_engine.trigger.await_args_list[2]  # pyright: ignore[reportPrivateUsage]
     assert session_start_call.args[0] == "SessionStart"
     assert session_start_call.kwargs["matcher_value"] == "compact"
+
+
+@pytest.mark.asyncio
+async def test_compact_context_emits_end_when_compaction_fails(
+    runtime: Runtime,
+    tmp_path: Path,
+) -> None:
+    agent = Agent(
+        name="Test Agent",
+        system_prompt="Test system prompt.",
+        toolset=EmptyToolset(),
+        runtime=runtime,
+    )
+    context = Context(file_backend=tmp_path / "history-failure.jsonl")
+    soul = PythinkerSoul(agent, context=context)
+    runtime.session.state.active_skills = []
+
+    await context.append_message(Message(role="user", content=[TextPart(text="compact me")]))
+
+    soul._run_with_connection_recovery = AsyncMock(  # pyright: ignore[reportPrivateUsage]
+        side_effect=RuntimeError("LLM 5xx")
+    )
+    soul._hook_engine.trigger = AsyncMock(return_value=[])  # pyright: ignore[reportPrivateUsage]
+
+    sent: list[str] = []
+
+    def _capture_wire(msg):
+        sent.append(type(msg).__name__)
+
+    with (
+        patch("pythinker_code.soul.pythinkersoul.wire_send", _capture_wire),
+        patch("pythinker_code.telemetry.track") as track,
+        pytest.raises(RuntimeError, match="LLM 5xx"),
+    ):
+        await soul.compact_context()
+
+    assert sent.count("CompactionBegin") == 1
+    assert sent.count("CompactionEnd") == 1
+    track.assert_called_once_with(
+        "compaction_triggered",
+        trigger_type="auto",
+        before_tokens=context.token_count,
+        success=False,
+    )
diff --git a/tests/core/test_memory_phase_bcd.py b/tests/core/test_memory_phase_bcd.py
index 4b949d43..709b8909 100644
--- a/tests/core/test_memory_phase_bcd.py
+++ b/tests/core/test_memory_phase_bcd.py
@@ -6,9 +6,15 @@
 from pythinker_core.message import Message, TextPart
 from pythinker_host.path import HostPath
 
+from pythinker_code.memory.consolidation import generate_inbox_candidates
 from pythinker_code.memory.harvest import CompactionHarvester
 from pythinker_code.memory.recap import build_session_recap, content_hash
-from pythinker_code.memory.retriever import RankedBlock, RecallQuery, estimate_tokens
+from pythinker_code.memory.retriever import (
+    LexicalRetriever,
+    RankedBlock,
+    RecallQuery,
+    estimate_tokens,
+)
 from pythinker_code.memory.retriever_sqlite import SqliteFts5Retriever
 from pythinker_code.project_memory import ProjectMemoryStore
 from pythinker_code.session_state import SessionState, TodoItemState
@@ -83,20 +89,72 @@ def test_compaction_harvester_extracts_safe_assistant_notes():
     ]
 
 
-async def test_sqlite_retriever_falls_back_to_lexical():
-    block = RankedBlock(
+def _ranked(content: str, *, title: str = "retriever") -> RankedBlock:
+    return RankedBlock(
         tier="memory",
         source_path="MEMORY.md",
         source_id=None,
         session_id=None,
-        title="retriever",
+        title=title,
         labels=(),
         files=(),
         created_at_epoch=time.time(),
-        token_estimate=estimate_tokens("lexical sqlite fallback"),
+        token_estimate=estimate_tokens(content),
         score=0.0,
-        content="lexical sqlite fallback",
+        content=content,
+    )
+
+
+async def test_lexical_retriever_matches_unicode_terms():
+    out = await LexicalRetriever(
+        [_ranked("решение использовать кеш"), _ranked("english only")]
+    ).retrieve(RecallQuery(text="решение"), budget_tokens=100)
+    assert out and out[0].content == "решение использовать кеш"
+
+
+async def test_lexical_retriever_uses_query_labels_as_terms():
+    out = await LexicalRetriever([_ranked("lexical sqlite fallback")]).retrieve(
+        RecallQuery(labels=("sqlite fallback",)), budget_tokens=100
+    )
+    assert out and out[0].content == "lexical sqlite fallback"
+
+
+async def test_generate_inbox_candidates_skips_approved_scratch_note(tmp_path, monkeypatch):
+    monkeypatch.setenv("PYTHINKER_SHARE_DIR", str(tmp_path / "share"))
+    repo = tmp_path / "repo"
+    scratch = repo / ".pythinker" / "scratch"
+    scratch.mkdir(parents=True)
+    (scratch / "notes.md").write_text(
+        "### decision — 2026-05-30 10:00\n\nUse lexical recall for project memory.",
+        encoding="utf-8",
+    )
+    store = ProjectMemoryStore(_hp(repo))
+    assert (await store.add("memory", "Use lexical recall for project memory.")).ok is True
+
+    assert await generate_inbox_candidates(store, _hp(repo)) == []
+
+
+async def test_generate_inbox_candidates_ignores_corrupt_duplicate_file(tmp_path, monkeypatch):
+    monkeypatch.setenv("PYTHINKER_SHARE_DIR", str(tmp_path / "share"))
+    repo = tmp_path / "repo"
+    scratch = repo / ".pythinker" / "scratch"
+    scratch.mkdir(parents=True)
+    (scratch / "notes.md").write_text(
+        "### decision — 2026-05-30 10:00\n\nKeep corrupt inbox files from crashing harvest.",
+        encoding="utf-8",
     )
+    store = ProjectMemoryStore(_hp(repo))
+    first = await generate_inbox_candidates(store, _hp(repo))
+    assert len(first) == 1
+    root = await store._ensure_dir()  # pyright: ignore[reportPrivateUsage]
+    inbox = root / "memory" / "inbox"
+    (inbox / f"{first[0].id}.json").write_text("{not json", encoding="utf-8")
+
+    assert await generate_inbox_candidates(store, _hp(repo)) == []
+
+
+async def test_sqlite_retriever_falls_back_to_lexical():
+    block = _ranked("lexical sqlite fallback")
     out = await SqliteFts5Retriever([block], fts5_available=False).retrieve(
         RecallQuery(text="sqlite fallback"), budget_tokens=100
     )
diff --git a/tests/tools/test_str_replace_file.py b/tests/tools/test_str_replace_file.py
index fb95f70f..e9fa8582 100644
--- a/tests/tools/test_str_replace_file.py
+++ b/tests/tools/test_str_replace_file.py
@@ -282,10 +282,8 @@ async def test_replace_mixed_multiple_edits(
         Params(
             path=str(file_path),
             edit=[
-                Edit(old="apple", new="fruit", replace_all=False),  # Only first occurrence
-                Edit(
-                    old="banana", new="tasty", replace_all=True
-                ),  # All occurrences (though only one)
+                Edit(old="apple apple", new="fruit apple", replace_all=False),
+                Edit(old="banana", new="tasty", replace_all=True),
             ],
         )
     )
@@ -295,6 +293,111 @@ async def test_replace_mixed_multiple_edits(
     assert await file_path.read_text() == "fruit apple tasty apple cherry"
 
 
+async def test_multi_edit_reports_missing_old_string(
+    str_replace_file_tool: StrReplaceFile, temp_work_dir: HostPath
+):
+    """A non-matching edit in a multi-edit list must error, not silently no-op.
+
+    Regression: previously the file was written with only the matching edits
+    applied and the missing edit was swallowed because the no-change check ran
+    once over the whole batch.
+    """
+    file_path = temp_work_dir / "test.txt"
+    original_content = "apple banana cherry"
+    await file_path.write_text(original_content)
+
+    result = await str_replace_file_tool(
+        Params(
+            path=str(file_path),
+            edit=[
+                Edit(old="apple", new="fruit"),
+                Edit(old="not-present", new="x"),
+            ],
+        )
+    )
+
+    assert result.is_error
+    assert "No replacements were made" in result.message
+    assert "not-present" in result.message
+    # The file must be left untouched when any edit fails to match.
+    assert await file_path.read_text() == original_content
+
+
+async def test_multi_edit_count_handles_chained_edits(
+    str_replace_file_tool: StrReplaceFile, temp_work_dir: HostPath
+):
+    """Replacement count is tallied against progressively-edited content.
+
+    The second edit targets text produced by the first; the success message
+    must still count it (the old code counted against the original content and
+    reported zero for chained edits).
+    """
+    file_path = temp_work_dir / "test.txt"
+    await file_path.write_text("alpha")
+
+    result = await str_replace_file_tool(
+        Params(
+            path=str(file_path),
+            edit=[
+                Edit(old="alpha", new="beta"),
+                Edit(old="beta", new="gamma"),
+            ],
+        )
+    )
+
+    assert not result.is_error
+    assert "successfully edited" in result.message
+    assert "2 total replacement(s)" in result.message
+    assert await file_path.read_text() == "gamma"
+
+
+async def test_single_edit_requires_unique_old_string(
+    str_replace_file_tool: StrReplaceFile, temp_work_dir: HostPath
+):
+    """A non-replace_all edit must not silently choose the first match."""
+    file_path = temp_work_dir / "test.txt"
+    original_content = "apple banana apple"
+    await file_path.write_text(original_content)
+
+    result = await str_replace_file_tool(
+        Params(path=str(file_path), edit=Edit(old="apple", new="fruit"))
+    )
+
+    assert result.is_error
+    assert "occurs 2 times" in result.message
+    assert "replace_all=true" in result.message
+    assert await file_path.read_text() == original_content
+
+
+async def test_replace_rejects_empty_old_string(
+    str_replace_file_tool: StrReplaceFile, temp_work_dir: HostPath
+):
+    """An empty old string would otherwise insert text instead of replacing."""
+    file_path = temp_work_dir / "test.txt"
+    original_content = "Hello world!"
+    await file_path.write_text(original_content)
+
+    result = await str_replace_file_tool(
+        Params(path=str(file_path), edit=Edit(old="", new="prefix"))
+    )
+
+    assert result.is_error
+    assert "old string cannot be empty" in result.message
+    assert await file_path.read_text() == original_content
+
+
+async def test_replace_rejects_empty_edit_list(
+    str_replace_file_tool: StrReplaceFile, temp_work_dir: HostPath
+):
+    file_path = temp_work_dir / "test.txt"
+    await file_path.write_text("Hello world!")
+
+    result = await str_replace_file_tool(Params(path=str(file_path), edit=[]))
+
+    assert result.is_error
+    assert "At least one edit" in result.message
+
+
 async def test_replace_empty_strings(
     str_replace_file_tool: StrReplaceFile, temp_work_dir: HostPath
 ):
diff --git a/tests/tools/test_tool_descriptions.py b/tests/tools/test_tool_descriptions.py
index fedd900c..cf048f24 100644
--- a/tests/tools/test_tool_descriptions.py
+++ b/tests/tools/test_tool_descriptions.py
@@ -367,6 +367,7 @@ def test_str_replace_file_description(str_replace_file_tool: StrReplaceFile):
 - Only use this tool on text files.
 - Multi-line strings are supported.
 - Can specify a single edit or a list of edits in one call.
+- Unless `replace_all` is true, the old string must match exactly once; add surrounding context if it is ambiguous.
 - You should prefer this tool over WriteFile tool and Shell `sed` command.
 """
     )
diff --git a/tests/ui_and_conv/test_shell_export_import_commands.py b/tests/ui_and_conv/test_shell_export_import_commands.py
index a8c3cf53..030c5703 100644
--- a/tests/ui_and_conv/test_shell_export_import_commands.py
+++ b/tests/ui_and_conv/test_shell_export_import_commands.py
@@ -1,10 +1,12 @@
 from __future__ import annotations
 
+from io import StringIO
 from pathlib import Path
 from types import SimpleNamespace
 from unittest.mock import AsyncMock, Mock
 
 from pythinker_core.message import Message
+from rich.console import Console
 
 from pythinker_code.session import Session
 from pythinker_code.ui.shell import export_import as shell_export_import
@@ -46,6 +48,25 @@ async def test_export_writes_markdown_file(tmp_path: Path) -> None:
     assert "Hi!" in content
 
 
+async def test_export_escapes_bracketed_output_path(tmp_path: Path, monkeypatch) -> None:
+    app = _make_shell_app(tmp_path)
+    app.soul.context.history = [Message(role="user", content=[TextPart(text="Hello")])]
+    output_dir = tmp_path / "[proj]"
+    output_dir.mkdir()
+    output = output_dir / "session.md"
+
+    printed = StringIO()
+    monkeypatch.setattr(
+        shell_export_import,
+        "console",
+        Console(file=printed, force_terminal=False, color_system=None),
+    )
+
+    await shell_export_import.export(app, str(output))  # type: ignore[reportGeneralTypeIssues]
+
+    assert "[proj]" in printed.getvalue()
+
+
 async def test_import_from_file_appends_message_and_wire_markers(tmp_path: Path) -> None:
     app = _make_shell_app(tmp_path)
     source_file = tmp_path / "source.md"