VectifyAI · KylinMountain · Jun 30, 2026 · Jun 30, 2026
diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
@@ -262,12 +262,75 @@ def _cached_text(text: str) -> list[dict]:
     ephemeral cache_control marker.
 
     LiteLLM passes the marker through to Anthropic (and OpenRouter →
-    Anthropic). For providers that ignore cache_control, the list-of-blocks
-    payload remains a valid OpenAI-compatible content shape.
+    Anthropic). For other providers the marker is stripped at the request
+    egress (see :func:`_strip_cache_control`, applied in :func:`_llm_call`),
+    because not every provider merely *ignores* it — Gemini in particular
+    turns it into a 400. The list-of-blocks payload that remains is a valid
+    OpenAI-compatible content shape.
     """
     return [{"type": "text", "text": text, "cache_control": {"type": "ephemeral"}}]
 
 
+def _accepts_cache_control(model: str) -> bool:
+    """Tell whether ``model`` honours Anthropic ``cache_control`` markers.
+
+    :func:`_cached_text` emits an Anthropic-specific marker. LiteLLM hands it
+    on unchanged to Anthropic itself and to Anthropic (Claude) models reached
+    through OpenRouter, Bedrock and Vertex. Elsewhere — Gemini above all —
+    LiteLLM rewrites the marker into a provider-native cached-content object
+    that clashes with ``system_instruction``/``tools``, so *every* request
+    dies with ``400 CachedContent can not be used with ...``. Knowing the
+    provider lets callers drop the marker before such a backend sees it.
+    """
+    # Pull in the real function instead of using the module-level ``litellm``
+    # name: callers may patch ``openkb.agent.compiler.litellm`` to stub out
+    # completion, and provider detection has to keep working regardless.
+    from litellm import get_llm_provider
+
+    try:
+        backend = get_llm_provider(model)[1]
+    except Exception:
+        # Any resolution failure is treated as "not Anthropic".
+        backend = ""
+    if backend == "anthropic":
+        return True
+
+    # Aggregators and clouds host many vendors, so the model id itself must
+    # name Claude/Anthropic for the marker to be kept.
+    model_id = model.lower()
+    via_host = backend in ("openrouter", "bedrock", "vertex_ai")
+    return via_host and ("claude" in model_id or "anthropic" in model_id)
+
+
+def _strip_cache_control(messages: list[dict]) -> list[dict]:
+    """Copy ``messages``, dropping ``cache_control`` from every content block.
+
+    Only list-of-blocks contents (see :func:`_cached_text`) are rewritten;
+    plain-string contents are passed along as-is. The input is not mutated.
+    """
+
+    def _scrub(block):
+        # Non-dict blocks cannot hold the marker; hand them back unchanged.
+        if not isinstance(block, dict):
+            return block
+        return {key: val for key, val in block.items() if key != "cache_control"}
+
+    result: list[dict] = []
+    for message in messages:
+        body = message.get("content")
+        if isinstance(body, list):
+            message = {**message, "content": [_scrub(item) for item in body]}
+        result.append(message)
+    return result
+
+
+def _prepare_messages(model: str, messages: list[dict]) -> list[dict]:
+    """Hand back ``messages``, minus cache_control unless ``model`` takes it."""
+    keep_markers = _accepts_cache_control(model)
+    prepared = messages if keep_markers else _strip_cache_control(messages)
+    return prepared
+
+
 class _Spinner:
     """Animated dots spinner that runs in a background thread."""
 
@@ -328,6 +391,7 @@ def _fmt_messages(messages: list[dict], max_content: int = 200) -> str:
 
 def _llm_call(model: str, messages: list[dict], step_name: str, **kwargs) -> str:
     """Single LLM call with animated progress and debug logging."""
+    messages = _prepare_messages(model, messages)
     extra_headers = get_extra_headers()
     if extra_headers:
         kwargs.setdefault("extra_headers", extra_headers)
@@ -353,6 +417,7 @@ def _llm_call(model: str, messages: list[dict], step_name: str, **kwargs) -> str
 
 async def _llm_call_async(model: str, messages: list[dict], step_name: str, **kwargs) -> str:
     """Async LLM call with timing output and debug logging."""
+    messages = _prepare_messages(model, messages)
     extra_headers = get_extra_headers()
     if extra_headers:
         kwargs.setdefault("extra_headers", extra_headers)

diff --git a/tests/test_compiler.py b/tests/test_compiler.py
@@ -2260,6 +2260,68 @@ async def test_llm_call_async_injects_extra_headers(self):
         assert kwargs["extra_headers"] == {"Copilot-Integration-Id": "vscode-chat"}
 
 
+class TestCacheControlStripping:
+    """Only providers that honour ``cache_control`` may ever receive it.
+
+    Payloads built by ``_cached_text`` carry an Anthropic ``cache_control``
+    marker. Through LiteLLM that marker becomes a hard 400 on Gemini
+    ("CachedContent can not be used with system_instruction/tools") and is
+    wasted on other non-Anthropic providers, so ``_llm_call`` and
+    ``_llm_call_async`` remove it for every non-Anthropic model.
+    """
+
+    def test_accepts_for_anthropic_providers(self):
+        from openkb.agent.compiler import _accepts_cache_control as accepts
+
+        # Native Anthropic ids first, then Claude routed through OpenRouter.
+        assert accepts("anthropic/claude-sonnet-4-6")
+        assert accepts("claude-opus-4-6")
+        assert accepts("openrouter/anthropic/claude-3.5-sonnet")
+
+    def test_rejects_for_non_anthropic_providers(self):
+        from openkb.agent.compiler import _accepts_cache_control as accepts
+
+        for model in ("gemini/gemini-2.5-pro", "gpt-4o"):
+            assert not accepts(model)
+
+    def test_strip_removes_marker_without_mutating_input(self):
+        from openkb.agent.compiler import _cached_text, _strip_cache_control
+
+        system_msg = {"role": "system", "content": "plain string stays"}
+        user_msg = {"role": "user", "content": _cached_text("doc")}
+        original = [system_msg, user_msg]
+
+        system_out, user_out = _strip_cache_control(original)
+        # String content is left alone.
+        assert system_out["content"] == "plain string stays"
+        # The block keeps its text but loses the marker.
+        assert user_out["content"] == [{"type": "text", "text": "doc"}]
+        # The caller's message still carries the marker.
+        assert "cache_control" in original[1]["content"][0]
+
+    def test_llm_call_strips_marker_for_gemini(self):
+        from openkb.agent.compiler import _cached_text, _llm_call
+
+        payload = [{"role": "user", "content": _cached_text("doc")}]
+        fake = MagicMock(side_effect=_mock_completion(["ok"]))
+        with patch("openkb.agent.compiler.litellm.completion", fake):
+            _llm_call("gemini/gemini-2.5-pro", payload, "step")
+
+        first_block = fake.call_args.kwargs["messages"][0]["content"][0]
+        assert "cache_control" not in first_block
+        assert first_block["text"] == "doc"
+
+    def test_llm_call_keeps_marker_for_anthropic(self):
+        from openkb.agent.compiler import _cached_text, _llm_call
+
+        payload = [{"role": "user", "content": _cached_text("doc")}]
+        fake = MagicMock(side_effect=_mock_completion(["ok"]))
+        with patch("openkb.agent.compiler.litellm.completion", fake):
+            _llm_call("anthropic/claude-sonnet-4-6", payload, "step")
+        first_block = fake.call_args.kwargs["messages"][0]["content"][0]
+        assert first_block["cache_control"] == {"type": "ephemeral"}
+
+
 class TestFrontmatterDashBoundary:
     """Regression: description containing '---' must not truncate frontmatter."""