Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 67 additions & 2 deletions openkb/agent/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,12 +262,75 @@ def _cached_text(text: str) -> list[dict]:
ephemeral cache_control marker.

LiteLLM passes the marker through to Anthropic (and OpenRouter →
Anthropic). For providers that ignore cache_control, the list-of-blocks
payload remains a valid OpenAI-compatible content shape.
Anthropic). For other providers the marker is stripped at the request
egress (see :func:`_strip_cache_control`, applied in :func:`_llm_call`),
because not every provider merely *ignores* it — Gemini in particular
turns it into a 400. The list-of-blocks payload that remains is a valid
OpenAI-compatible content shape.
"""
return [{"type": "text", "text": text, "cache_control": {"type": "ephemeral"}}]


def _accepts_cache_control(model: str) -> bool:
"""Whether ``model`` honours Anthropic-style ``cache_control`` markers.

The markers emitted by :func:`_cached_text` are an Anthropic feature.
LiteLLM forwards them to Anthropic directly, and to Anthropic (Claude)
models served via OpenRouter, Bedrock and Vertex. For other providers —
notably Gemini — LiteLLM instead translates the marker into a
provider-native cached-content object that conflicts with
``system_instruction``/``tools`` and makes *every* request fail with
``400 CachedContent can not be used with ...``. Detect the provider so the
marker can be dropped before it reaches such a backend.
"""
# Import the real symbol rather than going through the module-level
# ``litellm`` reference: provider detection must stay correct even when a
# caller patches ``openkb.agent.compiler.litellm`` to stub out completion.
from litellm import get_llm_provider

try:
provider = get_llm_provider(model)[1]
except Exception:
provider = ""
lowered = model.lower()
if provider == "anthropic":
return True
if provider in ("openrouter", "bedrock", "vertex_ai") and (
"claude" in lowered or "anthropic" in lowered
):
return True
return False


def _strip_cache_control(messages: list[dict]) -> list[dict]:
"""Return ``messages`` with every ``cache_control`` key removed.

Only list-of-blocks contents (see :func:`_cached_text`) can carry the
marker; plain-string contents pass through untouched. The input is not
mutated.
"""
cleaned: list[dict] = []
for msg in messages:
content = msg.get("content")
if isinstance(content, list):
blocks = [
{k: v for k, v in block.items() if k != "cache_control"}
if isinstance(block, dict)
else block
for block in content
]
msg = {**msg, "content": blocks}
cleaned.append(msg)
return cleaned


def _prepare_messages(model: str, messages: list[dict]) -> list[dict]:
    """Return the payload to send to ``model``.

    Models that honour ``cache_control`` get ``messages`` back unchanged;
    for every other model the markers are stripped first.
    """
    if not _accepts_cache_control(model):
        return _strip_cache_control(messages)
    return messages


class _Spinner:
"""Animated dots spinner that runs in a background thread."""

Expand Down Expand Up @@ -328,6 +391,7 @@ def _fmt_messages(messages: list[dict], max_content: int = 200) -> str:

def _llm_call(model: str, messages: list[dict], step_name: str, **kwargs) -> str:
"""Single LLM call with animated progress and debug logging."""
messages = _prepare_messages(model, messages)
extra_headers = get_extra_headers()
if extra_headers:
kwargs.setdefault("extra_headers", extra_headers)
Expand All @@ -353,6 +417,7 @@ def _llm_call(model: str, messages: list[dict], step_name: str, **kwargs) -> str

async def _llm_call_async(model: str, messages: list[dict], step_name: str, **kwargs) -> str:
"""Async LLM call with timing output and debug logging."""
messages = _prepare_messages(model, messages)
extra_headers = get_extra_headers()
if extra_headers:
kwargs.setdefault("extra_headers", extra_headers)
Expand Down
62 changes: 62 additions & 0 deletions tests/test_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2260,6 +2260,68 @@ async def test_llm_call_async_injects_extra_headers(self):
assert kwargs["extra_headers"] == {"Copilot-Integration-Id": "vscode-chat"}


class TestCacheControlStripping:
    """Only providers that honour ``cache_control`` may ever receive it.

    ``_cached_text`` attaches an Anthropic ``cache_control`` marker to its
    payload. For Gemini, LiteLLM turns that marker into a hard 400
    ("CachedContent can not be used with system_instruction/tools"); on other
    non-Anthropic providers it is silently wasted. ``_llm_call`` and
    ``_llm_call_async`` therefore strip it for every non-Anthropic model.
    Regression coverage for the bug where all Gemini compiles failed.
    """

    def test_accepts_for_anthropic_providers(self):
        """Anthropic models, direct or via OpenRouter, keep the marker."""
        from openkb.agent.compiler import _accepts_cache_control

        honoured = (
            "anthropic/claude-sonnet-4-6",
            "claude-opus-4-6",
            # Claude reached through OpenRouter still understands the marker.
            "openrouter/anthropic/claude-3.5-sonnet",
        )
        for model_name in honoured:
            assert _accepts_cache_control(model_name)

    def test_rejects_for_non_anthropic_providers(self):
        """Gemini and OpenAI models must not be sent the marker."""
        from openkb.agent.compiler import _accepts_cache_control

        for model_name in ("gemini/gemini-2.5-pro", "gpt-4o"):
            assert not _accepts_cache_control(model_name)

    def test_strip_removes_marker_without_mutating_input(self):
        """Stripping yields clean copies and leaves the caller's data alone."""
        from openkb.agent.compiler import _cached_text, _strip_cache_control

        system_msg = {"role": "system", "content": "plain string stays"}
        user_msg = {"role": "user", "content": _cached_text("doc")}
        original = [system_msg, user_msg]

        stripped = _strip_cache_control(original)

        # A plain-string content is passed through as-is.
        assert stripped[0]["content"] == "plain string stays"
        # The marker disappears while the text survives.
        assert stripped[1]["content"] == [{"type": "text", "text": "doc"}]
        # The caller's own message still carries the marker.
        assert "cache_control" in original[1]["content"][0]

    def test_llm_call_strips_marker_for_gemini(self):
        """A Gemini call reaches ``litellm.completion`` without the marker."""
        from openkb.agent.compiler import _cached_text, _llm_call

        completion_stub = MagicMock(side_effect=_mock_completion(["ok"]))
        with patch("openkb.agent.compiler.litellm.completion", completion_stub):
            payload = [{"role": "user", "content": _cached_text("doc")}]
            _llm_call("gemini/gemini-2.5-pro", payload, "step")

        forwarded = completion_stub.call_args.kwargs["messages"]
        first_block = forwarded[0]["content"][0]
        assert "cache_control" not in first_block
        assert first_block["text"] == "doc"

    def test_llm_call_keeps_marker_for_anthropic(self):
        """An Anthropic call reaches ``litellm.completion`` with the marker."""
        from openkb.agent.compiler import _cached_text, _llm_call

        completion_stub = MagicMock(side_effect=_mock_completion(["ok"]))
        with patch("openkb.agent.compiler.litellm.completion", completion_stub):
            payload = [{"role": "user", "content": _cached_text("doc")}]
            _llm_call("anthropic/claude-sonnet-4-6", payload, "step")

        forwarded = completion_stub.call_args.kwargs["messages"]
        assert forwarded[0]["content"][0]["cache_control"] == {"type": "ephemeral"}


class TestFrontmatterDashBoundary:
"""Regression: description containing '---' must not truncate frontmatter."""

Expand Down