4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,7 @@
# 6.9.2 - 2025-11-10

- fix(llma): fix cache token double subtraction in Langchain for non-Anthropic providers causing negative costs

# 6.9.1 - 2025-11-07

- fix(error-tracking): pass code variables config from init to client
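Editorial note on the changelog entry above: a minimal arithmetic sketch of why subtracting cache-read tokens in the SDK for providers that already report an inclusive input count can double the adjustment and drive the number negative. The token counts, and the idea that the downstream cost pipeline applies its own cache adjustment, are illustrative assumptions, not taken from the diff.

# Illustrative numbers only; the second subtraction stands in for a cost
# pipeline that is assumed to apply its own cache adjustment downstream.
reported_input_tokens = 150   # OpenAI-style count, already includes cached tokens
cache_read_tokens = 100

sdk_adjusted = reported_input_tokens - cache_read_tokens   # 50 (old SDK behaviour, applied to every provider)
effective_uncached = sdk_adjusted - cache_read_tokens      # -50 once the same adjustment happens again

# With the fix, only Anthropic-style usage is adjusted in the SDK, so
# non-Anthropic providers keep the inclusive count and are adjusted once.
print(sdk_adjusted, effective_uncached)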
38 changes: 29 additions & 9 deletions posthog/ai/langchain/callbacks.py
@@ -575,7 +575,7 @@ def _capture_generation(
event_properties["$ai_is_error"] = True
else:
# Add usage
usage = _parse_usage(output)
usage = _parse_usage(output, run.provider, run.model)
event_properties["$ai_input_tokens"] = usage.input_tokens
event_properties["$ai_output_tokens"] = usage.output_tokens
event_properties["$ai_cache_creation_input_tokens"] = (
@@ -696,6 +696,8 @@ class ModelUsage:

def _parse_usage_model(
usage: Union[BaseModel, dict],
provider: Optional[str] = None,
model: Optional[str] = None,
) -> ModelUsage:
if isinstance(usage, BaseModel):
usage = usage.__dict__
@@ -764,16 +766,30 @@ def _parse_usage_model(
for mapped_key, dataclass_key in field_mapping.items()
},
)
# In LangChain, input_tokens is the sum of input and cache read tokens.
# Our cost calculation expects them to be separate, for Anthropic.
if normalized_usage.input_tokens and normalized_usage.cache_read_tokens:
# For Anthropic providers, LangChain reports input_tokens as the sum of input and cache read tokens.
# Our cost calculation expects them to be separate for Anthropic, so we subtract cache tokens.
# For other providers (OpenAI, etc.), input_tokens already includes cache tokens as expected.
# Match logic consistent with plugin-server: exact match on provider OR substring match on model
is_anthropic = False
if provider and provider.lower() == "anthropic":
is_anthropic = True
elif model and "anthropic" in model.lower():
is_anthropic = True

if (
is_anthropic
and normalized_usage.input_tokens
and normalized_usage.cache_read_tokens
):
normalized_usage.input_tokens = max(
normalized_usage.input_tokens - normalized_usage.cache_read_tokens, 0
)
return normalized_usage


def _parse_usage(response: LLMResult) -> ModelUsage:
def _parse_usage(
response: LLMResult, provider: Optional[str] = None, model: Optional[str] = None
) -> ModelUsage:
# langchain-anthropic uses the usage field
llm_usage_keys = ["token_usage", "usage"]
llm_usage: ModelUsage = ModelUsage(
@@ -787,21 +803,25 @@ def _parse_usage(response: LLMResult) -> ModelUsage:
if response.llm_output is not None:
for key in llm_usage_keys:
if response.llm_output.get(key):
llm_usage = _parse_usage_model(response.llm_output[key])
llm_usage = _parse_usage_model(
response.llm_output[key], provider, model
)
break

if hasattr(response, "generations"):
for generation in response.generations:
if "usage" in generation:
llm_usage = _parse_usage_model(generation["usage"])
llm_usage = _parse_usage_model(generation["usage"], provider, model)
break

for generation_chunk in generation:
if generation_chunk.generation_info and (
"usage_metadata" in generation_chunk.generation_info
):
llm_usage = _parse_usage_model(
generation_chunk.generation_info["usage_metadata"]
generation_chunk.generation_info["usage_metadata"],
provider,
model,
)
break

@@ -828,7 +848,7 @@ def _parse_usage(response: LLMResult) -> ModelUsage:
bedrock_anthropic_usage or bedrock_titan_usage or ollama_usage
)
if chunk_usage:
llm_usage = _parse_usage_model(chunk_usage)
llm_usage = _parse_usage_model(chunk_usage, provider, model)
break

return llm_usage
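To make the new rule easier to read outside the diff context, here is a minimal standalone sketch of the provider matching and the cache-token subtraction added to _parse_usage_model above. SimpleUsage, is_anthropic, and adjust_input_tokens are illustrative names, not part of the SDK; only the matching rule itself (exact match on the provider, or a substring match on the model name, with the result clamped at zero) mirrors the diff.

from dataclasses import dataclass
from typing import Optional


@dataclass
class SimpleUsage:
    # Simplified stand-in for the SDK's ModelUsage dataclass.
    input_tokens: int = 0
    cache_read_tokens: int = 0


def is_anthropic(provider: Optional[str], model: Optional[str]) -> bool:
    # Same rule as the diff: exact match on the provider name, or a substring
    # match on the model name (e.g. model ids that embed "anthropic").
    if provider and provider.lower() == "anthropic":
        return True
    return bool(model and "anthropic" in model.lower())


def adjust_input_tokens(
    usage: SimpleUsage, provider: Optional[str], model: Optional[str]
) -> SimpleUsage:
    # LangChain folds cache-read tokens into input_tokens for Anthropic, so only
    # then are they split back out, never letting the count go negative.
    if is_anthropic(provider, model) and usage.input_tokens and usage.cache_read_tokens:
        usage.input_tokens = max(usage.input_tokens - usage.cache_read_tokens, 0)
    return usage


print(adjust_input_tokens(SimpleUsage(1200, 800), "anthropic", "claude-3-sonnet").input_tokens)  # 400
print(adjust_input_tokens(SimpleUsage(150, 100), "openai", "gpt-4o").input_tokens)               # 150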
59 changes: 52 additions & 7 deletions posthog/test/ai/langchain/test_callbacks.py
@@ -1584,13 +1584,58 @@ def test_anthropic_cache_write_and_read_tokens(mock_client):
generation_props = generation_args["properties"]

assert generation_args["event"] == "$ai_generation"
assert generation_props["$ai_input_tokens"] == 400
assert (
generation_props["$ai_input_tokens"] == 1200
) # No provider metadata, no subtraction
assert generation_props["$ai_output_tokens"] == 30
assert generation_props["$ai_cache_creation_input_tokens"] == 0
assert generation_props["$ai_cache_read_input_tokens"] == 800
assert generation_props["$ai_reasoning_tokens"] == 0


def test_anthropic_provider_subtracts_cache_tokens(mock_client):
"""Test that Anthropic provider correctly subtracts cache tokens from input tokens."""
from langchain_core.outputs import LLMResult, ChatGeneration
from langchain_core.messages import AIMessage
from uuid import uuid4

cb = CallbackHandler(mock_client)
run_id = uuid4()

# Set up with Anthropic provider
cb._set_llm_metadata(
serialized={},
run_id=run_id,
messages=[{"role": "user", "content": "test"}],
metadata={"ls_provider": "anthropic", "ls_model_name": "claude-3-sonnet"},
)

# Response with cache tokens: 1200 input (includes 800 cached)
response = LLMResult(
generations=[
[
ChatGeneration(
message=AIMessage(content="Response"),
generation_info={
"usage_metadata": {
"input_tokens": 1200,
"output_tokens": 50,
"cache_read_input_tokens": 800,
}
},
)
]
],
llm_output={},
)

cb._pop_run_and_capture_generation(run_id, None, response)

generation_args = mock_client.capture.call_args_list[0][1]
assert generation_args["properties"]["$ai_input_tokens"] == 400 # 1200 - 800
assert generation_args["properties"]["$ai_cache_read_input_tokens"] == 800


def test_openai_cache_read_tokens(mock_client):
"""Test that OpenAI cache read tokens are captured correctly."""
prompt = ChatPromptTemplate.from_messages(
@@ -1626,7 +1671,7 @@ def test_openai_cache_read_tokens(mock_client):
generation_props = generation_args["properties"]

assert generation_args["event"] == "$ai_generation"
assert generation_props["$ai_input_tokens"] == 50
assert generation_props["$ai_input_tokens"] == 150 # No subtraction for OpenAI
assert generation_props["$ai_output_tokens"] == 40
assert generation_props["$ai_cache_read_input_tokens"] == 100
assert generation_props["$ai_cache_creation_input_tokens"] == 0
@@ -1708,7 +1753,7 @@ def test_combined_reasoning_and_cache_tokens(mock_client):
generation_props = generation_args["properties"]

assert generation_args["event"] == "$ai_generation"
assert generation_props["$ai_input_tokens"] == 200
assert generation_props["$ai_input_tokens"] == 500 # No subtraction for OpenAI
assert generation_props["$ai_output_tokens"] == 100
assert generation_props["$ai_cache_read_input_tokens"] == 300
assert generation_props["$ai_cache_creation_input_tokens"] == 0
@@ -1917,8 +1962,8 @@ def test_cache_read_tokens_subtraction_from_input_tokens(mock_client):
generation_props = generation_args["properties"]

assert generation_args["event"] == "$ai_generation"
# Input tokens should be reduced: 150 - 100 = 50
assert generation_props["$ai_input_tokens"] == 50
# Input tokens not reduced without provider metadata
assert generation_props["$ai_input_tokens"] == 150
assert generation_props["$ai_output_tokens"] == 40
assert generation_props["$ai_cache_read_input_tokens"] == 100

@@ -1959,8 +2004,8 @@ def test_cache_read_tokens_subtraction_prevents_negative(mock_client):
generation_props = generation_args["properties"]

assert generation_args["event"] == "$ai_generation"
# Input tokens should be 0, not negative: max(80 - 100, 0) = 0
assert generation_props["$ai_input_tokens"] == 0
# Input tokens not reduced without provider metadata
assert generation_props["$ai_input_tokens"] == 80
assert generation_props["$ai_output_tokens"] == 20
assert generation_props["$ai_cache_read_input_tokens"] == 100

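A possible follow-up, sketched here rather than taken from the diff: the matching rule could also be covered directly at the _parse_usage_model level, bypassing the callback plumbing. This assumes the function accepts a plain dict shaped like the usage_metadata payloads used in the tests above and that the returned ModelUsage exposes input_tokens and cache_read_tokens, as the diff suggests.

import pytest

from posthog.ai.langchain.callbacks import _parse_usage_model


@pytest.mark.parametrize(
    "provider,model,expected_input",
    [
        ("anthropic", "claude-3-sonnet", 400),    # exact provider match: subtract cache tokens
        (None, "anthropic.claude-3-haiku", 400),  # substring match on the model name: subtract
        ("openai", "gpt-4o", 1200),               # non-Anthropic provider: keep the inclusive count
        (None, None, 1200),                       # no metadata: keep the inclusive count
    ],
)
def test_cache_subtraction_only_for_anthropic(provider, model, expected_input):
    usage = {"input_tokens": 1200, "output_tokens": 50, "cache_read_input_tokens": 800}
    parsed = _parse_usage_model(usage, provider, model)
    assert parsed.input_tokens == expected_input
    assert parsed.cache_read_tokens == 800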
2 changes: 1 addition & 1 deletion posthog/version.py
@@ -1,4 +1,4 @@
VERSION = "6.9.1"
VERSION = "6.9.2"

if __name__ == "__main__":
print(VERSION, end="") # noqa: T201