From 9cb1a8e5f11990f87d224d75aa6fc5287b09654f Mon Sep 17 00:00:00 2001
From: Carlos Marchal
Date: Mon, 10 Nov 2025 20:19:20 +0100
Subject: [PATCH] fix: cache token double subtraction for non-Anthropic
 providers

PR #346 introduced cache token subtraction for all providers, causing
double subtraction for OpenAI/OpenRouter and resulting in negative costs.
The plugin-server only subtracts cache tokens for Anthropic providers
(exact match on the provider name or substring match on the model name).
This fix aligns the Python SDK with that behavior.

Changes:
- Only subtract cache tokens when provider="anthropic" OR the model name contains "anthropic"
- Pass provider and model metadata to the usage parsing functions
- Update tests to reflect the correct behavior (no subtraction for OpenAI)
- Add a test for Anthropic provider subtraction

Fixes negative cost calculations for users on OpenAI/OpenRouter with cached tokens.
---
 CHANGELOG.md                                |  4 ++
 posthog/ai/langchain/callbacks.py           | 38 +++++++++----
 posthog/test/ai/langchain/test_callbacks.py | 59 ++++++++++++++++++---
 posthog/version.py                          |  2 +-
 4 files changed, 86 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f73af971..2ca5bda5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# 6.9.2 - 2025-11-10
+
+- fix(llma): fix cache token double subtraction in LangChain for non-Anthropic providers causing negative costs
+
 # 6.9.1 - 2025-11-07
 
 - fix(error-tracking): pass code variables config from init to client
diff --git a/posthog/ai/langchain/callbacks.py b/posthog/ai/langchain/callbacks.py
index 065a15c8..2132b3ae 100644
--- a/posthog/ai/langchain/callbacks.py
+++ b/posthog/ai/langchain/callbacks.py
@@ -575,7 +575,7 @@ def _capture_generation(
             event_properties["$ai_is_error"] = True
         else:
             # Add usage
-            usage = _parse_usage(output)
+            usage = _parse_usage(output, run.provider, run.model)
             event_properties["$ai_input_tokens"] = usage.input_tokens
             event_properties["$ai_output_tokens"] = usage.output_tokens
             event_properties["$ai_cache_creation_input_tokens"] = (
@@ -696,6 +696,8 @@ class ModelUsage:
 
 def _parse_usage_model(
     usage: Union[BaseModel, dict],
+    provider: Optional[str] = None,
+    model: Optional[str] = None,
 ) -> ModelUsage:
     if isinstance(usage, BaseModel):
         usage = usage.__dict__
@@ -764,16 +766,30 @@ def _parse_usage_model(
             for mapped_key, dataclass_key in field_mapping.items()
         },
     )
-    # In LangChain, input_tokens is the sum of input and cache read tokens.
-    # Our cost calculation expects them to be separate, for Anthropic.
-    if normalized_usage.input_tokens and normalized_usage.cache_read_tokens:
+    # For Anthropic providers, LangChain reports input_tokens as the sum of input and cache read tokens.
+    # Our cost calculation expects them to be separate for Anthropic, so we subtract cache tokens.
+    # For other providers (OpenAI, etc.), input_tokens already includes cache tokens as expected.
+    # Match logic consistent with plugin-server: exact match on provider OR substring match on model
+    is_anthropic = False
+    if provider and provider.lower() == "anthropic":
+        is_anthropic = True
+    elif model and "anthropic" in model.lower():
+        is_anthropic = True
+
+    if (
+        is_anthropic
+        and normalized_usage.input_tokens
+        and normalized_usage.cache_read_tokens
+    ):
         normalized_usage.input_tokens = max(
             normalized_usage.input_tokens - normalized_usage.cache_read_tokens, 0
         )
     return normalized_usage
 
 
-def _parse_usage(response: LLMResult) -> ModelUsage:
+def _parse_usage(
+    response: LLMResult, provider: Optional[str] = None, model: Optional[str] = None
+) -> ModelUsage:
     # langchain-anthropic uses the usage field
     llm_usage_keys = ["token_usage", "usage"]
     llm_usage: ModelUsage = ModelUsage(
@@ -787,13 +803,15 @@ def _parse_usage(response: LLMResult) -> ModelUsage:
     if response.llm_output is not None:
         for key in llm_usage_keys:
             if response.llm_output.get(key):
-                llm_usage = _parse_usage_model(response.llm_output[key])
+                llm_usage = _parse_usage_model(
+                    response.llm_output[key], provider, model
+                )
                 break
 
     if hasattr(response, "generations"):
         for generation in response.generations:
             if "usage" in generation:
-                llm_usage = _parse_usage_model(generation["usage"])
+                llm_usage = _parse_usage_model(generation["usage"], provider, model)
                 break
 
             for generation_chunk in generation:
                 if (
                     "usage_metadata" in generation_chunk.generation_info
                 ):
                     llm_usage = _parse_usage_model(
-                        generation_chunk.generation_info["usage_metadata"]
+                        generation_chunk.generation_info["usage_metadata"],
+                        provider,
+                        model,
                     )
                     break
 
@@ -828,7 +848,7 @@ def _parse_usage(response: LLMResult) -> ModelUsage:
                     bedrock_anthropic_usage or bedrock_titan_usage or ollama_usage
                 )
                 if chunk_usage:
-                    llm_usage = _parse_usage_model(chunk_usage)
+                    llm_usage = _parse_usage_model(chunk_usage, provider, model)
                     break
 
     return llm_usage
diff --git a/posthog/test/ai/langchain/test_callbacks.py b/posthog/test/ai/langchain/test_callbacks.py
index 51d8339b..c3cf41e1 100644
--- a/posthog/test/ai/langchain/test_callbacks.py
+++ b/posthog/test/ai/langchain/test_callbacks.py
@@ -1584,13 +1584,58 @@ def test_anthropic_cache_write_and_read_tokens(mock_client):
     generation_props = generation_args["properties"]
 
     assert generation_args["event"] == "$ai_generation"
-    assert generation_props["$ai_input_tokens"] == 400
+    assert (
+        generation_props["$ai_input_tokens"] == 1200
+    )  # No provider metadata, no subtraction
     assert generation_props["$ai_output_tokens"] == 30
     assert generation_props["$ai_cache_creation_input_tokens"] == 0
     assert generation_props["$ai_cache_read_input_tokens"] == 800
     assert generation_props["$ai_reasoning_tokens"] == 0
 
 
+def test_anthropic_provider_subtracts_cache_tokens(mock_client):
+    """Test that Anthropic provider correctly subtracts cache tokens from input tokens."""
+    from langchain_core.outputs import LLMResult, ChatGeneration
+    from langchain_core.messages import AIMessage
+    from uuid import uuid4
+
+    cb = CallbackHandler(mock_client)
+    run_id = uuid4()
+
+    # Set up with Anthropic provider
+    cb._set_llm_metadata(
+        serialized={},
+        run_id=run_id,
+        messages=[{"role": "user", "content": "test"}],
+        metadata={"ls_provider": "anthropic", "ls_model_name": "claude-3-sonnet"},
+    )
+
+    # Response with cache tokens: 1200 input (includes 800 cached)
+    response = LLMResult(
+        generations=[
+            [
+                ChatGeneration(
+                    message=AIMessage(content="Response"),
+                    generation_info={
+                        "usage_metadata": {
+                            "input_tokens": 1200,
+                            "output_tokens": 50,
+                            "cache_read_input_tokens": 800,
+                        }
+                    },
+                )
+            ]
+        ],
+        llm_output={},
+    )
+
+    cb._pop_run_and_capture_generation(run_id, None, response)
+
+    generation_args = mock_client.capture.call_args_list[0][1]
+    assert generation_args["properties"]["$ai_input_tokens"] == 400  # 1200 - 800
+    assert generation_args["properties"]["$ai_cache_read_input_tokens"] == 800
+
+
 def test_openai_cache_read_tokens(mock_client):
     """Test that OpenAI cache read tokens are captured correctly."""
     prompt = ChatPromptTemplate.from_messages(
@@ -1626,7 +1671,7 @@ def test_openai_cache_read_tokens(mock_client):
     generation_props = generation_args["properties"]
 
     assert generation_args["event"] == "$ai_generation"
-    assert generation_props["$ai_input_tokens"] == 50
+    assert generation_props["$ai_input_tokens"] == 150  # No subtraction for OpenAI
     assert generation_props["$ai_output_tokens"] == 40
     assert generation_props["$ai_cache_read_input_tokens"] == 100
     assert generation_props["$ai_cache_creation_input_tokens"] == 0
@@ -1708,7 +1753,7 @@ def test_combined_reasoning_and_cache_tokens(mock_client):
     generation_props = generation_args["properties"]
 
     assert generation_args["event"] == "$ai_generation"
-    assert generation_props["$ai_input_tokens"] == 200
+    assert generation_props["$ai_input_tokens"] == 500  # No subtraction for OpenAI
     assert generation_props["$ai_output_tokens"] == 100
     assert generation_props["$ai_cache_read_input_tokens"] == 300
     assert generation_props["$ai_cache_creation_input_tokens"] == 0
@@ -1917,8 +1962,8 @@ def test_cache_read_tokens_subtraction_from_input_tokens(mock_client):
     generation_props = generation_args["properties"]
 
     assert generation_args["event"] == "$ai_generation"
-    # Input tokens should be reduced: 150 - 100 = 50
-    assert generation_props["$ai_input_tokens"] == 50
+    # Input tokens not reduced without provider metadata
+    assert generation_props["$ai_input_tokens"] == 150
     assert generation_props["$ai_output_tokens"] == 40
     assert generation_props["$ai_cache_read_input_tokens"] == 100
 
@@ -1959,8 +2004,8 @@ def test_cache_read_tokens_subtraction_prevents_negative(mock_client):
     generation_props = generation_args["properties"]
 
     assert generation_args["event"] == "$ai_generation"
-    # Input tokens should be 0, not negative: max(80 - 100, 0) = 0
-    assert generation_props["$ai_input_tokens"] == 0
+    # Input tokens not reduced without provider metadata
+    assert generation_props["$ai_input_tokens"] == 80
     assert generation_props["$ai_output_tokens"] == 20
     assert generation_props["$ai_cache_read_input_tokens"] == 100
 
diff --git a/posthog/version.py b/posthog/version.py
index 94bfc721..a6a3bbe9 100644
--- a/posthog/version.py
+++ b/posthog/version.py
@@ -1,4 +1,4 @@
-VERSION = "6.9.1"
+VERSION = "6.9.2"
 
 if __name__ == "__main__":
     print(VERSION, end="")  # noqa: T201