diff --git a/src/eva/assistant/agentic/audit_log.py b/src/eva/assistant/agentic/audit_log.py
index 8913f0cb..b9536da2 100644
--- a/src/eva/assistant/agentic/audit_log.py
+++ b/src/eva/assistant/agentic/audit_log.py
@@ -36,6 +36,7 @@ class ConversationMessage(BaseModel):
     tool_call_id: Optional[str] = None
     name: Optional[str] = None  # For tool messages
     turn_id: Optional[int] = None  # For associating transcription updates
+    reasoning: Optional[str] = None  # For model reasoning (e.g., from OpenAI o1)
 
     def to_dict(self) -> dict[str, Any]:
         """Convert to a plain dict, excluding None fields and internal tracking fields."""
@@ -268,6 +269,9 @@ def append_llm_call(self, llm_call: LLMCall, agent_name: Optional[str] = None) -> None:
             "timestamp": current_timestamp_ms(),
             "message_type": "llm_call",
         }
+        # Add reasoning if present
+        if llm_call.response and llm_call.response.reasoning:
+            transcript_entry["value"]["reasoning"] = llm_call.response.reasoning
         self.transcript.append(transcript_entry)
 
     def append_tool_call(
diff --git a/src/eva/assistant/agentic/system.py b/src/eva/assistant/agentic/system.py
index da857933..34dbe906 100644
--- a/src/eva/assistant/agentic/system.py
+++ b/src/eva/assistant/agentic/system.py
@@ -207,6 +207,7 @@ async def _run_tool_loop(
                 "latency": llm_stats.get("latency", 0.0),
                 "parameters": json.dumps(llm_stats.get("parameters", {})),
                 "tool_calls": json.dumps(response_tool_calls_for_stats) if response_tool_calls_for_stats else "",
+                "reasoning": f'"{llm_stats.get("reasoning") or ""}"',
             }
             self.agent_perf_stats.append(perf_stat)
             logger.debug(
@@ -217,6 +218,7 @@
                 role=MessageRole.ASSISTANT,
                 content=response_content,
                 tool_calls=tool_calls_dicts if tool_calls_dicts else None,
+                reasoning=llm_stats.get("reasoning"),
             )
 
             llm_call = LLMCall(
@@ -381,6 +383,7 @@ def save_agent_perf_stats(self) -> None:
                 "parameters",
                 "tool_calls",
                 "latency",
+                "reasoning",
             ]
             writer = csv.DictWriter(f, fieldnames=fieldnames)
             writer.writeheader()
diff --git a/src/eva/assistant/pipeline/alm_vllm.py b/src/eva/assistant/pipeline/alm_vllm.py
index a7108808..657ca4c3 100644
--- a/src/eva/assistant/pipeline/alm_vllm.py
+++ b/src/eva/assistant/pipeline/alm_vllm.py
@@ -198,6 +198,9 @@ async def complete(
         message = response.choices[0].message
         usage = response.usage
 
+        # Extract reasoning if present (OpenAI o1 and compatible models)
+        reasoning = getattr(message, "reasoning_content", None)
+
         stats = {
             "prompt_tokens": usage.prompt_tokens if usage else 0,
             "completion_tokens": usage.completion_tokens if usage else 0,
@@ -206,6 +209,7 @@
             "cost": 0.0,  # Self-hosted, no API cost
             "cost_source": "self_hosted",
             "latency": round(elapsed, 3),
+            "reasoning": reasoning,
         }
 
         if hasattr(message, "tool_calls") and message.tool_calls:
diff --git a/src/eva/assistant/services/llm.py b/src/eva/assistant/services/llm.py
index cb729289..4d941304 100644
--- a/src/eva/assistant/services/llm.py
+++ b/src/eva/assistant/services/llm.py
@@ -80,6 +80,9 @@ async def complete(
         response_cost = hidden_params.get("response_cost")
         cost_source = "litellm"
 
+        # Extract reasoning if present (OpenAI o1 and compatible models)
+        reasoning = getattr(message, "reasoning_content", None)
+
         stats = {
             "prompt_tokens": prompt_tokens,
             "completion_tokens": completion_tokens,
@@ -88,6 +91,7 @@
             "cost": response_cost,
             "cost_source": cost_source,
             "latency": round(elapsed_time, 3),
+            "reasoning": reasoning,
         }
 
         if hasattr(message, "tool_calls") and message.tool_calls:
diff --git a/tests/unit/assistant/test_audit_log.py b/tests/unit/assistant/test_audit_log.py
index 34b45172..7d2101d0 100644
--- a/tests/unit/assistant/test_audit_log.py
+++ b/tests/unit/assistant/test_audit_log.py
@@ -138,6 +138,27 @@ def test_append_llm_call_no_response(self):
         assert self.log.llm_prompts[0]["response"] == ""
         assert self.log.llm_prompts[0]["response_message"] is None
 
+    def test_append_llm_call_with_reasoning(self):
+        response_msg = ConversationMessage(
+            role=MessageRole.ASSISTANT, content="Sure!", reasoning="I thought about this carefully..."
+        )
+        llm_call = LLMCall(
+            messages=[{"role": "user", "content": "Hi"}],
+            response=response_msg,
+            duration_seconds=1.5,
+            start_time="100",
+            end_time="200",
+            model="o1-preview",
+            latency_ms=1500.0,
+        )
+        self.log.append_llm_call(llm_call, agent_name="TestAgent")
+
+        # Check that reasoning is added to transcript entry
+        assert len(self.log.transcript) == 1
+        assert "reasoning" in self.log.transcript[0]["value"]
+        assert self.log.transcript[0]["value"]["reasoning"] == "I thought about this carefully..."
+        assert self.log.transcript[0]["value"]["response"] == "Sure!"
+
     def test_append_tool_call_without_response(self):
         self.log.append_tool_call("search", {"query": "test"})
         assert len(self.log.transcript) == 1
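
Reviewer note, not part of the patch: a minimal sketch of what the getattr-based
extraction added in alm_vllm.py and llm.py does, using types.SimpleNamespace as a
hypothetical stand-in for an OpenAI-style message object. It assumes the provider
exposes reasoning under a reasoning_content attribute, as reasoning-capable
backends served through vLLM or LiteLLM typically do; plain chat models simply
omit the attribute.

    from types import SimpleNamespace

    # Message from a reasoning model: reasoning_content is present.
    msg_with = SimpleNamespace(content="Sure!", reasoning_content="I thought about this...")
    # Message from a standard model: the attribute is absent.
    msg_without = SimpleNamespace(content="Sure!")

    # Mirrors the patched complete() implementations: getattr with a None
    # default degrades gracefully when the attribute is missing, so the
    # stats dict carries reasoning=None instead of raising AttributeError.
    assert getattr(msg_with, "reasoning_content", None) == "I thought about this..."
    assert getattr(msg_without, "reasoning_content", None) is None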