diff --git a/openhands-sdk/openhands/sdk/agent/agent.py b/openhands-sdk/openhands/sdk/agent/agent.py
index ad9f418c9..8408b33cd 100644
--- a/openhands-sdk/openhands/sdk/agent/agent.py
+++ b/openhands-sdk/openhands/sdk/agent/agent.py
@@ -216,6 +216,17 @@ def step(
         # LLMResponse already contains the converted message and metrics snapshot
         message: Message = llm_response.message
 
+        # Check if this is a reasoning-only response (e.g., from reasoning models)
+        # or a message-only response without tool calls
+        has_reasoning = (
+            message.responses_reasoning_item is not None
+            or message.reasoning_content is not None
+            or (message.thinking_blocks and len(message.thinking_blocks) > 0)
+        )
+        has_content = any(
+            isinstance(c, TextContent) and c.text.strip() for c in message.content
+        )
+
         if message.tool_calls and len(message.tool_calls) > 0:
             if not all(isinstance(c, TextContent) for c in message.content):
                 logger.warning(
@@ -254,16 +265,18 @@
 
             if action_events:
                 self._execute_actions(conversation, action_events, on_event)
+            return
 
-        else:
-            logger.debug("LLM produced a message response - awaits user input")
-            state.execution_status = ConversationExecutionStatus.FINISHED
-            msg_event = MessageEvent(
-                source="agent",
-                llm_message=message,
-                llm_response_id=llm_response.id,
-            )
-            on_event(msg_event)
+        # No tool calls - emit message event for reasoning or content responses
+        if not has_reasoning and not has_content:
+            logger.warning("LLM produced empty response - continuing agent loop")
+
+        msg_event = MessageEvent(
+            source="agent",
+            llm_message=message,
+            llm_response_id=llm_response.id,
+        )
+        on_event(msg_event)
 
         # If using VLLM, we can get the raw prompt and response tokens
         # that can be useful for RL training.
@@ -279,6 +292,13 @@
             )
             on_event(token_event)
 
+        # Finish conversation if LLM produced content (awaits user input)
+        # Continue if only reasoning without content (e.g., GPT-5 codex thinking)
+        if has_content:
+            logger.debug("LLM produced a message response - awaits user input")
+            state.execution_status = ConversationExecutionStatus.FINISHED
+            return
+
     def _requires_user_confirmation(
         self, state: ConversationState, action_events: list[ActionEvent]
     ) -> bool:
diff --git a/tests/sdk/agent/test_nonexistent_tool_handling.py b/tests/sdk/agent/test_nonexistent_tool_handling.py
index 3c4394ce6..f11179216 100644
--- a/tests/sdk/agent/test_nonexistent_tool_handling.py
+++ b/tests/sdk/agent/test_nonexistent_tool_handling.py
@@ -14,7 +14,7 @@
 from openhands.sdk.agent import Agent
 from openhands.sdk.conversation import Conversation
 from openhands.sdk.conversation.state import ConversationExecutionStatus
-from openhands.sdk.event import AgentErrorEvent, MessageEvent
+from openhands.sdk.event import ActionEvent, AgentErrorEvent
 from openhands.sdk.llm import LLM, Message, TextContent
 
 
@@ -232,7 +232,7 @@ def mock_llm_response(messages, **kwargs):
                 object="chat.completion",
             )
         else:
-            # Second call: respond normally after seeing the error
+            # Second call: respond with finish tool
             return ModelResponse(
                 id="mock-response-2",
                 choices=[
@@ -240,11 +240,22 @@
                         index=0,
                         message=LiteLLMMessage(
                             role="assistant",
-                            content=(
-                                "I see there was an error. Let me respond normally now."
-                            ),
+                            content=None,
+                            tool_calls=[
+                                ChatCompletionMessageToolCall(
+                                    id="finish-call-1",
+                                    type="function",
+                                    function=Function(
+                                        name="finish",
+                                        arguments=(
+                                            '{"message": "I see there '
+                                            'was an error. Task completed."}'
+                                        ),
+                                    ),
+                                )
+                            ],
                         ),
-                        finish_reason="stop",
+                        finish_reason="tool_calls",
                     )
                 ],
                 created=0,
@@ -283,21 +294,18 @@ def event_callback(event):
         != ConversationExecutionStatus.FINISHED
     )
 
-    # Run second step - should continue normally
+    # Run second step - should call finish tool
     agent.step(conversation, on_event=event_callback)
 
-    # Verify we got a message event from the second response
+    # Verify we got an action event for the finish tool
-    message_events = [
+    action_events = [
         e
         for e in collected_events
-        if isinstance(e, MessageEvent) and e.source == "agent"
+        if isinstance(e, ActionEvent)
+        and e.source == "agent"
+        and e.tool_name == "finish"
     ]
-    assert len(message_events) == 1
-
-    message_event = message_events[0]
-    content_text = message_event.llm_message.content[0]
-    assert isinstance(content_text, TextContent)
-    assert "respond normally" in content_text.text
+    assert len(action_events) == 1
 
     # Now the conversation should be finished
     with conversation.state:
diff --git a/tests/sdk/agent/test_reasoning_only_responses.py b/tests/sdk/agent/test_reasoning_only_responses.py
new file mode 100644
index 000000000..07f590252
--- /dev/null
+++ b/tests/sdk/agent/test_reasoning_only_responses.py
@@ -0,0 +1,200 @@
+"""Test agent behavior with reasoning-only responses (e.g., GPT-5 codex)."""
+
+from unittest.mock import MagicMock
+
+from litellm.types.utils import ModelResponse
+from pydantic import PrivateAttr
+
+from openhands.sdk.agent import Agent
+from openhands.sdk.conversation import Conversation
+from openhands.sdk.conversation.state import ConversationExecutionStatus
+from openhands.sdk.event.llm_convertible.message import MessageEvent
+from openhands.sdk.llm import LLM, LLMResponse, Message, MessageToolCall, TextContent
+from openhands.sdk.llm.utils.metrics import MetricsSnapshot, TokenUsage
+
+
+class ReasoningOnlyLLM(LLM):
+    """Test LLM that returns reasoning-only response first, then finish."""
+
+    _call_count: int = PrivateAttr(default=0)
+
+    def __init__(self):
+        super().__init__(model="test-model")
+
+    def completion(  # type: ignore[override]
+        self, *, messages, tools=None, **kwargs
+    ) -> LLMResponse:
+        self._call_count += 1
+
+        if self._call_count == 1:
+            # First call: return reasoning-only response
+            message = Message(role="assistant")
+            message.reasoning_content = "Let me think about this..."
+            return LLMResponse(
+                message=message,
+                metrics=MetricsSnapshot(
+                    model_name="test",
+                    accumulated_cost=0.0,
+                    max_budget_per_task=0.0,
+                    accumulated_token_usage=TokenUsage(model="test"),
+                ),
+                raw_response=MagicMock(spec=ModelResponse, id="r1"),
+            )
+        else:
+            # Second call: return finish action
+            message = Message(role="assistant")
+            message.tool_calls = [
+                MessageToolCall(
+                    id="finish-call-1",
+                    name="finish",
+                    arguments='{"message": "Task completed"}',
+                    origin="completion",
+                )
+            ]
+            return LLMResponse(
+                message=message,
+                metrics=MetricsSnapshot(
+                    model_name="test",
+                    accumulated_cost=0.0,
+                    max_budget_per_task=0.0,
+                    accumulated_token_usage=TokenUsage(model="test"),
+                ),
+                raw_response=MagicMock(spec=ModelResponse, id="r2"),
+            )
+
+
+def test_agent_continues_after_reasoning_only_response():
+    """Test that agent continues looping after receiving reasoning-only response."""
+    llm = ReasoningOnlyLLM()
+    agent = Agent(llm=llm, tools=[])
+    conversation = Conversation(agent=agent)
+
+    # Send initial user message
+    conversation.send_message("Please solve this task")
+
+    # Run the conversation
+    conversation.run()
+
+    # Verify agent was called twice (reasoning-only, then finish)
+    assert llm._call_count == 2
+
+    # Verify conversation finished
+    assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED
+
+
+class ContentOnlyLLM(LLM):
+    """Test LLM that returns content-only response (should finish immediately)."""
+
+    _call_count: int = PrivateAttr(default=0)
+
+    def __init__(self):
+        super().__init__(model="test-model")
+
+    def completion(  # type: ignore[override]
+        self, *, messages, tools=None, **kwargs
+    ) -> LLMResponse:
+        self._call_count += 1
+
+        # Return content-only response - should finish conversation immediately
+        message = Message(role="assistant")
+        message.content = [TextContent(text="I'm thinking about this...")]
+        return LLMResponse(
+            message=message,
+            metrics=MetricsSnapshot(
+                model_name="test",
+                accumulated_cost=0.0,
+                max_budget_per_task=0.0,
+                accumulated_token_usage=TokenUsage(model="test"),
+            ),
+            raw_response=MagicMock(spec=ModelResponse, id="r1"),
+        )
+
+
+def test_agent_finishes_after_content_only_response():
+    """Test that agent finishes immediately after receiving content-only response."""
+    llm = ContentOnlyLLM()
+    agent = Agent(llm=llm, tools=[])
+    conversation = Conversation(agent=agent)
+
+    conversation.send_message("Analyze this")
+    conversation.run()
+
+    # Verify agent was called once - content responses finish immediately
+    assert llm._call_count == 1
+    assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED
+
+    # Verify the content message was emitted
+    msg_events = [
+        e
+        for e in conversation.state.events
+        if isinstance(e, MessageEvent) and e.source == "agent"
+    ]
+    assert len(msg_events) == 1
+    assert any(
+        isinstance(c, TextContent) and c.text == "I'm thinking about this..."
+        for c in msg_events[0].llm_message.content
+    )
+
+
+class EmptyResponseLLM(LLM):
+    """Test LLM that returns empty response first, then finish."""
+
+    _call_count: int = PrivateAttr(default=0)
+
+    def __init__(self):
+        super().__init__(model="test-model")
+
+    def completion(  # type: ignore[override]
+        self, *, messages, tools=None, **kwargs
+    ) -> LLMResponse:
+        self._call_count += 1
+
+        if self._call_count == 1:
+            # First call: return empty response (edge case)
+            message = Message(role="assistant")
+            message.content = []
+            return LLMResponse(
+                message=message,
+                metrics=MetricsSnapshot(
+                    model_name="test",
+                    accumulated_cost=0.0,
+                    max_budget_per_task=0.0,
+                    accumulated_token_usage=TokenUsage(model="test"),
+                ),
+                raw_response=MagicMock(spec=ModelResponse, id="r1"),
+            )
+        else:
+            # Second call: return finish action
+            message = Message(role="assistant")
+            message.tool_calls = [
+                MessageToolCall(
+                    id="finish-call-3",
+                    name="finish",
+                    arguments='{"message": "Done"}',
+                    origin="completion",
+                )
+            ]
+            return LLMResponse(
+                message=message,
+                metrics=MetricsSnapshot(
+                    model_name="test",
+                    accumulated_cost=0.0,
+                    max_budget_per_task=0.0,
+                    accumulated_token_usage=TokenUsage(model="test"),
+                ),
+                raw_response=MagicMock(spec=ModelResponse, id="r2"),
+            )
+
+
+def test_agent_handles_empty_response():
+    """Test that agent continues even with completely empty response."""
+    llm = EmptyResponseLLM()
+    agent = Agent(llm=llm, tools=[])
+    conversation = Conversation(agent=agent)
+
+    conversation.send_message("Test")
+    conversation.run()
+
+    # Verify agent continued after empty response
+    assert llm._call_count == 2
+    assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED