From b6aadc8c233de5f11e537fb2083146f06a36028a Mon Sep 17 00:00:00 2001
From: openhands
Date: Wed, 26 Nov 2025 03:42:28 +0000
Subject: [PATCH 1/3] Add documentation for LLM streaming feature

Document the new LLM streaming capability that allows token-by-token display of responses in real-time.

Co-authored-by: openhands
---
 sdk/guides/llm-streaming.mdx | 215 +++++++++++++++++++++++++++++++++++
 1 file changed, 215 insertions(+)
 create mode 100644 sdk/guides/llm-streaming.mdx

diff --git a/sdk/guides/llm-streaming.mdx b/sdk/guides/llm-streaming.mdx
new file mode 100644
index 00000000..79a4b777
--- /dev/null
+++ b/sdk/guides/llm-streaming.mdx
@@ -0,0 +1,215 @@
+---
+title: LLM Streaming
+description: Stream LLM responses token-by-token for real-time display and interactive user experiences.
+---
+
+Enable real-time display of LLM responses as they're generated, token by token. This guide demonstrates how to use streaming callbacks to process and display tokens as they arrive from the language model.
+
+## Overview
+
+
+This example is available on GitHub: [examples/01_standalone_sdk/29_llm_streaming.py](https://github.com/OpenHands/software-agent-sdk/blob/main/examples/01_standalone_sdk/29_llm_streaming.py)
+
+
+Streaming allows you to display LLM responses progressively as the model generates them, rather than waiting for the complete response. This creates a more responsive user experience, especially for long-form content generation.
+
+```python icon="python" expandable examples/01_standalone_sdk/29_llm_streaming.py
+import os
+import sys
+
+from pydantic import SecretStr
+
+from openhands.sdk import (
+    Conversation,
+    get_logger,
+)
+from openhands.sdk.llm import LLM
+from openhands.sdk.llm.streaming import ModelResponseStream
+from openhands.tools.preset.default import get_default_agent
+
+
+logger = get_logger(__name__)
+
+
+api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY")
+if not api_key:
+    raise RuntimeError("Set LLM_API_KEY or OPENAI_API_KEY in your environment.")
+
+model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929")
+base_url = os.getenv("LLM_BASE_URL")
+llm = LLM(
+    model=model,
+    api_key=SecretStr(api_key),
+    base_url=base_url,
+    usage_id="stream-demo",
+    stream=True,
+)
+
+agent = get_default_agent(llm=llm, cli_mode=True)
+
+
+def on_token(chunk: ModelResponseStream) -> None:
+    choices = chunk.choices
+    for choice in choices:
+        delta = choice.delta
+        if delta is not None:
+            content = getattr(delta, "content", None)
+            if isinstance(content, str):
+                sys.stdout.write(content)
+                sys.stdout.flush()
+
+
+conversation = Conversation(
+    agent=agent,
+    workspace=os.getcwd(),
+    token_callbacks=[on_token],
+)
+
+story_prompt = (
+    "Tell me a long story about LLM streaming, make sure it has multiple paragraphs. "
+)
+conversation.send_message(story_prompt)
+print("Token Streaming:")
+print("-" * 100 + "\n")
+conversation.run()
+
+cleanup_prompt = (
+    "Thank you. Please delete the streaming story file now that I've read it, "
+    "then confirm the deletion."
+)
+conversation.send_message(cleanup_prompt)
+print("Token Streaming:")
+print("-" * 100 + "\n")
+conversation.run()
+```
+
+```bash Running the Example
+export LLM_API_KEY="your-api-key"
+export LLM_MODEL="anthropic/claude-sonnet-4-5-20250929"
+cd agent-sdk
+uv run python examples/01_standalone_sdk/29_llm_streaming.py
+```
+
+## How It Works
+
+### 1. Enable Streaming on LLM
+
+Configure the LLM with streaming enabled:
+
+```python highlight={6}
+llm = LLM(
+    model="anthropic/claude-sonnet-4-5-20250929",
+    api_key=SecretStr(api_key),
+    base_url=base_url,
+    usage_id="stream-demo",
+    stream=True,  # Enable streaming
+)
+```
+
+### 2. Define Token Callback
+
+Create a callback function that processes streaming chunks as they arrive:
+
+```python highlight={1-9}
+def on_token(chunk: ModelResponseStream) -> None:
+    """Process each streaming chunk as it arrives."""
+    choices = chunk.choices
+    for choice in choices:
+        delta = choice.delta
+        if delta is not None:
+            content = getattr(delta, "content", None)
+            if isinstance(content, str):
+                sys.stdout.write(content)
+                sys.stdout.flush()
+```
+
+The callback receives a `ModelResponseStream` object containing:
+- **`choices`**: List of response choices from the model
+- **`delta`**: Incremental content changes for each choice
+- **`content`**: The actual text tokens being streamed
+
+### 3. Register Callback with Conversation
+
+Pass your token callback to the conversation:
+
+```python highlight={3}
+conversation = Conversation(
+    agent=agent,
+    token_callbacks=[on_token],  # Register streaming callback
+    workspace=os.getcwd(),
+)
+```
+
+The `token_callbacks` parameter accepts a list of callbacks, allowing you to register multiple handlers if needed (e.g., one for display, another for logging).
+
+## Understanding Stream Chunks
+
+Each streaming chunk contains partial response data:
+
+```python
+# Chunk structure
+ModelResponseStream(
+    id="chatcmpl-...",
+    choices=[
+        Choice(
+            index=0,
+            delta=Delta(
+                role="assistant",  # Only in first chunk
+                content="Hello"  # Partial text
+            ),
+            finish_reason=None  # Set in final chunk
+        )
+    ],
+    model="claude-3-sonnet-20240229",
+    created=1234567890
+)
+```
+
+**Key Properties:**
+- **First Chunk**: Contains `role` in delta
+- **Intermediate Chunks**: Contain partial `content` text
+- **Final Chunk**: Sets `finish_reason` (e.g., "stop", "length")
+
+## Use Cases
+
+**Real-time Display**: Show responses as they're generated for better UX
+
+**Progress Indicators**: Track response generation progress
+
+**Interactive Applications**: Build chatbots and assistants with typing effects
+
+**Custom Processing**: Filter, transform, or log tokens as they arrive
+
+## Advanced: Multiple Callbacks
+
+Register multiple callbacks for different purposes:
+
+```python
+def display_callback(chunk: ModelResponseStream) -> None:
+    """Display tokens to user."""
+    # Display logic here
+    pass
+
+def logging_callback(chunk: ModelResponseStream) -> None:
+    """Log tokens for debugging."""
+    # Logging logic here
+    pass
+
+conversation = Conversation(
+    agent=agent,
+    token_callbacks=[display_callback, logging_callback],
+    workspace=os.getcwd(),
+)
+```
+
+## Performance Considerations
+
+- **Flush Output**: Call `sys.stdout.flush()` after writing to ensure immediate display
+- **Callback Overhead**: Keep callbacks lightweight to avoid slowing down streaming
+- **Error Handling**: Implement error handling in callbacks to prevent disrupting the stream
+
+## Next Steps
+
+- **[LLM Error Handling](/sdk/guides/llm-error-handling)** - Handle streaming errors gracefully
+- **[Custom Visualizer](/sdk/guides/convo-custom-visualizer)** - Build custom UI for streaming
+- **[Interactive Terminal](/sdk/guides/agent-interactive-terminal)** - Display streams in terminal UI

From 25980900030e3be2a75383d55723ef5d7275f304 Mon Sep 17 00:00:00 2001
From: Xingyao Wang
Date: Thu, 27 Nov 2025 00:22:57 +0800
Subject: [PATCH 2/3] Simplify LLM Streaming guide by removing sections

Removed sections on understanding stream chunks, use cases, advanced callbacks, and performance considerations to streamline the guide.
---
 sdk/guides/llm-streaming.mdx | 71 ++----------------------------------
 1 file changed, 4 insertions(+), 67 deletions(-)

diff --git a/sdk/guides/llm-streaming.mdx b/sdk/guides/llm-streaming.mdx
index 79a4b777..d2e05854 100644
--- a/sdk/guides/llm-streaming.mdx
+++ b/sdk/guides/llm-streaming.mdx
@@ -3,9 +3,12 @@ title: LLM Streaming
 description: Stream LLM responses token-by-token for real-time display and interactive user experiences.
 ---
 
+
+This is currently only supported for the chat completion endpoint.
+
+
 Enable real-time display of LLM responses as they're generated, token by token. This guide demonstrates how to use streaming callbacks to process and display tokens as they arrive from the language model.
 
-## Overview
 
 
 This example is available on GitHub: [examples/01_standalone_sdk/29_llm_streaming.py](https://github.com/OpenHands/software-agent-sdk/blob/main/examples/01_standalone_sdk/29_llm_streaming.py)
@@ -142,72 +145,6 @@ conversation = Conversation(
 
 The `token_callbacks` parameter accepts a list of callbacks, allowing you to register multiple handlers if needed (e.g., one for display, another for logging).
 
-## Understanding Stream Chunks
-
-Each streaming chunk contains partial response data:
-
-```python
-# Chunk structure
-ModelResponseStream(
-    id="chatcmpl-...",
-    choices=[
-        Choice(
-            index=0,
-            delta=Delta(
-                role="assistant",  # Only in first chunk
-                content="Hello"  # Partial text
-            ),
-            finish_reason=None  # Set in final chunk
-        )
-    ],
-    model="claude-3-sonnet-20240229",
-    created=1234567890
-)
-```
-
-**Key Properties:**
-- **First Chunk**: Contains `role` in delta
-- **Intermediate Chunks**: Contain partial `content` text
-- **Final Chunk**: Sets `finish_reason` (e.g., "stop", "length")
-
-## Use Cases
-
-**Real-time Display**: Show responses as they're generated for better UX
-
-**Progress Indicators**: Track response generation progress
-
-**Interactive Applications**: Build chatbots and assistants with typing effects
-
-**Custom Processing**: Filter, transform, or log tokens as they arrive
-
-## Advanced: Multiple Callbacks
-
-Register multiple callbacks for different purposes:
-
-```python
-def display_callback(chunk: ModelResponseStream) -> None:
-    """Display tokens to user."""
-    # Display logic here
-    pass
-
-def logging_callback(chunk: ModelResponseStream) -> None:
-    """Log tokens for debugging."""
-    # Logging logic here
-    pass
-
-conversation = Conversation(
-    agent=agent,
-    token_callbacks=[display_callback, logging_callback],
-    workspace=os.getcwd(),
-)
-```
-
-## Performance Considerations
-
-- **Flush Output**: Call `sys.stdout.flush()` after writing to ensure immediate display
-- **Callback Overhead**: Keep callbacks lightweight to avoid slowing down streaming
-- **Error Handling**: Implement error handling in callbacks to prevent disrupting the stream
-
 ## Next Steps
 
 - **[LLM Error Handling](/sdk/guides/llm-error-handling)** - Handle streaming errors gracefully

From ff521f2083935c0c17f50cec4775385a96deb043 Mon Sep 17 00:00:00 2001
From: openhands
Date: Wed, 26 Nov 2025 16:25:48 +0000
Subject: [PATCH 3/3] Add llm-streaming guide to docs.json under LLM Features

---
 docs.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs.json b/docs.json
index f8c95819..fb885b5b 100644
--- a/docs.json
+++ b/docs.json
@@ -211,6 +211,7 @@
"sdk/guides/llm-registry", "sdk/guides/llm-routing", "sdk/guides/llm-reasoning", + "sdk/guides/llm-streaming", "sdk/guides/llm-image-input", "sdk/guides/llm-error-handling" ]