From b94ce8387326b34c8f17d8a5a7b326b944554276 Mon Sep 17 00:00:00 2001
From: kxz2002 <115912648+kxz2002@users.noreply.github.com>
Date: Wed, 29 Oct 2025 18:16:50 +0800
Subject: [PATCH 1/4] [Feature] add a new reasoning parser (#4571)

* add new reasoning_parser initial commit
* add parser file content
* add register
* ernie_test_reasoning_parser
* support token and add tool_parser
* add and fix unit tests
* modify reasoning_parser
* modify reasoning parser and tool parser
* modify unit tests
* modify reasoning_parser and tool_parser
* modify unit tests
* fix tool_parser
* modify the logic of reasoning_parser and tool_parser
* add and modify unit tests
* standardize code style
* simplify reasoning_parser and tool_parser
* modify unit test
---
 .../openai/tool_parsers/__init__.py           |   7 +-
 .../ernie_45_vl_thinking_tool_parser.py       | 361 ++++++++++++++++++
 fastdeploy/reasoning/__init__.py              |   2 +
 .../ernie_45_vl_thinking_reasoning_parser.py  | 138 +++++++
 .../test_ernie_45_vl_thinking_tool_parser.py  | 193 ++++++++++
 tests/reasoning/test_reasoning_parser.py      | 164 ++++++++
 6 files changed, 860 insertions(+), 5 deletions(-)
 create mode 100644 fastdeploy/entrypoints/openai/tool_parsers/ernie_45_vl_thinking_tool_parser.py
 create mode 100644 fastdeploy/reasoning/ernie_45_vl_thinking_reasoning_parser.py
 create mode 100644 tests/entrypoints/openai/tool_parsers/test_ernie_45_vl_thinking_tool_parser.py

diff --git a/fastdeploy/entrypoints/openai/tool_parsers/__init__.py b/fastdeploy/entrypoints/openai/tool_parsers/__init__.py
index 2078a8c9fe5..a4df47ac99d 100644
--- a/fastdeploy/entrypoints/openai/tool_parsers/__init__.py
+++ b/fastdeploy/entrypoints/openai/tool_parsers/__init__.py
@@ -15,10 +15,7 @@
 """
 
 from .abstract_tool_parser import ToolParser, ToolParserManager
+from .ernie_45_vl_thinking_tool_parser import Ernie45VLThinkingToolParser
 from .ernie_x1_tool_parser import ErnieX1ToolParser
 
-__all__ = [
-    "ToolParser",
-    "ToolParserManager",
-    "ErnieX1ToolParser",
-]
+__all__ = ["ToolParser", "ToolParserManager", "ErnieX1ToolParser", "Ernie45VLThinkingToolParser"]
diff --git a/fastdeploy/entrypoints/openai/tool_parsers/ernie_45_vl_thinking_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/ernie_45_vl_thinking_tool_parser.py
new file mode 100644
index 00000000000..131c17e6abc
--- /dev/null
+++ b/fastdeploy/entrypoints/openai/tool_parsers/ernie_45_vl_thinking_tool_parser.py
@@ -0,0 +1,361 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" + +import json +import re +import uuid +from collections.abc import Sequence +from typing import Union + +import partial_json_parser + + +def random_tool_call_id() -> str: + """Generate a random tool call ID""" + return f"chatcmpl-tool-{str(uuid.uuid4().hex)}" + + +from fastdeploy.entrypoints.openai.protocol import ( + ChatCompletionRequest, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + ToolCall, +) +from fastdeploy.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, + ToolParserManager, +) +from fastdeploy.utils import data_processor_logger + + +@ToolParserManager.register_module("ernie_45-vl-thinking") +class Ernie45VLThinkingToolParser(ToolParser): + """ + Tool parser for Ernie model version 4.5.1. + This parser handles tool calls with newline formats. + """ + + def __init__(self, tokenizer): + super().__init__(tokenizer) + + self.prev_tool_call_arr: list[dict] = [] + self.current_tool_id: int = -1 + self.current_tool_name_sent: bool = False + self.streamed_args_for_tool: list[str] = [] # map what has been streamed for each tool so far to a list + self.buffer: str = "" # buffer for accumulating unprocessed streaming content + self.bracket_counts: dict = {"total_l": 0, "total_r": 0} # track bracket counts in streamed deltas + self.tool_call_start_token: str = "" + self.tool_call_end_token: str = "" + self.valid = None + + self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token) + self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) + if self.tool_call_start_token_id is None: + self.tool_call_start_token_id = -1 + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ToolCallParser constructor during construction." + ) + + def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation: + """ + Extract the tool calls from a complete model response. + Supports XML-style formats with newlines: + - XML format: \n...\n\n\n\n\n{...}\n\n... + + Handles boundary cases: + 1. Only name and partial arguments: {"name": "get_weather", "arguments": {"location": "北京" + 2. Only partial name: {"name": "get_we + 3. 
+        3. Only name and arguments field without content: <tool_call>{"name": "get_weather", "argume
+        """
+
+        try:
+            tool_calls = []
+
+            function_call_arr = []
+            remaining_text = model_output
+
+            think_end = remaining_text.find("</think>")
+            think_end = think_end + len("</think>") if think_end != -1 else 0
+            tool_begin = remaining_text.find("<tool_call>")
+            if tool_begin != -1:
+                middle_str = remaining_text[think_end:tool_begin]
+                if len(middle_str.strip("\n")) > 0:
+                    return ExtractedToolCallInformation(tools_called=False, content=model_output)
+
+            while True:
+                # Find the next <tool_call>
+                tool_call_pos = remaining_text.find("<tool_call>")
+                if tool_call_pos == -1:
+                    break
+
+                # Extract content after <tool_call>
+                tool_content_start = tool_call_pos + len("<tool_call>")
+                tool_content_end = remaining_text.find("</tool_call>", tool_content_start)
+
+                tool_json = ""
+                if tool_content_end == -1:
+                    # Processing unclosed <tool_call> block (truncated case)
+                    tool_json = remaining_text[tool_content_start:].strip()
+                    remaining_text = ""  # No more content to process
+                else:
+                    # Processing closed <tool_call> block
+                    tool_json = remaining_text[tool_content_start:tool_content_end].strip()
+                    remaining_text = remaining_text[tool_content_end + len("</tool_call>") :]
+
+                if not tool_json:
+                    continue
+
+                # Process tool_json
+                tool_json = tool_json.strip()
+                if not tool_json.startswith("{"):
+                    tool_json = "{" + tool_json
+                if not tool_json.endswith("}"):
+                    tool_json = tool_json + "}"
+
+                try:
+                    # Parsing strategy: First try standard json.loads
+                    try:
+                        tool_data = json.loads(tool_json)
+
+                        if isinstance(tool_data, dict) and "name" in tool_data and "arguments" in tool_data:
+                            function_call_arr.append(
+                                {
+                                    "name": tool_data["name"],
+                                    "arguments": tool_data["arguments"],
+                                    "_is_complete": True,  # Mark as complete
+                                }
+                            )
+                            continue
+                    except json.JSONDecodeError:
+                        pass
+
+                    # Try partial_json_parser when standard parsing fails
+                    from partial_json_parser.core.options import Allow
+
+                    try:
+                        tool_data = {}
+                        flags = Allow.ALL & ~Allow.STR
+
+                        # Parse the name field
+                        name_match = re.search(r'"name"\s*:\s*"([^"]*)"', tool_json)
+                        if name_match:
+                            tool_data["name"] = name_match.group(1)
+
+                        # Parse the arguments field
+                        args_match = re.search(r'"arguments"\s*:\s*(\{.*)', tool_json)
+                        if args_match:
+                            try:
+                                tool_data["arguments"] = partial_json_parser.loads(args_match.group(1), flags=flags)
+                            except Exception:
+                                tool_data["arguments"] = None
+
+                        if isinstance(tool_data, dict):
+                            function_call_arr.append(
+                                {
+                                    "name": tool_data.get("name", ""),
+                                    "arguments": tool_data.get("arguments", {}),
+                                    "_is_partial": True,  # Mark as partial
+                                }
+                            )
+                    except Exception as e:
+                        data_processor_logger.debug(f"Failed to parse tool call: {str(e)}")
+                        continue
+                except Exception as e:
+                    data_processor_logger.debug(f"Failed to parse tool call: {str(e)}")
+                    continue
+
+            if not function_call_arr:
+                data_processor_logger.error("No valid tool calls found")
+                return ExtractedToolCallInformation(tools_called=False, content=model_output)
+
+            tool_calls = []
+            all_complete = True  # Initialize as all complete
+
+            for tool_call in function_call_arr:
+                # Set flags
+                is_complete = tool_call.get("_is_complete", False)
+                is_partial = tool_call.get("_is_partial", False)
+
+                # If any tool call is incomplete or partial, mark all_complete as False
+                if not is_complete or is_partial:
+                    all_complete = False
+
+                # Process arguments
+                tool_args = tool_call.get("arguments", {})
+                if not isinstance(tool_args, dict):
+                    tool_args = {}
+
+                try:
+                    args_str = json.dumps(tool_args, ensure_ascii=False) if tool_args else "{}"
+                except Exception:
+                    args_str = "{}"
+
+                tool_calls.append(
+                    ToolCall(
type="function", + id=random_tool_call_id(), + function=FunctionCall( + name=tool_call.get("name", ""), + arguments=args_str, + ), + ) + ) + + # Only return tools_called=True if all tool calls are complete + return ExtractedToolCallInformation( + tools_called=all_complete, tool_calls=tool_calls if tool_calls else None, content="" + ) + + except Exception as e: + data_processor_logger.error(f"Error in extracting tool call from response: {str(e)}") + return ExtractedToolCallInformation(tools_called=False, tool_calls=None, content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: dict, + ) -> Union[DeltaMessage, None]: + + if self.tool_call_start_token_id not in current_token_ids: + return DeltaMessage(content=delta_text) + + if self.valid is not None and not self.valid: + return DeltaMessage(content=delta_text) + + # Skip empty chunks + if len(delta_text.strip()) == 0: + return None + + try: + delta = None + # Use buffer to accumulate delta_text content + self.buffer += delta_text + + # Process the buffer content + if "" in delta_text: + if self.valid is None: + tool_call_begin = current_text.find(self.tool_call_start_token) + prefix = current_text[:tool_call_begin] + prefix = prefix.strip("\n") + if len(prefix) > 0 and not prefix.endswith(""): + self.valid = False + return DeltaMessage(content=delta_text) + self.valid = True + self.current_tool_id = ( + max(self.current_tool_id, 0) if self.current_tool_id == -1 else self.current_tool_id + 1 + ) + self.current_tool_name_sent = False + if len(self.streamed_args_for_tool) <= self.current_tool_id: + self.streamed_args_for_tool.append("") + data_processor_logger.debug(f"New tool call started with ID: {self.current_tool_id}") + + # 1. Try to parse the name field + if not self.current_tool_name_sent and '"name"' in self.buffer: + name_match = re.search(r'"name"\s*:\s*"([^"]*)"', self.buffer) + if name_match: + name = name_match.group(1) + if name: + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + type="function", + id=random_tool_call_id(), + function=DeltaFunctionCall(name=name).model_dump(exclude_none=True), + ) + ] + ) + # Delete the processed name part from the buffer + self.buffer = self.buffer[name_match.end() :] + self.current_tool_name_sent = True + return delta + # 2. 
+            # 2. Processing arguments field
+            if '"arguments"' in self.buffer:
+                args_match = re.search(r'"arguments"\s*:\s*(\{.*)', self.buffer)
+                if args_match:
+                    args_content = args_match.group(1)
+                    try:
+                        # Check if arguments field is complete by bracket matching
+                        if "}}" in args_content:
+                            matched_pos = -1
+                            for i, ch in enumerate(delta_text):
+                                if ch == "{":
+                                    self.bracket_counts["total_l"] += 1
+                                elif ch == "}":
+                                    self.bracket_counts["total_r"] += 1
+
+                                if self.bracket_counts["total_l"] == self.bracket_counts["total_r"]:
+                                    matched_pos = i
+                                    break
+
+                            if matched_pos >= 0:
+                                # Clean up bracket counts for next tool call
+                                truncate_text = delta_text[: matched_pos + 1]
+                                delta = DeltaMessage(
+                                    tool_calls=[
+                                        DeltaToolCall(
+                                            index=self.current_tool_id,
+                                            function=DeltaFunctionCall(arguments=truncate_text).model_dump(
+                                                exclude_none=True
+                                            ),
+                                        )
+                                    ]
+                                )
+                                self.buffer = self.buffer[args_match.end() :]
+                                return delta
+                            else:
+                                # No complete match yet
+                                return None
+                        else:
+                            # Return partial arguments
+                            for ch in delta_text:
+                                if ch == "{":
+                                    self.bracket_counts["total_l"] += 1
+                                elif ch == "}":
+                                    self.bracket_counts["total_r"] += 1
+                            delta = DeltaMessage(
+                                tool_calls=[
+                                    DeltaToolCall(
+                                        index=self.current_tool_id,
+                                        function=DeltaFunctionCall(arguments=delta_text).model_dump(exclude_none=True),
+                                    )
+                                ]
+                            )
+                            return delta
+                    except Exception as e:
+                        data_processor_logger.error(f"Error in streaming tool call extraction: {str(e)}")
+                        return None
+            if "</tool_call>" in self.buffer:
+                end_pos = self.buffer.find("</tool_call>")
+                self.buffer = self.buffer[end_pos + len("</tool_call>") :]
+
+                self.streamed_args_for_tool.append("")
+
+            return delta
+
+        except Exception as e:
+            data_processor_logger.error(f"Error in streaming tool call extraction: {str(e)}")
+            return None
diff --git a/fastdeploy/reasoning/__init__.py b/fastdeploy/reasoning/__init__.py
index 49c627895fd..c384e6f37c7 100644
--- a/fastdeploy/reasoning/__init__.py
+++ b/fastdeploy/reasoning/__init__.py
@@ -17,6 +17,7 @@
 from fastdeploy.plugins import load_reasoning_parser_plugins
 
 from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+from .ernie_45_vl_thinking_reasoning_parser import Ernie45VLThinkingReasoningParser
 from .ernie_vl_reasoning_parsers import ErnieVLReasoningParser
 from .ernie_x1_reasoning_parsers import ErnieX1ReasoningParser
 from .qwen3_reasoning_parsers import Qwen3ReasoningParser
@@ -27,6 +28,7 @@
     "ErnieVLReasoningParser",
     "Qwen3ReasoningParser",
     "ErnieX1ReasoningParser",
+    "Ernie45VLThinkingReasoningParser",
 ]
 
 load_reasoning_parser_plugins()
diff --git a/fastdeploy/reasoning/ernie_45_vl_thinking_reasoning_parser.py b/fastdeploy/reasoning/ernie_45_vl_thinking_reasoning_parser.py
new file mode 100644
index 00000000000..72b045a3dc0
--- /dev/null
+++ b/fastdeploy/reasoning/ernie_45_vl_thinking_reasoning_parser.py
@@ -0,0 +1,138 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" + +from collections.abc import Sequence +from typing import Optional, Union + +from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from fastdeploy.reasoning import ReasoningParser, ReasoningParserManager + + +@ReasoningParserManager.register_module("erine-45-vl-thinking") +class Ernie45VLThinkingReasoningParser(ReasoningParser): + """ + Reasoning parser for ernir_vl model. + + The ernie_vl model uses ...... tokens to denote reasoning text + within its output. The model provides a strict switch to disable reasoning + output via the 'enable_thinking=False' parameter. This parser extracts the + reasoning content enclosed by and tokens from the model's + output. + """ + + def __init__(self, tokenizer): + super().__init__(tokenizer) + self.think_end_token = "" + self.tool_begin_token = "" + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ReasoningParser " "constructor during construction." + ) + + self.think_end_token_id = self.vocab.get(self.think_end_token) + self.tool_begin_token_id = self.vocab.get(self.tool_begin_token) + if self.tool_begin_token_id is None: + self.tool_begin_token_id = -1 + + if self.think_end_token_id is None: + raise RuntimeError("Test reasoning parser could not locate think end tokens in the tokenizer!") + + def is_reasoning_end(self, input_ids: list[int]) -> bool: + return self.think_end_token_id in input_ids + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """ + Extract reasoning content from a delta message. + Handles streaming output where previous + delta = current. + Uses token IDs for faster processing. + For text abcxyz: + - 'abc' goes to reasoning_content + - 'xyz' goes to content + """ + if self.think_end_token not in current_text: + return DeltaMessage(reasoning_content=delta_text) + # Skip single special tokens + if len(delta_token_ids) == 1 and delta_token_ids[0] == self.think_end_token_id: + return None + if self._is_with_tool(current_text=current_text, current_token_ids=current_token_ids): + if self.think_end_token in delta_text: + think_begin = delta_text.find(self.think_end_token) + reasoning_content = delta_text[:think_begin] + return DeltaMessage(reasoning_content=reasoning_content) + return None + if self.think_end_token in delta_text: + reasoning_content, _, content = delta_text.partition(self.think_end_token) + striped_content = content.strip("\n") + if len(striped_content) == 0: + return DeltaMessage(reasoning_content=reasoning_content) if reasoning_content else None + return ( + DeltaMessage(reasoning_content=reasoning_content, content=content) + if reasoning_content + else DeltaMessage(content=content) + ) + think_end = current_text.find(self.think_end_token) + len(self.think_end_token) + suffix = current_text[think_end:] + striped_suffix = suffix.strip("\n") + if len(striped_suffix) == 0: + return None + return DeltaMessage(content=delta_text) + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> tuple[Optional[str], Optional[str]]: + """ + Extract reasoning content from the model output. 
+
+        For text <think>abc</think>xyz:
+        - 'abc' goes to reasoning_content
+        - 'xyz' goes to content
+
+        Returns:
+            tuple[Optional[str], Optional[str]]: reasoning content and content
+        """
+
+        # Check if the model output contains the </think> token.
+        if self.think_end_token not in model_output:
+            return model_output, ""
+        reasoning_content, _, content = model_output.partition(self.think_end_token)
+        if self.tool_begin_token in content:
+            prefix, _, _ = content.partition(self.tool_begin_token)
+            prefix_strip = prefix.lstrip("\n")
+            if len(prefix_strip) > 0:
+                return reasoning_content, content
+            return reasoning_content, ""
+        return reasoning_content, content
+
+    def _is_with_tool(self, current_text: str, current_token_ids: Sequence[int]) -> bool:
+        think_end_index = current_text.find(self.think_end_token)
+        think_end = think_end_index + len(self.think_end_token)
+        middle_str = current_text[think_end:]
+        if self.tool_begin_token_id in current_token_ids:
+            prefix, _, _ = middle_str.partition(self.tool_begin_token)
+            striped_prefix = prefix.strip("\n")
+            if len(striped_prefix) > 0:
+                return False
+            return True
+        return False
diff --git a/tests/entrypoints/openai/tool_parsers/test_ernie_45_vl_thinking_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_ernie_45_vl_thinking_tool_parser.py
new file mode 100644
index 00000000000..c5676ce6697
--- /dev/null
+++ b/tests/entrypoints/openai/tool_parsers/test_ernie_45_vl_thinking_tool_parser.py
@@ -0,0 +1,193 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" + +import unittest +from unittest.mock import patch + +from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from fastdeploy.entrypoints.openai.tool_parsers.ernie_45_vl_thinking_tool_parser import ( + Ernie45VLThinkingToolParser, +) + + +class DummyTokenizer: + """Dummy tokenizer with minimal vocab for testing""" + + def __init__(self): + self.vocab = {"": 1, "": 2} + + +class TestErnie45VLThinkingToolParser(unittest.TestCase): + def setUp(self): + class DummyTokenizer: + def __init__(self): + self.vocab = {"": 1, "": 2} + + def get_vocab(self): + return self.vocab + + self.tokenizer = DummyTokenizer() + self.parser = Ernie45VLThinkingToolParser(tokenizer=self.tokenizer) + self.dummy_request = ChatCompletionRequest(messages=[{"role": "user", "content": "hi"}]) + + # ---------------- Batch extraction tests ---------------- + + def test_extract_tool_calls_complete(self): + """Test normal extraction of complete tool_call JSON""" + output = '{"name": "get_weather", "arguments": {"location": "北京"}}' + result = self.parser.extract_tool_calls(output, self.dummy_request) + self.assertTrue(result.tools_called) + self.assertEqual(result.tool_calls[0].function.name, "get_weather") + + def test_extract_tool_calls_partial_arguments(self): + """Test partial extraction when arguments incomplete""" + output = '{"name": "get_weather", "arguments": {"location": "北"' + result = self.parser.extract_tool_calls(output, self.dummy_request) + self.assertFalse(result.tools_called) + self.assertEqual(result.tool_calls[0].function.name, "get_weather") + + def test_extract_tool_calls_no_toolcall(self): + """Test when no tool_call tags are present""" + output = "no tool call here" + result = self.parser.extract_tool_calls(output, self.dummy_request) + self.assertFalse(result.tools_called) + + def test_extract_tool_calls_invalid_json(self): + """Test tool_call with badly formatted JSON triggers fallback parser""" + output = '"name": "get_weather", "arguments": {' + result = self.parser.extract_tool_calls(output, self.dummy_request) + self.assertFalse(result.tools_called) + self.assertEqual(result.tool_calls[0].function.name, "get_weather") + + def test_extract_tool_calls_exception(self): + """Force exception to cover error branch""" + with patch( + "fastdeploy.entrypoints.openai.tool_parsers.ernie_x1_tool_parser.json.loads", side_effect=Exception("boom") + ): + output = '{"name": "get_weather", "arguments": {}}' + result = self.parser.extract_tool_calls(output, self.dummy_request) + self.assertFalse(result.tools_called) + + def test_extract_tool_calls_illegal(self): + output = 'abc{"name": "get_weather", "arguments": {"location": "北京"}}' + result = self.parser.extract_tool_calls(output, self.dummy_request) + self.assertFalse(result.tools_called) + self.assertEqual( + result.content, + 'abc{"name": "get_weather", "arguments": {"location": "北京"}}', + ) + output = 'abc{"name": "get_weather", "arguments": {"location": "北京"}}' + result = self.parser.extract_tool_calls(output, self.dummy_request) + self.assertFalse(result.tools_called) + self.assertEqual( + result.content, 'abc{"name": "get_weather", "arguments": {"location": "北京"}}' + ) + + # ---------------- Streaming extraction tests ---------------- + + def test_streaming_no_toolcall(self): + """Streaming extraction returns normal DeltaMessage when no """ + result = self.parser.extract_tool_calls_streaming( + "", "abc", "abc", [], [], [], self.dummy_request.model_dump() + ) + self.assertIsInstance(result, DeltaMessage) + 
+        self.assertIsNone(result.tool_calls)
+        self.assertEqual(result.content, "abc")
+
+    def test_streaming_skip_empty_chunk(self):
+        """Streaming extraction skips empty chunks"""
+        result = self.parser.extract_tool_calls_streaming(
+            "", "<tool_call>", " ", [], [1], [1], self.dummy_request.model_dump()
+        )
+        self.assertIsNone(result)
+
+    def test_streaming_new_toolcall_and_name(self):
+        """Streaming extraction detects new toolcall and extracts name"""
+        delta = self.parser.extract_tool_calls_streaming(
+            "", "<tool_call>", '{"name": "get_weather"', [], [1], [1], self.dummy_request.model_dump()
+        )
+        self.assertIsNotNone(delta)
+        self.assertEqual(delta.tool_calls[0].function.name, "get_weather")
+
+    def test_streaming_partial_arguments(self):
+        """Streaming extraction yields partial arguments deltas"""
+        text = '"arguments": {"location":'
+        delta = self.parser.extract_tool_calls_streaming(
+            "", "<tool_call>" + text, text, [], [1], [1], self.dummy_request.model_dump()
+        )
+        self.assertIsInstance(delta, DeltaMessage)
+        self.assertIn("arguments", delta.tool_calls[0].function.arguments)
+
+    def test_streaming_complete_arguments_and_end(self):
+        """Streaming extraction completes arguments with brackets matched and closes tool_call"""
+        text = '"arguments": {"location": "北京"}}'
+        delta = self.parser.extract_tool_calls_streaming(
+            "", "<tool_call>" + text, "<tool_call>" + text, [], [1], [1], self.dummy_request.model_dump()
+        )
+        self.assertIsInstance(delta, DeltaMessage)
+        # Also simulate closing tag
+        end_delta = self.parser.extract_tool_calls_streaming(
+            "<tool_call>" + text,
+            "<tool_call>" + text + "</tool_call>",
+            "</tool_call>",
+            [1],
+            [1, 2],
+            [2],
+            self.dummy_request.model_dump(),
+        )
+        self.assertIsNone(end_delta)
+
+    def test_streaming_no_tool_illegal(self):
+        result = self.parser.extract_tool_calls_streaming(
+            "", "abc", "abc", [], [], [], self.dummy_request.model_dump()
+        )
+        self.assertIsInstance(result, DeltaMessage)
+        self.assertIsNone(result.tool_calls)
+        self.assertEqual(result.content, "abc")
+        result = self.parser.extract_tool_calls_streaming(
+            "", "abc", "abc", [], [], [], self.dummy_request.model_dump()
+        )
+        self.assertIsInstance(result, DeltaMessage)
+        self.assertIsNone(result.tool_calls)
+        self.assertEqual(result.content, "abc")
+
+    def test_streaming_tool_with_reasoning(self):
+        delta = self.parser.extract_tool_calls_streaming(
+            "",
+            '</think><tool_call>{"name": "get_weather"',
+            '</think><tool_call>{"name": "get_weather"',
+            [],
+            [1],
+            [1],
+            self.dummy_request.model_dump(),
+        )
+        self.assertIsNotNone(delta)
+        self.assertEqual(delta.tool_calls[0].function.name, "get_weather")
+        delta = self.parser.extract_tool_calls_streaming(
+            "",
+            '</think>\n\n<tool_call>{"name": "get_weather"',
+            '</think>\n\n<tool_call>{"name": "get_weather"',
+            [],
+            [1],
+            [1],
+            self.dummy_request.model_dump(),
+        )
+        self.assertIsNotNone(delta)
+        self.assertEqual(delta.tool_calls[0].function.name, "get_weather")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/reasoning/test_reasoning_parser.py b/tests/reasoning/test_reasoning_parser.py
index 90a48c89909..9e06523b026 100644
--- a/tests/reasoning/test_reasoning_parser.py
+++ b/tests/reasoning/test_reasoning_parser.py
@@ -18,6 +18,9 @@
 
 from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from fastdeploy.reasoning import ReasoningParser, ReasoningParserManager
+from fastdeploy.reasoning.ernie_45_vl_thinking_reasoning_parser import (
+    Ernie45VLThinkingReasoningParser,
+)
 from fastdeploy.reasoning.ernie_x1_reasoning_parsers import ErnieX1ReasoningParser
 
 
@@ -261,5 +264,166 @@ def test_batch_preserve_all_newlines(self):
         self.assertEqual(response, "line1\nline2\n")
 
 
+class TestErnie45VLThinkingReasoningParser(unittest.TestCase):
+    def setUp(self):
+        self.tokenizer = DummyTokenizer()
+        self.parser = Ernie45VLThinkingReasoningParser(tokenizer=self.tokenizer)
+        self.test_request = ChatCompletionRequest(
+            model="ernie-test", messages=[{"role": "user", "content": "test prompt"}]
+        )
+
+    def test_streaming_non_reasoning(self):
+        result = self.parser.extract_reasoning_content_streaming(
+            previous_text="",
+            current_text="a",
+            delta_text="a",
+            previous_token_ids=[],
+            current_token_ids=[200],
+            delta_token_ids=[200],
+        )
+        self.assertIsInstance(result, DeltaMessage)
+        self.assertEqual(result.reasoning_content, "a")
+        self.assertIsNone(result.content)
+
+    def test_streaming_with_reasoning(self):
+        result = self.parser.extract_reasoning_content_streaming(
+            previous_text="ab",
+            current_text="ab</think>",
+            delta_text="</think>",
+            previous_token_ids=[200, 201],
+            current_token_ids=[200, 201, 100],
+            delta_token_ids=[100],
+        )
+        self.assertIsNone(result)
+
+    def test_streaming_with_reasoning_and_content(self):
+        result = self.parser.extract_reasoning_content_streaming(
+            previous_text="ab",
+            current_text="ab</think>\n\ncd",
+            delta_text="</think>\n\ncd",
+            previous_token_ids=[200, 201],
+            current_token_ids=[200, 201, 100, 300, 400],
+            delta_token_ids=[100, 300, 400],
+        )
+        self.assertIsInstance(result, DeltaMessage)
+        self.assertIsNone(result.reasoning_content)
+        self.assertEqual(result.content, "\n\ncd")
+
+    def test_streaming_with_reasoning_new_line(self):
+        result = self.parser.extract_reasoning_content_streaming(
+            previous_text="abc",
+            current_text="abc</think>\n\n",
+            delta_text="</think>\n\n",
+            previous_token_ids=[200, 201, 202],
+            current_token_ids=[200, 201, 202, 100],
+            delta_token_ids=[100],
+        )
+        self.assertIsNone(result)
+
+    def test_streaming_with_reasoning_and_tool(self):
+        result = self.parser.extract_reasoning_content_streaming(
+            previous_text="abc",
+            current_text="abc</think>\n\n<tool_call>",
+            delta_text="</think>\n\n<tool_call>",
+            previous_token_ids=[200, 201, 202],
+            current_token_ids=[200, 201, 202, 100, 200, 101],
+            delta_token_ids=[100, 200, 101],
+        )
+        self.assertIsInstance(result, DeltaMessage)
+        self.assertEqual(result.reasoning_content, "")
+
+    def test_streaming_with_reasoning_and_illegal_tool(self):
+        result = self.parser.extract_reasoning_content_streaming(
+            previous_text="abc",
+            current_text="abc</think>\n\nhello<tool_call>",
+            delta_text="</think>\n\nhello<tool_call>",
+            previous_token_ids=[200, 201, 202],
+            current_token_ids=[200, 201, 202, 100, 200, 101],
+            delta_token_ids=[109, 200, 101],
+        )
+        self.assertIsInstance(result, DeltaMessage)
+        self.assertEqual(result.content, "\n\nhello<tool_call>")
+
+    def test_streaming_with_reasoning_no_tool(self):
+        result = self.parser.extract_reasoning_content_streaming(
+            previous_text="abc",
+            current_text="abchello</think>\nworld",
+            delta_text="hello</think>\nworld",
+            previous_token_ids=[200, 201, 202],
+            current_token_ids=[200, 201, 202, 100, 200, 110],
+            delta_token_ids=[100, 200, 110],
+        )
+        self.assertIsInstance(result, DeltaMessage)
+        self.assertEqual(result.reasoning_content, "hello")
+        self.assertEqual(result.content, "\nworld")
+
+    def test_streaming_reasoning_previous_no_tool(self):
+        result = self.parser.extract_reasoning_content_streaming(
+            previous_text="</think>",
+            current_text="</think>\nhello",
+            delta_text="\nhello",
+            previous_token_ids=[100],
+            current_token_ids=[100, 110, 111],
+            delta_token_ids=[110, 111],
+        )
+        self.assertIsInstance(result, DeltaMessage)
+        self.assertIsNone(result.reasoning_content)
+        self.assertEqual(result.content, "\nhello")
+
+    def test_streaming_no_reasoning_previous_tool(self):
+        result = self.parser.extract_reasoning_content_streaming(
+            previous_text="<tool_call>",
+            current_text="<tool_call>hello",
+            delta_text="hello",
+            previous_token_ids=[101],
+            current_token_ids=[101, 110],
+            delta_token_ids=[110],
+        )
+        self.assertIsInstance(result, DeltaMessage)
+        self.assertEqual(result.reasoning_content, "hello")
+
+    def test_batch_no_think_end(self):
+        reasoning, content = self.parser.extract_reasoning_content(
+            model_output="direct response", request=self.test_request
+        )
+        self.assertEqual(reasoning, "direct response")
+        self.assertEqual(content, "")
+
+    def test_batch_no_think_end_with_tool(self):
+        reasoning, content = self.parser.extract_reasoning_content(
+            model_output="direct response<tool_call>abc</tool_call>", request=self.test_request
+        )
+        self.assertEqual(reasoning, "direct response<tool_call>abc</tool_call>")
+        self.assertEqual(content, "")
+
+    def test_batch_think_end_normal_content(self):
+        reasoning, content = self.parser.extract_reasoning_content(
+            model_output="reasoning</think>\nresponse", request=self.test_request
+        )
+        self.assertEqual(reasoning, "reasoning")
+        self.assertEqual(content, "\nresponse")
+
+    def test_batch_think_end_with_tool(self):
+        reasoning, content = self.parser.extract_reasoning_content(
+            model_output="reasoning</think>\n<tool_call>tool params</tool_call>", request=self.test_request
+        )
+        self.assertEqual(reasoning, "reasoning")
+        self.assertEqual(content, "")
+
+    def test_batch_think_end_with_illegal_tool(self):
+        reasoning, content = self.parser.extract_reasoning_content(
+            model_output="reasoning</think>\nABC\n<tool_call>tool params</tool_call>", request=self.test_request
+        )
+        self.assertEqual(reasoning, "reasoning")
+        self.assertEqual(content, "\nABC\n<tool_call>tool params</tool_call>")
+
+    def test_batch_think_end_content_with_newline(self):
+        reasoning, content = self.parser.extract_reasoning_content(
+            model_output="reasoning</think>\n\n actual response", request=self.test_request
+        )
+        self.assertEqual(reasoning, "reasoning")
+        self.assertEqual(content, "\n\n actual response")
+
+
 if __name__ == "__main__":
     unittest.main()

From 51068f10ebcb5c44249281c05a0a9f592796cfca Mon Sep 17 00:00:00 2001
From: kxz2002 <115912648+kxz2002@users.noreply.github.com>
Date: Fri, 31 Oct 2025 10:42:19 +0800
Subject: [PATCH 2/4] [BugFix] Fix finish reason in _create_chat_completion_choice (#4582)

* fix n_param _create_chat_completion_choicel
* fix unit test
* fix final_res
* modify unit tests
---
 fastdeploy/entrypoints/openai/serving_chat.py         | 4 ++--
 tests/entrypoints/openai/test_max_streaming_tokens.py | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index b18fc510224..cf11ba8fffd 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -621,7 +621,7 @@ async def _create_chat_completion_choice(
 
         if output is not None and output.get("metrics") and output["metrics"].get("request_start_time"):
             work_process_metrics.e2e_request_latency.observe(
-                time.time() - output.get("metrics").get("request_start_time")
+                time.time() - data.get("metrics").get("request_start_time")
             )
         message = ChatMessage(
             role="assistant",
@@ -655,7 +655,7 @@ async def _create_chat_completion_choice(
                 finish_reason = "tool_calls"
             else:
                 finish_reason = "length"
-            if output.get("error_msg") is not None and "Recover" in output["error_msg"]:
+            if data.get("error_msg") is not None and "Recover" in data["error_msg"]:
                 finish_reason = "recover_stop"
 
         return ChatCompletionResponseChoice(
diff --git a/tests/entrypoints/openai/test_max_streaming_tokens.py b/tests/entrypoints/openai/test_max_streaming_tokens.py
index 01b6346e03d..3a772f9193e 100644
--- a/tests/entrypoints/openai/test_max_streaming_tokens.py
+++ b/tests/entrypoints/openai/test_max_streaming_tokens.py
@@ -412,7 +412,7 @@ async def test_create_chat_completion_choice(self):
                 "test_data": {
                     "request_id": "test_1",
                     "outputs": {
-                        "token_ids": [789],
+                        "token_ids": [123, 456, 789],
                         "text": "Edge case response",
                         "reasoning_content": None,
                         "tool_call": None,
@@ -424,7 +424,7 @@ async def test_create_chat_completion_choice(self):
                     "previous_num_tokens": 1,
                 },
                 "mock_request": ChatCompletionRequest(
-                    model="test", messages=[], return_token_ids=True, max_tokens=5, n=2
+                    model="test", messages=[], return_token_ids=True, max_tokens=1, n=2
                 ),
                 "expected": {
                     "index": 1,
@@ -434,7 +434,7 @@ async def test_create_chat_completion_choice(self):
                     "raw_prediction": None,
                     "num_cached_tokens": 0,
                     "num_image_tokens": 0,
-                    "finish_reason": "stop",
+                    "finish_reason": "length",
                 },
             },
         ]

From ebc970c566de9d4db2a796b646624a57cb28d652 Mon Sep 17 00:00:00 2001
From: kxz2002 <115912648+kxz2002@users.noreply.github.com>
Date: Thu, 30 Oct 2025 19:45:41 +0800
Subject: [PATCH 3/4] [BugFix] fix offline llm chat "enable_thinking" is always "False" (#4686)

* fix enable_thinking
* recover ernie4_5_vl_processor
---
 fastdeploy/engine/request.py           | 2 +-
 fastdeploy/input/ernie4_5_processor.py | 2 +-
 fastdeploy/input/text_processor.py     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py
index 1b846d8fc0a..3036f530fc6 100644
--- a/fastdeploy/engine/request.py
+++ b/fastdeploy/engine/request.py
@@ -197,7 +197,7 @@ def from_dict(cls, d: dict):
             guided_grammar=d.get("guided_grammar", None),
             structural_tag=d.get("structural_tag", None),
             guided_json_object=d.get("guided_json_object", None),
-            enable_thinking=d.get("enable_thinking", False),
+            enable_thinking=d.get("enable_thinking", None),
             reasoning_max_tokens=d.get("reasoning_max_tokens", None),
             trace_carrier=d.get("trace_carrier", {}),
             chat_template=d.get("chat_template", None),
diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py
index e0daacdc6ae..a151dbfdd6d 100644
--- a/fastdeploy/input/ernie4_5_processor.py
+++ b/fastdeploy/input/ernie4_5_processor.py
@@ -130,7 +130,7 @@ def process_request(self, request, max_model_len=None, **kwargs):
         if chat_template_kwargs:
             if isinstance(chat_template_kwargs, dict):
                 for k, v in chat_template_kwargs.items():
-                    if k not in task:
+                    if k not in task or task[k] is None:
                         task[k] = v
             else:
                 raise ValueError("Invalid input: chat_template_kwargs must be a dict")
diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py
index 75e068c4000..45ce5dda25b 100644
--- a/fastdeploy/input/text_processor.py
+++ b/fastdeploy/input/text_processor.py
@@ -245,7 +245,7 @@ def process_request(self, request, max_model_len=None, **kwargs):
         if chat_template_kwargs:
             if isinstance(chat_template_kwargs, dict):
                 for k, v in chat_template_kwargs.items():
-                    if k not in task:
+                    if k not in task or task[k] is None:
                         task[k] = v
             else:
                 raise ValueError("Invalid input: chat_template_kwargs must be a dict")

From 174226b5081eaef63fa13fc4687e6f5a2a8dacb2 Mon Sep 17 00:00:00 2001
From: kxz2002 <115912648+kxz2002@users.noreply.github.com>
Date: Thu, 6 Nov 2025 11:28:55 +0800
Subject: [PATCH 4/4] [BugFix] Fix ernie_vl_reasoning_parsers.py 'end_token' to
 'think_end_token' (#4805)

* fix ernie_vl_reasoning_parsers.py 'end_token' to 'think_end_token'
* add unit tests
---
 .../reasoning/ernie_vl_reasoning_parsers.py |  4 +-
 tests/reasoning/test_reasoning_parser.py    | 56 +++++++++++++++++++
 2 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py
index 5636ee9f5ea..18e15f18aa3 100644
--- a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py
+++ b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py
@@ -70,9 +70,9 @@ def extract_reasoning_content_streaming(
         if len(delta_token_ids) == 1 and delta_token_ids[0] == self.think_end_token_id:
             return None
         if self.think_end_token_id in delta_token_ids:
-            end_index = delta_text.find(self.end_token)
+            end_index = delta_text.find(self.think_end_token)
             reasoning_content = delta_text[:end_index]
-            content = delta_text[end_index + len(self.end_token) :]
+            content = delta_text[end_index + len(self.think_end_token) :]
             return DeltaMessage(reasoning_content=reasoning_content, content=content)
         elif self.think_end_token_id in previous_token_ids:
             return DeltaMessage(content=delta_text)
diff --git a/tests/reasoning/test_reasoning_parser.py b/tests/reasoning/test_reasoning_parser.py
index 26a7457db9a..e6deded445d 100644
--- a/tests/reasoning/test_reasoning_parser.py
+++ b/tests/reasoning/test_reasoning_parser.py
@@ -21,6 +21,7 @@
 from fastdeploy.reasoning.ernie_45_vl_thinking_reasoning_parser import (
     Ernie45VLThinkingReasoningParser,
 )
+from fastdeploy.reasoning.ernie_vl_reasoning_parsers import ErnieVLReasoningParser
 from fastdeploy.reasoning.ernie_x1_reasoning_parsers import ErnieX1ReasoningParser
 
 
@@ -425,5 +426,60 @@ def test_batch_think_end_content_with_newline(self):
         self.assertEqual(content, "\n\n actual response")
 
 
+class TestErnieVLReasoningParser(unittest.TestCase):
+    def setUp(self):
+        self.tokenizer = DummyTokenizer()
+        self.parser = ErnieVLReasoningParser(tokenizer=self.tokenizer)
+        self.test_request = ChatCompletionRequest(
+            model="ernie-test", messages=[{"role": "user", "content": "test prompt"}]
+        )
+
+    def test_extract_reasoning_content_stream(self):
+        result = self.parser.extract_reasoning_content_streaming(
+            previous_text="abc",
+            current_text="abc</think>xyz",
+            delta_text="</think>xyz",
+            previous_token_ids=[200, 201, 202],
+            current_token_ids=[200, 201, 202, 100, 110, 120, 130],
+            delta_token_ids=[100, 110, 120, 130],
+        )
+        self.assertIsInstance(result, DeltaMessage)
+        self.assertEqual(result.reasoning_content, "")
+        self.assertEqual(result.content, "xyz")
+
+    def test_extract_reasoning_content_stream_think_in_previous(self):
+        result = self.parser.extract_reasoning_content_streaming(
+            previous_text="abc</think>",
+            current_text="abc</think>xyz",
+            delta_text="xyz",
+            previous_token_ids=[200, 201, 202, 100],
+            current_token_ids=[200, 201, 202, 100, 110, 120, 130],
+            delta_token_ids=[110, 120, 130],
+        )
+        self.assertIsInstance(result, DeltaMessage)
+        self.assertIsNone(result.reasoning_content)
+        self.assertEqual(result.content, "xyz")
+
+    def test_extract_reasoning_content_stream_no_think_token(self):
+        result = self.parser.extract_reasoning_content_streaming(
+            previous_text="abc",
+            current_text="abcxyz",
+            delta_text="xyz",
+            previous_token_ids=[200, 201, 202],
+            current_token_ids=[200, 201, 202, 110, 120, 130],
+            delta_token_ids=[110, 120, 130],
+        )
+        self.assertIsInstance(result, DeltaMessage)
+        self.assertIsNone(result.content)
+        self.assertEqual(result.reasoning_content, "xyz")
+
+    def test_extract_reasoning_content(self):
+        reasoning, content = self.parser.extract_reasoning_content(
model_output="reasoning\nactual response", request=self.test_request + ) + self.assertEqual(reasoning, "reasoning") + self.assertEqual(content, "\nactual response") + + if __name__ == "__main__": unittest.main()