From 234ef928262d3b8da56325f44f9c33373a2b4930 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Tue, 23 Sep 2025 16:03:53 +0800 Subject: [PATCH 01/32] add model status in vl --- fastdeploy/input/ernie4_5_processor.py | 15 ++++++-- .../ernie4_5_vl_processor.py | 3 ++ .../reasoning/ernie_vl_reasoning_parsers.py | 37 ++++++++++++++++--- 3 files changed, 46 insertions(+), 9 deletions(-) diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py index f364ecba11a..25834946841 100644 --- a/fastdeploy/input/ernie4_5_processor.py +++ b/fastdeploy/input/ernie4_5_processor.py @@ -232,7 +232,8 @@ def process_request_dict(self, request, max_model_len=None): request["top_p"] = _SAMPLING_EPS if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser": request["enable_thinking"] = True - + if self.reasoning_parser: + request["model_status"] = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) data_processor_logger.info(f"Processed request dict: {request}") return request @@ -246,6 +247,7 @@ def process_response(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ + model_status = kwargs.get("model_status") req_id = response_dict.request_id token_ids = response_dict.outputs.token_ids @@ -254,7 +256,9 @@ def process_response(self, response_dict, **kwargs): token_ids = token_ids[:-1] full_text = self.tokenizer.decode(token_ids) if self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) + reasoning_content, text = self.reasoning_parser.extract_reasoning_content( + full_text, response_dict, model_status + ) response_dict.outputs.text = text response_dict.outputs.reasoning_content = reasoning_content else: @@ -296,6 +300,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): Dict: response contain text fields """ enable_thinking = kwargs.get("enable_thinking") + model_status = kwargs.get("model_status") token_ids = response_dict["outputs"]["token_ids"] is_end = response_dict["finished"] req_id = response_dict["request_id"] @@ -308,7 +313,9 @@ def process_response_dict_normal(self, response_dict, **kwargs): if self.reasoning_parser and ( enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser" ): - reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) + reasoning_content, text = self.reasoning_parser.extract_reasoning_content( + full_text, response_dict, model_status + ) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content else: @@ -335,6 +342,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): Dict: response contain text fields """ enable_thinking = kwargs.get("enable_thinking") + model_status = kwargs.get("model_status") is_end = response_dict["finished"] req_id = response_dict["request_id"] token_ids = response_dict["outputs"]["token_ids"] @@ -354,6 +362,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): previous_token_ids, previous_token_ids + token_ids, token_ids, + model_status, ) response_dict["outputs"]["delta_message"] = reasoning_delta_message if self.tool_parser_obj: diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py index 77690b9209e..a13bf68b765 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py +++ 
b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
@@ -255,6 +255,9 @@ def process_request_dict(self, request, max_model_len=None):
             request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))
         data_processor_logger.info(f"Processed request {request}")
 
+        if self.reasoning_parser is not None:
+            request["model_status"] = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
+
         return request
 
     def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
diff --git a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py
index 5636ee9f5ea..7806658d3c2 100644
--- a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py
+++ b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py
@@ -35,6 +35,7 @@ class ErnieVLReasoningParser(ReasoningParser):
 
     def __init__(self, tokenizer):
         super().__init__(tokenizer)
+        self.think_start_token = "<think>"
         self.think_end_token = "</think>"
 
         if not self.model_tokenizer:
@@ -45,10 +46,28 @@ def __init__(self, tokenizer):
         self.think_end_token_id = self.vocab.get(self.think_end_token)
         if self.think_end_token_id is None:
             raise RuntimeError("Ernie VL reasoning parser could not locate think end " "tokens in the tokenizer!")
+        self.think_start_token_id = self.vocab.get(self.think_start_token)
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         return self.think_end_token_id in input_ids
 
+    def find_last_special_token(self, prompt_token_ids: list[int]) -> int:
+        for i in range(len(prompt_token_ids) - 1, -1, -1):
+            if prompt_token_ids[i] in [self.think_end_token_id, self.think_start_token_id]:
+                return prompt_token_ids[i]
+        return -1
+
+    def get_model_status(self, prompt_token_ids: list[int]):
+        special_token_id = self.find_last_special_token(prompt_token_ids)
+        if special_token_id == -1:
+            return "responding"
+        if special_token_id == self.think_end_token_id:
+            return "responding"
+        if self.think_start_token_id == special_token_id:
+            return "thinking"
+
+        return "responding"
+
     def extract_reasoning_content_streaming(
         self,
         previous_text: str,
@@ -57,6 +76,7 @@ def extract_reasoning_content_streaming(
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
+        model_status: str,
     ) -> Union[DeltaMessage, None]:
         """
         Extract reasoning content from a delta message.
@@ -80,7 +100,10 @@ def extract_reasoning_content_streaming(
         return DeltaMessage(reasoning_content=delta_text)
 
     def extract_reasoning_content(
-        self, model_output: str, request: ChatCompletionRequest
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+        model_status: str,
     ) -> tuple[Optional[str], Optional[str]]:
         """
         Extract reasoning content from the model output.
@@ -94,9 +117,11 @@ def extract_reasoning_content(
         """
 
         # Check if the model output contains the </think> tokens.
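# A minimal, self-contained sketch of the status-aware extraction that the hunk below
# introduces, for readers skimming this diff. Assumptions: the end-of-reasoning tag
# literal is "</think>" (the tag text is hard to read in this patch) and the helper
# name is illustrative, not FastDeploy's actual API.
def _sketch_extract_reasoning(model_output: str, model_status: str, think_end: str = "</think>"):
    if model_status == "thinking":
        # The prompt left the model inside its reasoning span: text before the closing
        # tag is reasoning_content, text after it is the visible answer.
        if think_end not in model_output:
            return model_output, ""
        reasoning, _, answer = model_output.partition(think_end)
        return reasoning, answer or ""
    # Reasoning already ended in the prompt, so the whole output is the visible answer.
    return "", model_output

# Example: returns ("2 + 2 = 4", "The answer is 4.")
print(_sketch_extract_reasoning("2 + 2 = 4</think>The answer is 4.", "thinking"))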
-        if self.think_end_token not in model_output:
+        if model_status == "thinking":
+            if self.think_end_token not in model_output:
+                return model_output, ""
+            reasoning_content, _, content = model_output.partition(self.think_end_token)
+            final_content = content or ""
+            return reasoning_content, final_content
+        else:
             return "", model_output
-        reasoning_content, _, content = model_output.partition(self.think_end_token)
-
-        final_content = content or ""
-        return reasoning_content, final_content

From 671a4dcc7538e822d6a619bc052030da9a99c6a2 Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Wed, 24 Sep 2025 11:20:51 +0800
Subject: [PATCH 02/32] add x1 parser

---
 .../reasoning/ernie_x1_reasoning_parsers.py | 135 ++++++++++++------
 1 file changed, 94 insertions(+), 41 deletions(-)

diff --git a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py
index 8dbfb23ca9e..fc1db88679d 100644
--- a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py
+++ b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py
@@ -34,19 +34,62 @@ class ErnieX1ReasoningParser(ReasoningParser):
 
     def __init__(self, tokenizer):
         super().__init__(tokenizer)
-        self.think_end_token = "</think>"
-        self.response_start_token = "<response>"
-        self.response_end_token = "</response>"
-        self.tool_call_start_token = "<tool_call>"
-        self.tool_call_end_token = "</tool_call>"
+
+        # Define all special tokens that need to be checked
+        token_definitions = {
+            "think_start_token": "<think>",
+            "think_end_token": "</think>",
+            "response_start_token": "<response>",
+            "response_end_token": "</response>",
+            "tool_call_start_token": "<tool_call>",
+            "tool_call_end_token": "</tool_call>",
+        }
 
         if not self.model_tokenizer:
             raise ValueError("The model tokenizer must be passed to the ReasoningParser constructor.")
 
-        self.think_end_token_id = self.vocab.get("</think>")
-        if self.think_end_token_id is None:
-            raise RuntimeError("Could not find think end token id in tokenizer vocabulary")
-        self.tool_call_start_token_id = self.vocab.get("<tool_call>")
+        missing_tokens = []
+        for name, token_value in token_definitions.items():
+            setattr(self, name, token_value)
+            token_id = self.vocab.get(token_value)
+            setattr(self, f"{name}_id", token_id)
+            if token_id is None:
+                missing_tokens.append(f"{name.replace('_', ' ')} token")
+
+        if missing_tokens:
+            raise RuntimeError(
+                f"Could not find the following token ids in tokenizer vocabulary: {', '.join(missing_tokens)}"
+            )
+
+        self.token_status_mapping = {
+            self.think_start_token_id: "think_start",
+            self.think_end_token_id: "think_end",
+            self.response_start_token_id: "response_start",
+            self.response_end_token_id: "response_end",
+            self.tool_call_start_token_id: "tool_call_start",
+            self.tool_call_end_token_id: "tool_call_end",
+        }
+
+    def find_last_special_token(self, prompt_token_ids: list[int]) -> int:
+        for i in range(len(prompt_token_ids) - 1, -1, -1):
+            if prompt_token_ids[i] in [
+                self.think_end_token_id,
+                self.think_start_token_id,
+                self.response_start_token_id,
+                self.response_end_token_id,
+                self.tool_call_start_token_id,
+                self.tool_call_end_token_id,
+            ]:
+                return prompt_token_ids[i]
+        return -1
+
+    def get_model_status(self, prompt_token_ids: list[int]):
+        special_token_id = self.find_last_special_token(prompt_token_ids)
+
+        if special_token_id == -1:
+            return "response_start"
+
+        return self.token_status_mapping.get(special_token_id, "response_start")
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         return self.tool_call_start_token_id in input_ids
@@ -117,45 +160,55 @@ def extract_reasoning_content_streaming(
         # By default, return no content
         return None
 
-    def extract_reasoning_content(self, model_output: str, request:
ChatCompletionRequest) -> Tuple[str, str]: + def strip_last_newline(self, content: str, end_pos: int) -> str: + return content[: end_pos - 1] if end_pos > 0 and content[end_pos - 1] == "\n" else content[:end_pos] + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest, model_status: str + ) -> Tuple[str, str]: """ - Batch version of the enhanced parser. - Modified to preserve newlines in both reasoning and response content, + Optimized batch version of the enhanced parser. + Preserves newlines in both reasoning and response content, only removing the single newline before closing tags. """ reasoning_content = "" response_content = "" - think_end_pos = model_output.find(self.think_end_token) - if think_end_pos != -1: - # Extract thinking content - only remove the last newline before - reasoning_content = model_output[:think_end_pos] - if think_end_pos > 0 and reasoning_content[-1] == "\n": - reasoning_content = reasoning_content[:-1] + # Define helper function to strip the last newline before a closing tag + if model_status == "think_start": + think_end_pos = model_output.find(self.think_end_token) + if think_end_pos != -1: + # Extract reasoning content + reasoning_content = self.strip_last_newline(model_output, think_end_pos) + remaining = model_output[think_end_pos + len(self.think_end_token) :].lstrip("\n") + + # Determine if remaining content is a response or tool call + if remaining.startswith(self.response_start_token): + response_start_pos = len(self.response_start_token) + response_content = self._extract_response_content(remaining[response_start_pos:]) + elif remaining.startswith(self.tool_call_start_token): + pass # No response content + else: + # No think_end_token found, treat entire output as reasoning content + reasoning_content = model_output - remaining = model_output[think_end_pos + len(self.think_end_token) :] + elif model_status == "think_end": + remaining = model_output.lstrip("\n") + if remaining.startswith(self.response_start_token): + response_start_pos = len(self.response_start_token) + response_content = self._extract_response_content(remaining[response_start_pos:]) - # Skip newlines after - remaining = remaining.lstrip("\n") + elif model_status == "response_start": + response_content = model_output.replace(self.response_end_token, "") - # Check for response or tool_call - if remaining.startswith(self.response_start_token): - response_pos = len(self.response_start_token) - remaining = remaining[response_pos:].lstrip("\n") - response_end_pos = remaining.find(self.response_end_token) - if response_end_pos != -1: - # Only strip the last newline before , not all - if response_end_pos > 0 and remaining[response_end_pos - 1] == "\n": - response_content = remaining[: response_end_pos - 1] - else: - response_content = remaining[:response_end_pos] - else: - # If no found, return the rest as response content - response_content = remaining - elif remaining.startswith(self.tool_call_start_token): - pass # No response content - else: - # No thinking content found, return the whole input as reasoning - reasoning_content = model_output - response_content = "" return reasoning_content, response_content + + def _extract_response_content(self, remaining: str) -> str: + """ + Extracts response content, ensuring that the last newline before + the tag is removed. 
+ """ + response_end_pos = remaining.find(self.response_end_token) + if response_end_pos != -1: + return self.strip_last_newline(remaining, response_end_pos) + return remaining From 8bbe39d56a05b801d9013774f55169abb1040f75 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Wed, 24 Sep 2025 17:19:53 +0800 Subject: [PATCH 03/32] add model_status --- .../entrypoints/openai/response_processors.py | 10 +++++----- fastdeploy/entrypoints/openai/serving_chat.py | 18 +++++++----------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/fastdeploy/entrypoints/openai/response_processors.py b/fastdeploy/entrypoints/openai/response_processors.py index e51147899e5..22bfbf63213 100644 --- a/fastdeploy/entrypoints/openai/response_processors.py +++ b/fastdeploy/entrypoints/openai/response_processors.py @@ -67,13 +67,13 @@ def accumulate_token_ids(self, request_output): else: self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output}) - async def process_response_chat(self, request_outputs, stream, enable_thinking, include_stop_str_in_output): + async def process_response_chat(self, request_outputs, stream, model_status, include_stop_str_in_output): """ Process a list of responses into a generator that yields each processed response as it's generated. Args: request_outputs: The list of outputs to be processed. stream: Whether or not to stream the output. - enable_thinking: Whether or not to show thinking messages. + model_status: Whether or not to show thinking messages. include_stop_str_in_output: Whether or not to include stop strings in the output. """ for request_output in request_outputs: @@ -82,7 +82,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking, yield self.data_processor.process_response_dict( response_dict=request_output, stream=stream, - enable_thinking=enable_thinking, + model_status=model_status, include_stop_str_in_output=include_stop_str_in_output, ) elif stream: @@ -108,7 +108,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking, self.data_processor.process_response_dict( response_dict=request_output, stream=stream, - enable_thinking=enable_thinking, + model_status=model_status, include_stop_str_in_output=include_stop_str_in_output, ) text = {"type": "text", "text": request_output["outputs"]["text"]} @@ -128,7 +128,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking, self.data_processor.process_response_dict( response_dict=part["request_output"], stream=False, - enable_thinking=enable_thinking, + model_status=model_status, include_stop_str_in_output=include_stop_str_in_output, ) text = {"type": "text", "text": part["request_output"]["outputs"]["text"]} diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 52cd556916f..8922d7a7e8e 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -120,6 +120,7 @@ async def create_chat_completion(self, request: ChatCompletionRequest): text_after_process = current_req_dict.get("text_after_process") if isinstance(prompt_token_ids, np.ndarray): prompt_token_ids = prompt_token_ids.tolist() + model_status = current_req_dict.get("model_status") except ParameterError as e: api_server_logger.error(f"request[{request_id}] generator error: {str(e)}, {e.message}") self.engine_client.semaphore.release() @@ -135,12 +136,12 @@ async def create_chat_completion(self, request: ChatCompletionRequest): 
if request.stream: return self.chat_completion_stream_generator( - request, request_id, request.model, prompt_token_ids, text_after_process + request, request_id, request.model, prompt_token_ids, text_after_process, model_status ) else: try: return await self.chat_completion_full_generator( - request, request_id, request.model, prompt_token_ids, text_after_process + request, request_id, request.model, prompt_token_ids, text_after_process, model_status ) except Exception as e: error_msg = f"request[{request_id}]full generator error: {str(e)}, {str(traceback.format_exc())}" @@ -168,6 +169,7 @@ async def chat_completion_stream_generator( model_name: str, prompt_token_ids: list(), text_after_process: str, + model_status: str, ): """ Streaming chat completion generator. @@ -187,10 +189,6 @@ async def chat_completion_stream_generator( max_streaming_response_tokens = max(1, max_streaming_response_tokens) - enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None - if enable_thinking is None: - enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None - include_stop_str_in_output = request.include_stop_str_in_output stream_options = request.stream_options @@ -242,7 +240,7 @@ async def chat_completion_stream_generator( generator = response_processor.process_response_chat( response, stream=True, - enable_thinking=enable_thinking, + model_status=model_status, include_stop_str_in_output=include_stop_str_in_output, ) @@ -412,15 +410,13 @@ async def chat_completion_full_generator( model_name: str, prompt_token_ids: list(), text_after_process: str, + model_status: str, ): """ Full chat completion generator. """ created_time = int(time.time()) final_res = None - enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None - if enable_thinking is None: - enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None include_stop_str_in_output = request.include_stop_str_in_output try: @@ -464,7 +460,7 @@ async def chat_completion_full_generator( generator = response_processor.process_response_chat( response, stream=False, - enable_thinking=enable_thinking, + model_status=model_status, include_stop_str_in_output=include_stop_str_in_output, ) async for data in generator: From d087afb57f92a78138607a759e19a9c8cf2e76af Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Thu, 25 Sep 2025 14:57:33 +0800 Subject: [PATCH 04/32] fix parser --- .../tool_parsers/ernie_x1_tool_parser.py | 176 +++--------------- fastdeploy/input/ernie4_5_processor.py | 10 +- .../reasoning/ernie_x1_reasoning_parsers.py | 93 +++------ 3 files changed, 64 insertions(+), 215 deletions(-) diff --git a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py index 9b0c7b9cb5f..e5df1a2e178 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py @@ -14,18 +14,10 @@ import json import re -import uuid from collections.abc import Sequence from typing import Union -import partial_json_parser - - -def random_tool_call_id() -> str: - """Generate a random tool call ID""" - return f"chatcmpl-tool-{str(uuid.uuid4().hex)}" - - +from fastdeploy.entrypoints.chat_utils import random_tool_call_id from fastdeploy.entrypoints.openai.protocol import ( ChatCompletionRequest, DeltaFunctionCall, @@ -61,6 +53,8 @@ def 
__init__(self, tokenizer): self.tool_call_start_token: str = "" self.tool_call_end_token: str = "" + self.tool_call_regex = re.compile(r"(.*?)|(.*)", re.DOTALL) + self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token) self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) if self.tool_call_start_token_id is None or self.tool_call_end_token_id is None: @@ -73,7 +67,9 @@ def __init__(self, tokenizer): "The model tokenizer must be passed to the ToolCallParser constructor during construction." ) - def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation: + def extract_tool_calls( + self, model_output: str, request: ChatCompletionRequest, model_status: str + ) -> ExtractedToolCallInformation: """ Extract the tool calls from a complete model response. Supports XML-style formats with newlines: @@ -85,144 +81,31 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) 3. Only name and arguments field without content: {"name": "get_weather", "argume """ + extract_content = model_output + if model_status == "tool_call_start": + extract_content = "" + model_output try: - tool_calls = [] - - # Check for invalid tags before tool calls - if re.search(r"[\s\S]*?\s*(?=)", model_output): - data_processor_logger.error("Invalid format: tags found before ") - return ExtractedToolCallInformation(tools_called=False, content=model_output) - - function_call_arr = [] - remaining_text = model_output - - while True: - # 查找下一个tool_call块 - tool_call_pos = remaining_text.find("") - if tool_call_pos == -1: - break - - # 提取tool_call开始位置后的内容 - tool_content_start = tool_call_pos + len("") - tool_content_end = remaining_text.find("", tool_content_start) - - tool_json = "" - if tool_content_end == -1: - # 处理未闭合的tool_call块(截断情况) - tool_json = remaining_text[tool_content_start:].strip() - remaining_text = "" # 没有更多内容需要处理 - else: - # 处理完整的tool_call块 - tool_json = remaining_text[tool_content_start:tool_content_end].strip() - remaining_text = remaining_text[tool_content_end + len("") :] - - if not tool_json: - continue - - # 处理JSON内容 - tool_json = tool_json.strip() - if not tool_json.startswith("{"): - tool_json = "{" + tool_json - if not tool_json.endswith("}"): - tool_json = tool_json + "}" - - try: - # 首先尝试标准JSON解析 - try: - tool_data = json.loads(tool_json) - - if isinstance(tool_data, dict) and "name" in tool_data and "arguments" in tool_data: - function_call_arr.append( - { - "name": tool_data["name"], - "arguments": tool_data["arguments"], - "_is_complete": True, # 明确标记为完整解析 - } - ) - continue - except json.JSONDecodeError: - pass - - # 标准解析失败时尝试partial_json_parser - from partial_json_parser.core.options import Allow - - try: - tool_data = {} - flags = Allow.ALL & ~Allow.STR - - # 解析name字段 - name_match = re.search(r'"name"\s*:\s*"([^"]*)"', tool_json) - if name_match: - tool_data["name"] = name_match.group(1) - - # 解析arguments字段 - args_match = re.search(r'"arguments"\s*:\s*(\{.*)', tool_json) - if args_match: - try: - tool_data["arguments"] = partial_json_parser.loads(args_match.group(1), flags=flags) - except: - tool_data["arguments"] = None - - if isinstance(tool_data, dict): - function_call_arr.append( - { - "name": tool_data.get("name", ""), - "arguments": tool_data.get("arguments", {}), - "_is_partial": True, # 标记为部分解析 - } - ) - except Exception as e: - data_processor_logger.debug(f"Failed to parse tool call: {str(e)}") - continue - except Exception as e: - data_processor_logger.debug(f"Failed to parse 
tool call: {str(e)}") - continue - - if not function_call_arr: - data_processor_logger.error("No valid tool calls found") - return ExtractedToolCallInformation(tools_called=False, content=model_output) - - tool_calls = [] - all_complete = True # 初始设为True,只要有一个不完整就变为False - - for tool_call in function_call_arr: - # 记录工具调用解析状态 - is_complete = tool_call.get("_is_complete", False) - is_partial = tool_call.get("_is_partial", False) - - # 只要有一个不完整就认为整体不完整 - if not is_complete or is_partial: - all_complete = False - - # 处理参数序列化 - tool_args = tool_call.get("arguments", {}) - if not isinstance(tool_args, dict): - tool_args = {} - - try: - args_str = json.dumps(tool_args, ensure_ascii=False) if tool_args else "{}" - except: - args_str = "{}" - - tool_calls.append( - ToolCall( - type="function", - id=random_tool_call_id(), - function=FunctionCall( - name=tool_call.get("name", ""), - arguments=args_str, - ), - ) + if self.tool_call_start_token not in extract_content: + return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) + function_call_tuples = self.tool_call_regex.findall(extract_content) + + raw_function_calls = [json.loads(match[0] if match[0] else match[1]) for match in function_call_tuples] + + tool_calls = [ + ToolCall( + type="function", + function=FunctionCall( + name=function_call["name"], + # function call args are JSON but as a string + arguments=json.dumps(function_call["arguments"], ensure_ascii=False), + ), ) - - # 只有当所有工具调用都明确标记为complete时才返回tools_called=True - return ExtractedToolCallInformation( - tools_called=all_complete, tool_calls=tool_calls if tool_calls else None, content="" - ) - - except Exception as e: - data_processor_logger.error(f"Error in extracting tool call from response: {str(e)}") - return ExtractedToolCallInformation(tools_called=False, tool_calls=None, content=model_output) + for function_call in raw_function_calls + ] + return ExtractedToolCallInformation(tools_called=True, tool_calls=tool_calls, content="") + except Exception: + data_processor_logger.error("Error in extracting tool call from response.") + return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) def extract_tool_calls_streaming( self, @@ -233,6 +116,7 @@ def extract_tool_calls_streaming( current_token_ids: Sequence[int], delta_token_ids: Sequence[int], request: dict, + model_status: str, ) -> Union[DeltaMessage, None]: if self.tool_call_start_token_id not in current_token_ids: diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py index 25834946841..041491d27cb 100644 --- a/fastdeploy/input/ernie4_5_processor.py +++ b/fastdeploy/input/ernie4_5_processor.py @@ -234,6 +234,8 @@ def process_request_dict(self, request, max_model_len=None): request["enable_thinking"] = True if self.reasoning_parser: request["model_status"] = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + if request["model_status"] == "think_start": + request["enable_thinking"] = True data_processor_logger.info(f"Processed request dict: {request}") return request @@ -310,6 +312,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id) if is_end: full_text = previous_texts + delta_text + response_dict["outputs"]["text"] = full_text if self.reasoning_parser and ( enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser" ): @@ -318,14 +321,12 @@ def process_response_dict_normal(self, response_dict, 
**kwargs): ) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content - else: - response_dict["outputs"]["text"] = full_text if self.tool_parser_obj: tool_parser = self.tool_parser_obj(self.tokenizer) - tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict) + tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict, model_status) if tool_call_info.tools_called: response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls - response_dict["outputs"]["text"] = tool_call_info.content + response_dict["outputs"]["text"] = tool_call_info.content response_dict["outputs"]["raw_prediction"] = full_text data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") del self.decode_status[req_id] @@ -377,6 +378,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): previous_token_ids + token_ids, token_ids, response_dict, + model_status, ) if tool_call_delta_message is None or tool_call_delta_message.tool_calls: response_dict["outputs"]["delta_message"] = tool_call_delta_message diff --git a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py index fc1db88679d..044f344fec7 100644 --- a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py @@ -87,9 +87,9 @@ def get_model_status(self, prompt_token_ids: list[int]): special_token_id = self.find_last_special_token(prompt_token_ids) if special_token_id == -1: - return "response_start" + return "think_start" - return self.token_status_mapping.get(special_token_id, "response_start") + return self.token_status_mapping[special_token_id] def is_reasoning_end(self, input_ids: list[int]) -> bool: return self.tool_call_start_token_id in input_ids @@ -102,67 +102,33 @@ def extract_reasoning_content_streaming( previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], + model_status: str, ) -> Union[DeltaMessage, None]: - """ - 根据用户需求实现的流式解析方法: - 1. 初始内容都视为思考内容,返回delta_text,"" - 2. 当遇到\n时检查后续是否是 - 3. 如果直接遇到也结束思考 - 4. 思考结束后检查是还是 - 5. 
对于内容,处理各种边界条件 - """ - if len(delta_token_ids) == 1 and delta_token_ids[0] == self.think_end_token_id: - return None - # 思考阶段处理 - if not previous_text.endswith(self.think_end_token) and self.think_end_token not in previous_text: - # 如果遇到\n,暂时不返回,等待下一个delta_text - if delta_text == "\n": - return None - # 如果前一个是\n且当前是,结束思考 - elif previous_text.endswith("\n") and delta_text.startswith(self.think_end_token): - return None - # 如果直接遇到也结束思考 - elif delta_text.startswith(self.think_end_token): - return None - # 否则继续返回思考内容 - return DeltaMessage(reasoning_content=delta_text) - - # 思考结束后检查是tool_call还是response - remaining_text = previous_text + delta_text - after_think = remaining_text[remaining_text.find(self.think_end_token) + len(self.think_end_token) :] - after_think = after_think.lstrip("\n") # 跳过think后的换行 - - # 处理tool_call情况 - if after_think.startswith(self.tool_call_start_token): + + if len(delta_token_ids) == 1 and delta_token_ids[0] in [ + self.think_end_token_id, + self.response_start_token_id, + self.response_end_token_id, + ]: return None - # 处理response情况 - if after_think.startswith(self.response_start_token): - # 遇到标签时不立即返回 - if delta_text == self.response_start_token: - return None - # 遇到后的换行符也不立即返回 - elif delta_text == "\n" and previous_text.endswith(self.response_start_token): - return None - # 处理回复内容中的换行符 - if delta_text == "\n": - return None - # 如果前一个是\n且当前是,结束回复 - elif previous_text.endswith("\n") and delta_text == self.response_end_token: - return None - # 如果直接遇到也结束回复 - elif delta_text == self.response_end_token: - return None - # 其他情况返回实际内容 + if model_status == "think_start": + if self.think_end_token_id not in current_token_ids: + return DeltaMessage(reasoning_content=delta_text) else: + if ( + self.response_start_token_id in current_token_ids + and self.response_end_token_id not in current_token_ids + ): + return DeltaMessage(content=delta_text) + elif model_status == "think_end": + if self.response_start_token_id in current_token_ids: return DeltaMessage(content=delta_text) + elif model_status == "response_start": + return DeltaMessage(content=delta_text) - # 默认情况不返回内容 return None - def strip_last_newline(self, content: str, end_pos: int) -> str: - return content[: end_pos - 1] if end_pos > 0 and content[end_pos - 1] == "\n" else content[:end_pos] - def extract_reasoning_content( self, model_output: str, request: ChatCompletionRequest, model_status: str ) -> Tuple[str, str]: @@ -174,32 +140,29 @@ def extract_reasoning_content( reasoning_content = "" response_content = "" - # Define helper function to strip the last newline before a closing tag if model_status == "think_start": think_end_pos = model_output.find(self.think_end_token) if think_end_pos != -1: - # Extract reasoning content - reasoning_content = self.strip_last_newline(model_output, think_end_pos) + reasoning_content = model_output[:think_end_pos] remaining = model_output[think_end_pos + len(self.think_end_token) :].lstrip("\n") # Determine if remaining content is a response or tool call if remaining.startswith(self.response_start_token): - response_start_pos = len(self.response_start_token) - response_content = self._extract_response_content(remaining[response_start_pos:]) + response_start_len = len(self.response_start_token) + response_content = self._extract_response_content(remaining[response_start_len:]) elif remaining.startswith(self.tool_call_start_token): pass # No response content else: - # No think_end_token found, treat entire output as reasoning content reasoning_content = model_output elif model_status == 
"think_end": remaining = model_output.lstrip("\n") if remaining.startswith(self.response_start_token): - response_start_pos = len(self.response_start_token) - response_content = self._extract_response_content(remaining[response_start_pos:]) + response_start_len = len(self.response_start_token) + response_content = self._extract_response_content(remaining[response_start_len:]) elif model_status == "response_start": - response_content = model_output.replace(self.response_end_token, "") + response_content = self._extract_response_content(model_output) return reasoning_content, response_content @@ -210,5 +173,5 @@ def _extract_response_content(self, remaining: str) -> str: """ response_end_pos = remaining.find(self.response_end_token) if response_end_pos != -1: - return self.strip_last_newline(remaining, response_end_pos) + return remaining[:response_end_pos] return remaining From 2f6f06324decb82086bc544655338324a70f6c6d Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Thu, 25 Sep 2025 15:03:27 +0800 Subject: [PATCH 05/32] fix parser --- fastdeploy/reasoning/ernie_x1_reasoning_parsers.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py index 044f344fec7..67028f9626c 100644 --- a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py @@ -122,10 +122,14 @@ def extract_reasoning_content_streaming( ): return DeltaMessage(content=delta_text) elif model_status == "think_end": - if self.response_start_token_id in current_token_ids: + if ( + self.response_start_token_id in current_token_ids + and self.response_end_token_id not in current_token_ids + ): return DeltaMessage(content=delta_text) elif model_status == "response_start": - return DeltaMessage(content=delta_text) + if self.response_end_token_id not in current_token_ids: + return DeltaMessage(content=delta_text) return None From 41f141829625169a1debcd86dc11925b4b56ce22 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Thu, 25 Sep 2025 15:04:54 +0800 Subject: [PATCH 06/32] fix parser --- fastdeploy/reasoning/ernie_x1_reasoning_parsers.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py index 67028f9626c..f8f33b3035d 100644 --- a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py @@ -72,14 +72,7 @@ def __init__(self, tokenizer): def find_last_special_token(self, prompt_token_ids: list[int]) -> int: for i in range(len(prompt_token_ids) - 1, -1, -1): - if prompt_token_ids[i] in [ - self.think_end_token_id, - self.think_start_token_id, - self.response_start_token_id, - self.response_end_token_id, - self.tool_call_start_token_id, - self.tool_call_end_token_id, - ]: + if prompt_token_ids[i] in self.token_status_mapping: return prompt_token_ids[i] return -1 From 300f446d8a5d2046b9f364b95e46217325403990 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Thu, 25 Sep 2025 18:11:37 +0800 Subject: [PATCH 07/32] fix parser --- .../tool_parsers/ernie_x1_tool_parser.py | 176 +++++++++++++++--- 1 file changed, 146 insertions(+), 30 deletions(-) diff --git a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py index e5df1a2e178..9b0c7b9cb5f 100644 --- 
a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py @@ -14,10 +14,18 @@ import json import re +import uuid from collections.abc import Sequence from typing import Union -from fastdeploy.entrypoints.chat_utils import random_tool_call_id +import partial_json_parser + + +def random_tool_call_id() -> str: + """Generate a random tool call ID""" + return f"chatcmpl-tool-{str(uuid.uuid4().hex)}" + + from fastdeploy.entrypoints.openai.protocol import ( ChatCompletionRequest, DeltaFunctionCall, @@ -53,8 +61,6 @@ def __init__(self, tokenizer): self.tool_call_start_token: str = "" self.tool_call_end_token: str = "" - self.tool_call_regex = re.compile(r"(.*?)|(.*)", re.DOTALL) - self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token) self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) if self.tool_call_start_token_id is None or self.tool_call_end_token_id is None: @@ -67,9 +73,7 @@ def __init__(self, tokenizer): "The model tokenizer must be passed to the ToolCallParser constructor during construction." ) - def extract_tool_calls( - self, model_output: str, request: ChatCompletionRequest, model_status: str - ) -> ExtractedToolCallInformation: + def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation: """ Extract the tool calls from a complete model response. Supports XML-style formats with newlines: @@ -81,31 +85,144 @@ def extract_tool_calls( 3. Only name and arguments field without content: {"name": "get_weather", "argume """ - extract_content = model_output - if model_status == "tool_call_start": - extract_content = "" + model_output try: - if self.tool_call_start_token not in extract_content: - return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) - function_call_tuples = self.tool_call_regex.findall(extract_content) - - raw_function_calls = [json.loads(match[0] if match[0] else match[1]) for match in function_call_tuples] - - tool_calls = [ - ToolCall( - type="function", - function=FunctionCall( - name=function_call["name"], - # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"], ensure_ascii=False), - ), + tool_calls = [] + + # Check for invalid tags before tool calls + if re.search(r"[\s\S]*?\s*(?=)", model_output): + data_processor_logger.error("Invalid format: tags found before ") + return ExtractedToolCallInformation(tools_called=False, content=model_output) + + function_call_arr = [] + remaining_text = model_output + + while True: + # 查找下一个tool_call块 + tool_call_pos = remaining_text.find("") + if tool_call_pos == -1: + break + + # 提取tool_call开始位置后的内容 + tool_content_start = tool_call_pos + len("") + tool_content_end = remaining_text.find("", tool_content_start) + + tool_json = "" + if tool_content_end == -1: + # 处理未闭合的tool_call块(截断情况) + tool_json = remaining_text[tool_content_start:].strip() + remaining_text = "" # 没有更多内容需要处理 + else: + # 处理完整的tool_call块 + tool_json = remaining_text[tool_content_start:tool_content_end].strip() + remaining_text = remaining_text[tool_content_end + len("") :] + + if not tool_json: + continue + + # 处理JSON内容 + tool_json = tool_json.strip() + if not tool_json.startswith("{"): + tool_json = "{" + tool_json + if not tool_json.endswith("}"): + tool_json = tool_json + "}" + + try: + # 首先尝试标准JSON解析 + try: + tool_data = json.loads(tool_json) + + if isinstance(tool_data, dict) and "name" in tool_data and 
"arguments" in tool_data: + function_call_arr.append( + { + "name": tool_data["name"], + "arguments": tool_data["arguments"], + "_is_complete": True, # 明确标记为完整解析 + } + ) + continue + except json.JSONDecodeError: + pass + + # 标准解析失败时尝试partial_json_parser + from partial_json_parser.core.options import Allow + + try: + tool_data = {} + flags = Allow.ALL & ~Allow.STR + + # 解析name字段 + name_match = re.search(r'"name"\s*:\s*"([^"]*)"', tool_json) + if name_match: + tool_data["name"] = name_match.group(1) + + # 解析arguments字段 + args_match = re.search(r'"arguments"\s*:\s*(\{.*)', tool_json) + if args_match: + try: + tool_data["arguments"] = partial_json_parser.loads(args_match.group(1), flags=flags) + except: + tool_data["arguments"] = None + + if isinstance(tool_data, dict): + function_call_arr.append( + { + "name": tool_data.get("name", ""), + "arguments": tool_data.get("arguments", {}), + "_is_partial": True, # 标记为部分解析 + } + ) + except Exception as e: + data_processor_logger.debug(f"Failed to parse tool call: {str(e)}") + continue + except Exception as e: + data_processor_logger.debug(f"Failed to parse tool call: {str(e)}") + continue + + if not function_call_arr: + data_processor_logger.error("No valid tool calls found") + return ExtractedToolCallInformation(tools_called=False, content=model_output) + + tool_calls = [] + all_complete = True # 初始设为True,只要有一个不完整就变为False + + for tool_call in function_call_arr: + # 记录工具调用解析状态 + is_complete = tool_call.get("_is_complete", False) + is_partial = tool_call.get("_is_partial", False) + + # 只要有一个不完整就认为整体不完整 + if not is_complete or is_partial: + all_complete = False + + # 处理参数序列化 + tool_args = tool_call.get("arguments", {}) + if not isinstance(tool_args, dict): + tool_args = {} + + try: + args_str = json.dumps(tool_args, ensure_ascii=False) if tool_args else "{}" + except: + args_str = "{}" + + tool_calls.append( + ToolCall( + type="function", + id=random_tool_call_id(), + function=FunctionCall( + name=tool_call.get("name", ""), + arguments=args_str, + ), + ) ) - for function_call in raw_function_calls - ] - return ExtractedToolCallInformation(tools_called=True, tool_calls=tool_calls, content="") - except Exception: - data_processor_logger.error("Error in extracting tool call from response.") - return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) + + # 只有当所有工具调用都明确标记为complete时才返回tools_called=True + return ExtractedToolCallInformation( + tools_called=all_complete, tool_calls=tool_calls if tool_calls else None, content="" + ) + + except Exception as e: + data_processor_logger.error(f"Error in extracting tool call from response: {str(e)}") + return ExtractedToolCallInformation(tools_called=False, tool_calls=None, content=model_output) def extract_tool_calls_streaming( self, @@ -116,7 +233,6 @@ def extract_tool_calls_streaming( current_token_ids: Sequence[int], delta_token_ids: Sequence[int], request: dict, - model_status: str, ) -> Union[DeltaMessage, None]: if self.tool_call_start_token_id not in current_token_ids: From 3b936726ed51165722a4dd1ba9524860691b90e3 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Thu, 25 Sep 2025 19:33:34 +0800 Subject: [PATCH 08/32] Revert "fix parser" This reverts commit 300f446d8a5d2046b9f364b95e46217325403990. 
--- .../tool_parsers/ernie_x1_tool_parser.py | 176 +++--------------- 1 file changed, 30 insertions(+), 146 deletions(-) diff --git a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py index 9b0c7b9cb5f..e5df1a2e178 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py @@ -14,18 +14,10 @@ import json import re -import uuid from collections.abc import Sequence from typing import Union -import partial_json_parser - - -def random_tool_call_id() -> str: - """Generate a random tool call ID""" - return f"chatcmpl-tool-{str(uuid.uuid4().hex)}" - - +from fastdeploy.entrypoints.chat_utils import random_tool_call_id from fastdeploy.entrypoints.openai.protocol import ( ChatCompletionRequest, DeltaFunctionCall, @@ -61,6 +53,8 @@ def __init__(self, tokenizer): self.tool_call_start_token: str = "" self.tool_call_end_token: str = "" + self.tool_call_regex = re.compile(r"(.*?)|(.*)", re.DOTALL) + self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token) self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) if self.tool_call_start_token_id is None or self.tool_call_end_token_id is None: @@ -73,7 +67,9 @@ def __init__(self, tokenizer): "The model tokenizer must be passed to the ToolCallParser constructor during construction." ) - def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation: + def extract_tool_calls( + self, model_output: str, request: ChatCompletionRequest, model_status: str + ) -> ExtractedToolCallInformation: """ Extract the tool calls from a complete model response. Supports XML-style formats with newlines: @@ -85,144 +81,31 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) 3. 
Only name and arguments field without content: {"name": "get_weather", "argume """ + extract_content = model_output + if model_status == "tool_call_start": + extract_content = "" + model_output try: - tool_calls = [] - - # Check for invalid tags before tool calls - if re.search(r"[\s\S]*?\s*(?=)", model_output): - data_processor_logger.error("Invalid format: tags found before ") - return ExtractedToolCallInformation(tools_called=False, content=model_output) - - function_call_arr = [] - remaining_text = model_output - - while True: - # 查找下一个tool_call块 - tool_call_pos = remaining_text.find("") - if tool_call_pos == -1: - break - - # 提取tool_call开始位置后的内容 - tool_content_start = tool_call_pos + len("") - tool_content_end = remaining_text.find("", tool_content_start) - - tool_json = "" - if tool_content_end == -1: - # 处理未闭合的tool_call块(截断情况) - tool_json = remaining_text[tool_content_start:].strip() - remaining_text = "" # 没有更多内容需要处理 - else: - # 处理完整的tool_call块 - tool_json = remaining_text[tool_content_start:tool_content_end].strip() - remaining_text = remaining_text[tool_content_end + len("") :] - - if not tool_json: - continue - - # 处理JSON内容 - tool_json = tool_json.strip() - if not tool_json.startswith("{"): - tool_json = "{" + tool_json - if not tool_json.endswith("}"): - tool_json = tool_json + "}" - - try: - # 首先尝试标准JSON解析 - try: - tool_data = json.loads(tool_json) - - if isinstance(tool_data, dict) and "name" in tool_data and "arguments" in tool_data: - function_call_arr.append( - { - "name": tool_data["name"], - "arguments": tool_data["arguments"], - "_is_complete": True, # 明确标记为完整解析 - } - ) - continue - except json.JSONDecodeError: - pass - - # 标准解析失败时尝试partial_json_parser - from partial_json_parser.core.options import Allow - - try: - tool_data = {} - flags = Allow.ALL & ~Allow.STR - - # 解析name字段 - name_match = re.search(r'"name"\s*:\s*"([^"]*)"', tool_json) - if name_match: - tool_data["name"] = name_match.group(1) - - # 解析arguments字段 - args_match = re.search(r'"arguments"\s*:\s*(\{.*)', tool_json) - if args_match: - try: - tool_data["arguments"] = partial_json_parser.loads(args_match.group(1), flags=flags) - except: - tool_data["arguments"] = None - - if isinstance(tool_data, dict): - function_call_arr.append( - { - "name": tool_data.get("name", ""), - "arguments": tool_data.get("arguments", {}), - "_is_partial": True, # 标记为部分解析 - } - ) - except Exception as e: - data_processor_logger.debug(f"Failed to parse tool call: {str(e)}") - continue - except Exception as e: - data_processor_logger.debug(f"Failed to parse tool call: {str(e)}") - continue - - if not function_call_arr: - data_processor_logger.error("No valid tool calls found") - return ExtractedToolCallInformation(tools_called=False, content=model_output) - - tool_calls = [] - all_complete = True # 初始设为True,只要有一个不完整就变为False - - for tool_call in function_call_arr: - # 记录工具调用解析状态 - is_complete = tool_call.get("_is_complete", False) - is_partial = tool_call.get("_is_partial", False) - - # 只要有一个不完整就认为整体不完整 - if not is_complete or is_partial: - all_complete = False - - # 处理参数序列化 - tool_args = tool_call.get("arguments", {}) - if not isinstance(tool_args, dict): - tool_args = {} - - try: - args_str = json.dumps(tool_args, ensure_ascii=False) if tool_args else "{}" - except: - args_str = "{}" - - tool_calls.append( - ToolCall( - type="function", - id=random_tool_call_id(), - function=FunctionCall( - name=tool_call.get("name", ""), - arguments=args_str, - ), - ) + if self.tool_call_start_token not in extract_content: + return 
ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) + function_call_tuples = self.tool_call_regex.findall(extract_content) + + raw_function_calls = [json.loads(match[0] if match[0] else match[1]) for match in function_call_tuples] + + tool_calls = [ + ToolCall( + type="function", + function=FunctionCall( + name=function_call["name"], + # function call args are JSON but as a string + arguments=json.dumps(function_call["arguments"], ensure_ascii=False), + ), ) - - # 只有当所有工具调用都明确标记为complete时才返回tools_called=True - return ExtractedToolCallInformation( - tools_called=all_complete, tool_calls=tool_calls if tool_calls else None, content="" - ) - - except Exception as e: - data_processor_logger.error(f"Error in extracting tool call from response: {str(e)}") - return ExtractedToolCallInformation(tools_called=False, tool_calls=None, content=model_output) + for function_call in raw_function_calls + ] + return ExtractedToolCallInformation(tools_called=True, tool_calls=tool_calls, content="") + except Exception: + data_processor_logger.error("Error in extracting tool call from response.") + return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) def extract_tool_calls_streaming( self, @@ -233,6 +116,7 @@ def extract_tool_calls_streaming( current_token_ids: Sequence[int], delta_token_ids: Sequence[int], request: dict, + model_status: str, ) -> Union[DeltaMessage, None]: if self.tool_call_start_token_id not in current_token_ids: From dae8419978ea86da972b4864da3190d1ef752996 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Thu, 25 Sep 2025 22:03:43 +0800 Subject: [PATCH 09/32] fix parser --- .../openai/tool_parsers/ernie_x1_tool_parser.py | 16 ++++------------ fastdeploy/input/ernie4_5_processor.py | 5 ++--- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py index e5df1a2e178..a22ed9a0a34 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py @@ -58,18 +58,14 @@ def __init__(self, tokenizer): self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token) self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) if self.tool_call_start_token_id is None or self.tool_call_end_token_id is None: - raise RuntimeError( - "Hermes 2 Pro Tool parser could not locate tool call start/end " "tokens in the tokenizer!" - ) + raise RuntimeError("Ernie x1 Tool parser could not locate tool call start/end " "tokens in the tokenizer!") if not self.model_tokenizer: raise ValueError( "The model tokenizer must be passed to the ToolCallParser constructor during construction." ) - def extract_tool_calls( - self, model_output: str, request: ChatCompletionRequest, model_status: str - ) -> ExtractedToolCallInformation: + def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation: """ Extract the tool calls from a complete model response. Supports XML-style formats with newlines: @@ -81,13 +77,10 @@ def extract_tool_calls( 3. 
Only name and arguments field without content: {"name": "get_weather", "argume """ - extract_content = model_output - if model_status == "tool_call_start": - extract_content = "" + model_output try: - if self.tool_call_start_token not in extract_content: + if self.tool_call_start_token not in model_output: return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) - function_call_tuples = self.tool_call_regex.findall(extract_content) + function_call_tuples = self.tool_call_regex.findall(model_output) raw_function_calls = [json.loads(match[0] if match[0] else match[1]) for match in function_call_tuples] @@ -116,7 +109,6 @@ def extract_tool_calls_streaming( current_token_ids: Sequence[int], delta_token_ids: Sequence[int], request: dict, - model_status: str, ) -> Union[DeltaMessage, None]: if self.tool_call_start_token_id not in current_token_ids: diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py index 041491d27cb..38db110396a 100644 --- a/fastdeploy/input/ernie4_5_processor.py +++ b/fastdeploy/input/ernie4_5_processor.py @@ -323,10 +323,10 @@ def process_response_dict_normal(self, response_dict, **kwargs): response_dict["outputs"]["reasoning_content"] = reasoning_content if self.tool_parser_obj: tool_parser = self.tool_parser_obj(self.tokenizer) - tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict, model_status) + tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict) if tool_call_info.tools_called: response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls - response_dict["outputs"]["text"] = tool_call_info.content + response_dict["outputs"]["text"] = tool_call_info.content response_dict["outputs"]["raw_prediction"] = full_text data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") del self.decode_status[req_id] @@ -378,7 +378,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs): previous_token_ids + token_ids, token_ids, response_dict, - model_status, ) if tool_call_delta_message is None or tool_call_delta_message.tool_calls: response_dict["outputs"]["delta_message"] = tool_call_delta_message From e49676cdf6af157e37121d8bb59e941fe7e47cb7 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Fri, 26 Sep 2025 17:43:31 +0800 Subject: [PATCH 10/32] fix --- fastdeploy/engine/request.py | 3 +- fastdeploy/entrypoints/openai/serving_chat.py | 9 +-- fastdeploy/input/ernie4_5_processor.py | 23 +++++-- fastdeploy/input/text_processor.py | 4 ++ .../reasoning/ernie_vl_reasoning_parsers.py | 62 +++++++++++-------- .../reasoning/qwen3_reasoning_parsers.py | 6 +- 6 files changed, 67 insertions(+), 40 deletions(-) diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index 3906cd29b5f..d65c653c2af 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -71,7 +71,8 @@ def __init__( guided_grammar: Optional[Any] = None, structural_tag: Optional[Any] = None, guided_json_object: Optional[bool] = None, - enable_thinking: Optional[bool] = True, + enable_thinking: Optional[bool] = False, + model_status: Optional[str] = None, trace_carrier: dict = dict(), dp_rank: Optional[int] = None, chat_template: Optional[str] = None, diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 8922d7a7e8e..36f5a97c530 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -120,7 +120,6 
@@ async def create_chat_completion(self, request: ChatCompletionRequest): text_after_process = current_req_dict.get("text_after_process") if isinstance(prompt_token_ids, np.ndarray): prompt_token_ids = prompt_token_ids.tolist() - model_status = current_req_dict.get("model_status") except ParameterError as e: api_server_logger.error(f"request[{request_id}] generator error: {str(e)}, {e.message}") self.engine_client.semaphore.release() @@ -136,12 +135,12 @@ async def create_chat_completion(self, request: ChatCompletionRequest): if request.stream: return self.chat_completion_stream_generator( - request, request_id, request.model, prompt_token_ids, text_after_process, model_status + request, request_id, request.model, prompt_token_ids, text_after_process ) else: try: return await self.chat_completion_full_generator( - request, request_id, request.model, prompt_token_ids, text_after_process, model_status + request, request_id, request.model, prompt_token_ids, text_after_process ) except Exception as e: error_msg = f"request[{request_id}]full generator error: {str(e)}, {str(traceback.format_exc())}" @@ -169,7 +168,6 @@ async def chat_completion_stream_generator( model_name: str, prompt_token_ids: list(), text_after_process: str, - model_status: str, ): """ Streaming chat completion generator. @@ -240,7 +238,6 @@ async def chat_completion_stream_generator( generator = response_processor.process_response_chat( response, stream=True, - model_status=model_status, include_stop_str_in_output=include_stop_str_in_output, ) @@ -410,7 +407,6 @@ async def chat_completion_full_generator( model_name: str, prompt_token_ids: list(), text_after_process: str, - model_status: str, ): """ Full chat completion generator. @@ -460,7 +456,6 @@ async def chat_completion_full_generator( generator = response_processor.process_response_chat( response, stream=False, - model_status=model_status, include_stop_str_in_output=include_stop_str_in_output, ) async for data in generator: diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py index 38db110396a..bc7bd6c4657 100644 --- a/fastdeploy/input/ernie4_5_processor.py +++ b/fastdeploy/input/ernie4_5_processor.py @@ -60,6 +60,7 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob self.decode_status = dict() self.tool_parser_dict = dict() self.thinking_parser_dict = dict() + self.model_status_dict = dict() self._load_tokenizer() data_processor_logger.info( f"tokenizer information: bos_token is {self.tokenizer.bos_token} \ @@ -154,6 +155,12 @@ def process_request(self, request, max_model_len=None, **kwargs): request.set("top_p", _SAMPLING_EPS) if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser": request.enable_thinking = True + if self.reasoning_parser: + self.model_status_dict[request.request_id] = self.reasoning_parser.get_model_status( + request.prompt_token_ids + ) + if self.model_status_dict[request.request_id] == "think_start": + request.enable_thinking = True data_processor_logger.info(f"Processed request: {request}") return request @@ -233,8 +240,8 @@ def process_request_dict(self, request, max_model_len=None): if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser": request["enable_thinking"] = True if self.reasoning_parser: - request["model_status"] = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) - if request["model_status"] == "think_start": + self.model_status_dict["request_id"] = 
self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + if self.model_status_dict["request_id"] == "think_start": request["enable_thinking"] = True data_processor_logger.info(f"Processed request dict: {request}") return request @@ -274,6 +281,8 @@ def process_response(self, response_dict, **kwargs): data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}") if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "": return None + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] return response_dict def process_response_dict(self, response_dict, stream, **kwargs): @@ -302,7 +311,6 @@ def process_response_dict_normal(self, response_dict, **kwargs): Dict: response contain text fields """ enable_thinking = kwargs.get("enable_thinking") - model_status = kwargs.get("model_status") token_ids = response_dict["outputs"]["token_ids"] is_end = response_dict["finished"] req_id = response_dict["request_id"] @@ -317,7 +325,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser" ): reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict, model_status + full_text, response_dict, self.model_status_dict.get(req_id) ) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content @@ -330,6 +338,8 @@ def process_response_dict_normal(self, response_dict, **kwargs): response_dict["outputs"]["raw_prediction"] = full_text data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") del self.decode_status[req_id] + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] return response_dict def process_response_dict_streaming(self, response_dict, **kwargs): @@ -343,7 +353,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs): Dict: response contain text fields """ enable_thinking = kwargs.get("enable_thinking") - model_status = kwargs.get("model_status") is_end = response_dict["finished"] req_id = response_dict["request_id"] token_ids = response_dict["outputs"]["token_ids"] @@ -363,7 +372,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): previous_token_ids, previous_token_ids + token_ids, token_ids, - model_status, + self.model_status_dict.get(req_id), ) response_dict["outputs"]["delta_message"] = reasoning_delta_message if self.tool_parser_obj: @@ -387,6 +396,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs): del self.decode_status[req_id] if req_id in self.tool_parser_dict: del self.tool_parser_dict[req_id] + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] return response_dict def messages2ids(self, request_or_messages): diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index 97aac5cf6f2..a914dec30b1 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -265,6 +265,10 @@ def process_request(self, request, max_model_len=None, **kwargs): request.set("temperature", 1) if request.get("top_p") < _SAMPLING_EPS: request.set("top_p", _SAMPLING_EPS) + if self.reasoning_parser: + request.model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) + if request.model_status == "think_start": + request.enable_thinking = True data_processor_logger.info(f"Processed request: {request}") return request diff --git 
a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py index 7806658d3c2..fe44fd47e82 100644 --- a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py @@ -35,38 +35,47 @@ class ErnieVLReasoningParser(ReasoningParser): def __init__(self, tokenizer): super().__init__(tokenizer) - self.think_start_token = "" - self.think_end_token = "" + token_definitions = { + "think_start_token": "", + "think_end_token": "", + } if not self.model_tokenizer: - raise ValueError( - "The model tokenizer must be passed to the ReasoningParser " "constructor during construction." + raise ValueError("The model tokenizer must be passed to the ReasoningParser constructor.") + + missing_tokens = [] + for name, token_value in token_definitions.items(): + setattr(self, name, token_value) + token_id = self.vocab.get(token_value) + setattr(self, f"{name}_id", token_id) + if token_id is None: + missing_tokens.append(f"{name.replace('_', ' ')} token") + + if missing_tokens: + raise RuntimeError( + f"Could not find the following token ids in tokenizer vocabulary: {', '.join(missing_tokens)}" ) - - self.think_end_token_id = self.vocab.get(self.think_end_token) - if self.think_end_token_id is None: - raise RuntimeError("Ernie VL reasoning parser could not locate think end " "tokens in the tokenizer!") - self.think_start_token_id = self.vocab.get(self.think_start_token) + self.token_status_mapping = { + self.think_start_token_id: "think_start", + self.think_end_token_id: "think_end", + } def is_reasoning_end(self, input_ids: list[int]) -> bool: return self.think_end_token_id in input_ids def find_last_special_token(self, prompt_token_ids: list[int]) -> int: for i in range(len(prompt_token_ids) - 1, -1, -1): - if prompt_token_ids[i] in [self.think_end_token_id, self.think_start_token_id]: + if prompt_token_ids[i] in self.token_status_mapping: return prompt_token_ids[i] return -1 def get_model_status(self, prompt_token_ids: list[int]): special_token_id = self.find_last_special_token(prompt_token_ids) + if special_token_id == -1: - return "responding" - if special_token_id == self.think_end_token_id: - return "responding" - if self.think_start_token_id == special_token_id: - return "thinking" + return "think_start" - return "responding" + return self.token_status_mapping[special_token_id] def extract_reasoning_content_streaming( self, @@ -89,15 +98,18 @@ def extract_reasoning_content_streaming( # Skip single special tokens if len(delta_token_ids) == 1 and delta_token_ids[0] == self.think_end_token_id: return None - if self.think_end_token_id in delta_token_ids: - end_index = delta_text.find(self.end_token) - reasoning_content = delta_text[:end_index] - content = delta_text[end_index + len(self.end_token) :] - return DeltaMessage(reasoning_content=reasoning_content, content=content) - elif self.think_end_token_id in previous_token_ids: - return DeltaMessage(content=delta_text) + if model_status == "think_start": + if self.think_end_token_id in delta_token_ids: + end_index = delta_text.find(self.end_token) + reasoning_content = delta_text[:end_index] + content = delta_text[end_index + len(self.end_token) :] + return DeltaMessage(reasoning_content=reasoning_content, content=content) + elif self.think_end_token_id in previous_token_ids: + return DeltaMessage(content=delta_text) + else: + return DeltaMessage(reasoning_content=delta_text) else: - return DeltaMessage(reasoning_content=delta_text) + return 
DeltaMessage(content=delta_text) def extract_reasoning_content( self, @@ -117,7 +129,7 @@ def extract_reasoning_content( """ # Check if the model output contains the tokens. - if model_status == "thinking": + if model_status == "think_start": if self.think_end_token not in model_output: return model_output, "" reasoning_content, _, content = model_output.partition(self.think_end_token) diff --git a/fastdeploy/reasoning/qwen3_reasoning_parsers.py b/fastdeploy/reasoning/qwen3_reasoning_parsers.py index 463cab83df3..24c72a53a4e 100644 --- a/fastdeploy/reasoning/qwen3_reasoning_parsers.py +++ b/fastdeploy/reasoning/qwen3_reasoning_parsers.py @@ -51,6 +51,9 @@ def __init__(self, tokenizer): def is_reasoning_end(self, input_ids: list[int]) -> bool: return self.think_end_token_id in input_ids + def get_model_status(self, prompt_token_ids: list[int]): + return "think_start" + def extract_reasoning_content_streaming( self, previous_text: str, @@ -59,6 +62,7 @@ def extract_reasoning_content_streaming( previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], + model_status: str, ) -> Union[DeltaMessage, None]: """ Extract reasoning content from a delta message. @@ -103,7 +107,7 @@ def extract_reasoning_content_streaming( return DeltaMessage(reasoning_content=delta_text) def extract_reasoning_content( - self, model_output: str, request: ChatCompletionRequest + self, model_output: str, request: ChatCompletionRequest, model_status: str ) -> tuple[Optional[str], Optional[str]]: """ Extract reasoning content from the model output. From 2c92f6fe6e92459e97d32c3ed4f0e66bd9bfdc1d Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Fri, 26 Sep 2025 18:10:06 +0800 Subject: [PATCH 11/32] fix --- .../entrypoints/openai/response_processors.py | 6 +-- fastdeploy/input/ernie4_5_processor.py | 23 ++++----- .../ernie4_5_vl_processor.py | 27 +++-------- fastdeploy/input/text_processor.py | 48 +++++++++++-------- 4 files changed, 44 insertions(+), 60 deletions(-) diff --git a/fastdeploy/entrypoints/openai/response_processors.py b/fastdeploy/entrypoints/openai/response_processors.py index 22bfbf63213..0640ec99859 100644 --- a/fastdeploy/entrypoints/openai/response_processors.py +++ b/fastdeploy/entrypoints/openai/response_processors.py @@ -67,13 +67,12 @@ def accumulate_token_ids(self, request_output): else: self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output}) - async def process_response_chat(self, request_outputs, stream, model_status, include_stop_str_in_output): + async def process_response_chat(self, request_outputs, stream, include_stop_str_in_output): """ Process a list of responses into a generator that yields each processed response as it's generated. Args: request_outputs: The list of outputs to be processed. stream: Whether or not to stream the output. - model_status: Whether or not to show thinking messages. include_stop_str_in_output: Whether or not to include stop strings in the output. 
""" for request_output in request_outputs: @@ -82,7 +81,6 @@ async def process_response_chat(self, request_outputs, stream, model_status, inc yield self.data_processor.process_response_dict( response_dict=request_output, stream=stream, - model_status=model_status, include_stop_str_in_output=include_stop_str_in_output, ) elif stream: @@ -108,7 +106,6 @@ async def process_response_chat(self, request_outputs, stream, model_status, inc self.data_processor.process_response_dict( response_dict=request_output, stream=stream, - model_status=model_status, include_stop_str_in_output=include_stop_str_in_output, ) text = {"type": "text", "text": request_output["outputs"]["text"]} @@ -128,7 +125,6 @@ async def process_response_chat(self, request_outputs, stream, model_status, inc self.data_processor.process_response_dict( response_dict=part["request_output"], stream=False, - model_status=model_status, include_stop_str_in_output=include_stop_str_in_output, ) text = {"type": "text", "text": part["request_output"]["outputs"]["text"]} diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py index bc7bd6c4657..472efdf1fc0 100644 --- a/fastdeploy/input/ernie4_5_processor.py +++ b/fastdeploy/input/ernie4_5_processor.py @@ -240,8 +240,10 @@ def process_request_dict(self, request, max_model_len=None): if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser": request["enable_thinking"] = True if self.reasoning_parser: - self.model_status_dict["request_id"] = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) - if self.model_status_dict["request_id"] == "think_start": + self.model_status_dict[request["request_id"]] = self.reasoning_parser.get_model_status( + request["prompt_token_ids"] + ) + if self.model_status_dict[request["request_id"]] == "think_start": request["enable_thinking"] = True data_processor_logger.info(f"Processed request dict: {request}") return request @@ -256,7 +258,6 @@ def process_response(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - model_status = kwargs.get("model_status") req_id = response_dict.request_id token_ids = response_dict.outputs.token_ids @@ -266,7 +267,7 @@ def process_response(self, response_dict, **kwargs): full_text = self.tokenizer.decode(token_ids) if self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict, model_status + full_text, response_dict, self.model_status_dict[req_id] ) response_dict.outputs.text = text response_dict.outputs.reasoning_content = reasoning_content @@ -310,7 +311,6 @@ def process_response_dict_normal(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") token_ids = response_dict["outputs"]["token_ids"] is_end = response_dict["finished"] req_id = response_dict["request_id"] @@ -321,11 +321,9 @@ def process_response_dict_normal(self, response_dict, **kwargs): if is_end: full_text = previous_texts + delta_text response_dict["outputs"]["text"] = full_text - if self.reasoning_parser and ( - enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser" - ): + if self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict, self.model_status_dict.get(req_id) + full_text, response_dict, self.model_status_dict[req_id] ) response_dict["outputs"]["text"] = text 
response_dict["outputs"]["reasoning_content"] = reasoning_content @@ -352,7 +350,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") is_end = response_dict["finished"] req_id = response_dict["request_id"] token_ids = response_dict["outputs"]["token_ids"] @@ -362,9 +359,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) response_dict["outputs"]["raw_prediction"] = delta_text - if self.reasoning_parser and ( - enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser" - ): + if self.reasoning_parser: reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming( previous_texts, previous_texts + delta_text, @@ -372,7 +367,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): previous_token_ids, previous_token_ids + token_ids, token_ids, - self.model_status_dict.get(req_id), + self.model_status_dict[req_id], ) response_dict["outputs"]["delta_message"] = reasoning_delta_message if self.tool_parser_obj: diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py index a13bf68b765..f05184edd10 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py +++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py @@ -54,6 +54,7 @@ def __init__( self.tool_parser_dict = dict() self.decode_status = dict() + self.model_status_dict = dict() self._load_tokenizer() # Generation config @@ -255,8 +256,12 @@ def process_request_dict(self, request, max_model_len=None): request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"])) data_processor_logger.info(f"Processed request {request}") - if self.reasoning_parser is not None: - request["model_status"] = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + if self.reasoning_parser: + self.model_status_dict[request.request_id] = self.reasoning_parser.get_model_status( + request.prompt_token_ids + ) + if self.model_status_dict[request.request_id] == "think_start": + request.enable_thinking = True return request @@ -290,21 +295,3 @@ def pack_outputs(self, outs): outs["position_ids"] = np.array(outs["position_ids"], dtype=np.int64) return outs - - def process_response_dict(self, response_dict, stream, **kwargs): - """ - Preprocess the response - - Args: - response_dict (Dict): response for engine, contain ids fields - - Returns: - Dict: response contain text fields - """ - enable_thinking = kwargs.pop("enable_thinking", True) - if enable_thinking is None: - enable_thinking = True - if stream: - return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) - else: - return self.process_response_dict_normal(response_dict, enable_thinking=enable_thinking, **kwargs) diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index a914dec30b1..cc09e858350 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -175,6 +175,7 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob self.generation_config = None self.decode_status = dict() + self.model_status_dict = dict() self.tool_parser_dict = dict() self.tokenizer = self._load_tokenizer() data_processor_logger.info( @@ -266,8 
+267,10 @@ def process_request(self, request, max_model_len=None, **kwargs): if request.get("top_p") < _SAMPLING_EPS: request.set("top_p", _SAMPLING_EPS) if self.reasoning_parser: - request.model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) - if request.model_status == "think_start": + self.model_status_dict[request.request_id] = self.reasoning_parser.get_model_status( + request.prompt_token_ids + ) + if self.model_status_dict[request.request_id] == "think_start": request.enable_thinking = True data_processor_logger.info(f"Processed request: {request}") @@ -343,6 +346,12 @@ def process_request_dict(self, request, max_model_len=None, **kwargs): request["temperature"] = 1 if request.get("top_p") < _SAMPLING_EPS: request["top_p"] = _SAMPLING_EPS + if self.reasoning_parser: + self.model_status_dict[request["request_id"]] = self.reasoning_parser.get_model_status( + request["prompt_token_ids"] + ) + if self.model_status_dict[request["request_id"]] == "think_start": + request["enable_thinking"] = True data_processor_logger.info(f"Processed request dict: {request}") return request @@ -366,21 +375,22 @@ def process_response(self, response_dict, **kwargs): if token_ids[-1] == self.tokenizer.eos_token_id: token_ids = token_ids[:-1] full_text = self.tokenizer.decode(token_ids) - + response_dict.outputs.text = full_text # 模型支持思考,并且支持思考 if self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) + reasoning_content, text = self.reasoning_parser.extract_reasoning_content( + full_text, response_dict, self.model_status_dict[req_id] + ) response_dict.outputs.text = text response_dict.outputs.reasoning_content = reasoning_content - else: - # 模型不支持思考,并且没单独设置enable_thinking为false - response_dict.outputs.text = full_text if self.tool_parser_obj: tool_parser = self.tool_parser_obj(self.tokenizer) tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict) if tool_call_info.tools_called: response_dict.outputs.tool_calls = tool_call_info.tool_calls response_dict.outputs.text = tool_call_info.content + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}") return response_dict @@ -395,7 +405,6 @@ def process_response_dict_normal(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") token_ids = response_dict["outputs"]["token_ids"] is_end = response_dict["finished"] req_id = response_dict["request_id"] @@ -406,12 +415,13 @@ def process_response_dict_normal(self, response_dict, **kwargs): if is_end: full_text = previous_texts + delta_text response_dict["outputs"]["raw_prediction"] = full_text - if enable_thinking and self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) + response_dict["outputs"]["text"] = full_text + if self.reasoning_parser: + reasoning_content, text = self.reasoning_parser.extract_reasoning_content( + full_text, response_dict, self.model_status_dict[req_id] + ) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content - else: - response_dict["outputs"]["text"] = full_text if self.tool_parser_obj: tool_parser = self.tool_parser_obj(self.tokenizer) tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict) @@ -432,7 +442,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs): 
Returns: Dict: response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") is_end = response_dict["finished"] req_id = response_dict["request_id"] token_ids = response_dict["outputs"]["token_ids"] @@ -442,9 +451,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) response_dict["outputs"]["raw_prediction"] = delta_text - if self.reasoning_parser and ( - enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser" - ): + if self.reasoning_parser: reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming( previous_texts, previous_texts + delta_text, @@ -452,6 +459,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): previous_token_ids, previous_token_ids + token_ids, token_ids, + self.model_status_dict[req_id], ) response_dict["outputs"]["delta_message"] = reasoning_delta_message if self.tool_parser_obj: @@ -475,6 +483,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs): del self.decode_status[req_id] if req_id in self.tool_parser_dict: del self.tool_parser_dict[req_id] + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] return response_dict def process_response_dict(self, response_dict, **kwargs): @@ -487,16 +497,12 @@ def process_response_dict(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.pop("enable_thinking", True) - if enable_thinking is None: - enable_thinking = True stream = kwargs.get("stream", True) if stream: - return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) + return self.process_response_dict_streaming(response_dict, **kwargs) else: return self.process_response_dict_normal( response_dict=response_dict, - enable_thinking=enable_thinking, **kwargs, ) From c433e0540ebcbe90d816d97d3652b573ea877c87 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Fri, 26 Sep 2025 18:27:21 +0800 Subject: [PATCH 12/32] fix --- fastdeploy/input/ernie4_5_processor.py | 8 +++----- .../input/ernie4_5_vl_processor/ernie4_5_vl_processor.py | 8 +++----- fastdeploy/input/text_processor.py | 9 +++------ 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py index 472efdf1fc0..7d5781d2988 100644 --- a/fastdeploy/input/ernie4_5_processor.py +++ b/fastdeploy/input/ernie4_5_processor.py @@ -240,11 +240,9 @@ def process_request_dict(self, request, max_model_len=None): if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser": request["enable_thinking"] = True if self.reasoning_parser: - self.model_status_dict[request["request_id"]] = self.reasoning_parser.get_model_status( - request["prompt_token_ids"] - ) - if self.model_status_dict[request["request_id"]] == "think_start": - request["enable_thinking"] = True + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + self.model_status_dict[request["request_id"]] = model_status + request["enable_thinking"] = model_status == "think_start" data_processor_logger.info(f"Processed request dict: {request}") return request diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py index f05184edd10..c6933908f25 100644 --- 
a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py +++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py @@ -257,11 +257,9 @@ def process_request_dict(self, request, max_model_len=None): data_processor_logger.info(f"Processed request {request}") if self.reasoning_parser: - self.model_status_dict[request.request_id] = self.reasoning_parser.get_model_status( - request.prompt_token_ids - ) - if self.model_status_dict[request.request_id] == "think_start": - request.enable_thinking = True + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + self.model_status_dict[request["request_id"]] = model_status + request["enable_thinking"] = model_status == "think_start" return request diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index cc09e858350..40e9feb9924 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -347,11 +347,9 @@ def process_request_dict(self, request, max_model_len=None, **kwargs): if request.get("top_p") < _SAMPLING_EPS: request["top_p"] = _SAMPLING_EPS if self.reasoning_parser: - self.model_status_dict[request["request_id"]] = self.reasoning_parser.get_model_status( - request["prompt_token_ids"] - ) - if self.model_status_dict[request["request_id"]] == "think_start": - request["enable_thinking"] = True + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + self.model_status_dict[request["request_id"]] = model_status + request["enable_thinking"] = model_status == "think_start" data_processor_logger.info(f"Processed request dict: {request}") return request @@ -376,7 +374,6 @@ def process_response(self, response_dict, **kwargs): token_ids = token_ids[:-1] full_text = self.tokenizer.decode(token_ids) response_dict.outputs.text = full_text - # 模型支持思考,并且支持思考 if self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content( full_text, response_dict, self.model_status_dict[req_id] From bfdec9ffb5f346bce9a0ea4762e4816bbcf0e251 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Fri, 26 Sep 2025 18:55:00 +0800 Subject: [PATCH 13/32] fix --- fastdeploy/engine/request.py | 1 - fastdeploy/input/ernie4_5_processor.py | 8 +++----- fastdeploy/input/text_processor.py | 8 +++----- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index d65c653c2af..f24a9b463b0 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -72,7 +72,6 @@ def __init__( structural_tag: Optional[Any] = None, guided_json_object: Optional[bool] = None, enable_thinking: Optional[bool] = False, - model_status: Optional[str] = None, trace_carrier: dict = dict(), dp_rank: Optional[int] = None, chat_template: Optional[str] = None, diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py index 7d5781d2988..cba81f309f8 100644 --- a/fastdeploy/input/ernie4_5_processor.py +++ b/fastdeploy/input/ernie4_5_processor.py @@ -156,11 +156,9 @@ def process_request(self, request, max_model_len=None, **kwargs): if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser": request.enable_thinking = True if self.reasoning_parser: - self.model_status_dict[request.request_id] = self.reasoning_parser.get_model_status( - request.prompt_token_ids - ) - if self.model_status_dict[request.request_id] == "think_start": - request.enable_thinking = True + model_status = 
self.reasoning_parser.get_model_status(request.prompt_token_ids) + self.model_status_dict[request.request_id] = model_status + request.enable_thinking = model_status == "think_start" data_processor_logger.info(f"Processed request: {request}") return request diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index 40e9feb9924..cd1aba10624 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -267,11 +267,9 @@ def process_request(self, request, max_model_len=None, **kwargs): if request.get("top_p") < _SAMPLING_EPS: request.set("top_p", _SAMPLING_EPS) if self.reasoning_parser: - self.model_status_dict[request.request_id] = self.reasoning_parser.get_model_status( - request.prompt_token_ids - ) - if self.model_status_dict[request.request_id] == "think_start": - request.enable_thinking = True + model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) + self.model_status_dict[request.request_id] = model_status + request.enable_thinking = model_status == "think_start" data_processor_logger.info(f"Processed request: {request}") return request From bd192b2af3ddf4a9189df77d089807175ccf7c5a Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Sun, 28 Sep 2025 19:59:40 +0800 Subject: [PATCH 14/32] fix parser --- .../tool_parsers/ernie_x1_tool_parser.py | 2 +- .../reasoning/ernie_vl_reasoning_parsers.py | 2 +- .../reasoning/ernie_x1_reasoning_parsers.py | 67 ++++---- .../reasoning/qwen3_reasoning_parsers.py | 159 +++++++++++------- 4 files changed, 134 insertions(+), 96 deletions(-) diff --git a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py index a22ed9a0a34..662ac7d1060 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py @@ -58,7 +58,7 @@ def __init__(self, tokenizer): self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token) self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) if self.tool_call_start_token_id is None or self.tool_call_end_token_id is None: - raise RuntimeError("Ernie x1 Tool parser could not locate tool call start/end " "tokens in the tokenizer!") + raise RuntimeError("Ernie x1 Tool parser could not locate tool call start/end tokens in the tokenizer!") if not self.model_tokenizer: raise ValueError( diff --git a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py index fe44fd47e82..89ad7bd274b 100644 --- a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py @@ -53,7 +53,7 @@ def __init__(self, tokenizer): if missing_tokens: raise RuntimeError( - f"Could not find the following token ids in tokenizer vocabulary: {', '.join(missing_tokens)}" + f"ernie vl reasoning parser could not find the following token ids in tokenizer vocabulary: {', '.join(missing_tokens)}" ) self.token_status_mapping = { self.think_start_token_id: "think_start", diff --git a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py index f8f33b3035d..517ae61e192 100644 --- a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py @@ -54,11 +54,11 @@ def __init__(self, tokenizer): token_id = self.vocab.get(token_value) setattr(self, f"{name}_id", token_id) if token_id is None: - 
missing_tokens.append(f"{name.replace('_', ' ')} token") + missing_tokens.append(token_value) if missing_tokens: raise RuntimeError( - f"Could not find the following token ids in tokenizer vocabulary: {', '.join(missing_tokens)}" + f"ernie x1 reasoning parser could not find the following token ids in tokenizer vocabulary: {', '.join(missing_tokens)}" ) self.token_status_mapping = { @@ -106,22 +106,33 @@ def extract_reasoning_content_streaming( return None if model_status == "think_start": - if self.think_end_token_id not in current_token_ids: - return DeltaMessage(reasoning_content=delta_text) - else: + if self.think_end_token_id in delta_token_ids: + reasoning_content = "" + response_content = "" + end_index = delta_text.find(self.think_end_token) + reasoning_content = delta_text[:end_index] + response_start_pos = delta_text.find(self.response_start_token) + if response_start_pos != -1: + response_content = self._extract_response_content( + delta_text[response_start_pos + len(self.response_start_token) :] + ) + return DeltaMessage(reasoning_content=reasoning_content, content=response_content) + elif self.think_end_token_id in previous_token_ids: if ( - self.response_start_token_id in current_token_ids - and self.response_end_token_id not in current_token_ids + self.response_start_token_id in previous_token_ids + and self.response_end_token_id not in previous_token_ids ): return DeltaMessage(content=delta_text) + else: + return DeltaMessage(reasoning_content=delta_text) elif model_status == "think_end": if ( - self.response_start_token_id in current_token_ids + self.response_start_token_id in previous_token_ids and self.response_end_token_id not in current_token_ids ): return DeltaMessage(content=delta_text) elif model_status == "response_start": - if self.response_end_token_id not in current_token_ids: + if self.response_end_token_id not in previous_token_ids: return DeltaMessage(content=delta_text) return None @@ -130,33 +141,29 @@ def extract_reasoning_content( self, model_output: str, request: ChatCompletionRequest, model_status: str ) -> Tuple[str, str]: """ - Optimized batch version of the enhanced parser. - Preserves newlines in both reasoning and response content, - only removing the single newline before closing tags. 
+ 优化版解析器。保留推理和响应内容中的换行符, + 仅删除闭合标签前的单个换行符。 """ reasoning_content = "" response_content = "" - if model_status == "think_start": - think_end_pos = model_output.find(self.think_end_token) - if think_end_pos != -1: - reasoning_content = model_output[:think_end_pos] - remaining = model_output[think_end_pos + len(self.think_end_token) :].lstrip("\n") - - # Determine if remaining content is a response or tool call - if remaining.startswith(self.response_start_token): - response_start_len = len(self.response_start_token) - response_content = self._extract_response_content(remaining[response_start_len:]) - elif remaining.startswith(self.tool_call_start_token): - pass # No response content + if model_status in ["think_start", "think_end"]: + if model_status == "think_start": + think_end_pos = model_output.find(self.think_end_token) + if think_end_pos != -1: + reasoning_content = model_output[:think_end_pos] + remaining = model_output[think_end_pos + len(self.think_end_token) :].lstrip("\n") + else: + reasoning_content = model_output + remaining = "" else: - reasoning_content = model_output + remaining = model_output.lstrip("\n") - elif model_status == "think_end": - remaining = model_output.lstrip("\n") - if remaining.startswith(self.response_start_token): - response_start_len = len(self.response_start_token) - response_content = self._extract_response_content(remaining[response_start_len:]) + response_start_pos = remaining.find(self.response_start_token) + if response_start_pos != -1: + response_content = self._extract_response_content( + remaining[response_start_pos + len(self.response_start_token) :] + ) elif model_status == "response_start": response_content = self._extract_response_content(model_output) diff --git a/fastdeploy/reasoning/qwen3_reasoning_parsers.py b/fastdeploy/reasoning/qwen3_reasoning_parsers.py index 24c72a53a4e..b01cdf0d692 100644 --- a/fastdeploy/reasoning/qwen3_reasoning_parsers.py +++ b/fastdeploy/reasoning/qwen3_reasoning_parsers.py @@ -35,24 +35,49 @@ class Qwen3ReasoningParser(ReasoningParser): def __init__(self, tokenizer): super().__init__(tokenizer) - self.think_start_token = "" - self.think_end_token = "" + + # 定义所有需要检查的token + token_definitions = { + "think_start_token": "", + "think_end_token": "", + } if not self.model_tokenizer: - raise ValueError( - "The model tokenizer must be passed to the ReasoningParser " "constructor during construction." 
+ raise ValueError("The model tokenizer must be passed to the ReasoningParser constructor.") + + missing_tokens = [] + for name, token_value in token_definitions.items(): + setattr(self, name, token_value) + token_id = self.vocab.get(token_value) + setattr(self, f"{name}_id", token_id) + if token_id is None: + missing_tokens.append(token_value) + + if missing_tokens: + raise RuntimeError( + f"Qwen3 reasoning parser could not find the following token ids in tokenizer vocabulary: {', '.join(missing_tokens)}" ) - - self.think_start_token_id = self.vocab.get(self.think_start_token) - self.think_end_token_id = self.vocab.get(self.think_end_token) - if self.think_end_token_id is None: - raise RuntimeError("Qwen3 reasoning parser could not locate think end " "tokens in the tokenizer!") + self.token_status_mapping = { + self.think_start_token_id: "think_start", + self.think_end_token_id: "think_end", + } def is_reasoning_end(self, input_ids: list[int]) -> bool: return self.think_end_token_id in input_ids + def find_last_special_token(self, prompt_token_ids: list[int]) -> int: + for i in range(len(prompt_token_ids) - 1, -1, -1): + if prompt_token_ids[i] in self.token_status_mapping: + return prompt_token_ids[i] + return -1 + def get_model_status(self, prompt_token_ids: list[int]): - return "think_start" + special_token_id = self.find_last_special_token(prompt_token_ids) + + if special_token_id == -1: + return "think_start" + + return self.token_status_mapping[special_token_id] def extract_reasoning_content_streaming( self, @@ -75,36 +100,39 @@ def extract_reasoning_content_streaming( if len(delta_token_ids) == 1 and (delta_token_ids[0] in [self.think_start_token_id, self.think_end_token_id]): return None - # in delta - if self.think_end_token_id in delta_token_ids: - # in delta, in delta, extract reasoning content - if self.think_start_token_id in delta_token_ids: + if model_status == "think_start": + # in delta + if self.think_end_token_id in delta_token_ids: + # in delta, in delta, extract reasoning content + if self.think_start_token_id in delta_token_ids: + start_index = delta_text.find(self.think_start_token) + end_index = delta_token_ids.find(self.think_end_token) + reasoning_content = delta_text[start_index + len(self.think_start_token) : end_index] + content = delta_text[end_index + len(self.think_end_token) :] + return DeltaMessage(reasoning_content=reasoning_content, content=content) + # in previous, in delta, + else: + end_index = delta_text.find(self.think_end_token) + reasoning_content = delta_text[:end_index] + content = delta_text[end_index + len(self.think_end_token) :] + content = content if content else None + return DeltaMessage(reasoning_content=reasoning_content, content=content) + # in previous reasoning content continues + elif self.think_end_token_id in previous_token_ids: + return DeltaMessage(content=delta_text) + # in previous + elif self.think_start_token_id in previous_token_ids: + return DeltaMessage(reasoning_content=delta_text) + # in delta + elif self.think_start_token_id in delta_token_ids: start_index = delta_text.find(self.think_start_token) - end_index = delta_token_ids.find(self.think_end_token) - reasoning_content = delta_text[start_index + len(self.think_start_token) : end_index] - content = delta_text[end_index + len(self.think_end_token) :] + reasoning_content = delta_text[start_index + len(self.think_start_token) :] + content = "" return DeltaMessage(reasoning_content=reasoning_content, content=content) - # in previous, in delta, else: - end_index = 
delta_text.find(self.think_end_token) - reasoning_content = delta_text[:end_index] - content = delta_text[end_index + len(self.think_end_token) :] - content = content if content else None - return DeltaMessage(reasoning_content=reasoning_content, content=content) - # in previous reasoning content continues - elif self.think_end_token_id in previous_token_ids: - return DeltaMessage(content=delta_text) - # in previous - elif self.think_start_token_id in previous_token_ids: - return DeltaMessage(reasoning_content=delta_text) - # in delta - elif self.think_start_token_id in delta_token_ids: - start_index = delta_text.find(self.think_start_token) - reasoning_content = delta_text[start_index + len(self.think_start_token) :] - content = "" - return DeltaMessage(reasoning_content=reasoning_content, content=content) + return DeltaMessage(reasoning_content=delta_text) else: - return DeltaMessage(reasoning_content=delta_text) + return DeltaMessage(content=delta_text) def extract_reasoning_content( self, model_output: str, request: ChatCompletionRequest, model_status: str @@ -120,36 +148,39 @@ def extract_reasoning_content( tuple[Optional[str], Optional[str]]: reasoning content and content """ - # 检查是否包含结束标签 - if self.think_end_token not in model_output: - return None, model_output - - # 检查是否有起始标签 - if self.think_start_token in model_output: - # 标准格式:contentanswer - if self.think_start_token not in model_output or self.think_end_token not in model_output: - return None, model_output - # Check if the is present in the model output, remove it - # if it is present. - model_output_parts = model_output.partition(self.think_start_token) - model_output = model_output_parts[2] if model_output_parts[1] else model_output_parts[0] - # Check if the model output contains the tokens. - # If the end token is not found, return the model output as is. + if model_status == "think_start": + # 检查是否包含结束标签 if self.think_end_token not in model_output: return None, model_output - # Extract reasoning content from the model output. - reasoning_content, _, content = model_output.partition(self.think_end_token) - - final_content = content or None - return reasoning_content, final_content - else: - # 缺少起始标签的格式:contentanswer - parts = model_output.split(self.think_end_token, 1) - - if len(parts) == 2: - reasoning_content = parts[0].strip() - final_content = parts[1].strip() if parts[1].strip() else None + # 检查是否有起始标签 + if self.think_start_token in model_output: + # 标准格式:contentanswer + if self.think_start_token not in model_output or self.think_end_token not in model_output: + return None, model_output + # Check if the is present in the model output, remove it + # if it is present. + model_output_parts = model_output.partition(self.think_start_token) + model_output = model_output_parts[2] if model_output_parts[1] else model_output_parts[0] + # Check if the model output contains the tokens. + # If the end token is not found, return the model output as is. + if self.think_end_token not in model_output: + return None, model_output + + # Extract reasoning content from the model output. 
+ reasoning_content, _, content = model_output.partition(self.think_end_token) + + final_content = content or None return reasoning_content, final_content + else: + # 缺少起始标签的格式:contentanswer + parts = model_output.split(self.think_end_token, 1) - return None, model_output + if len(parts) == 2: + reasoning_content = parts[0].strip() + final_content = parts[1].strip() if parts[1].strip() else None + return reasoning_content, final_content + + return None, model_output + else: + return None, model_output From dd3011079ebd101946d509c1815d4f806f642afc Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Mon, 29 Sep 2025 00:33:09 +0800 Subject: [PATCH 15/32] fix unit test --- fastdeploy/input/ernie4_5_processor.py | 10 ++-- .../reasoning/ernie_x1_reasoning_parsers.py | 14 ++---- tests/e2e/test_EB_VL_Lite_serving.py | 4 +- .../openai/test_max_streaming_tokens.py | 2 +- .../openai/test_response_processors.py | 8 ++-- .../tool_parsers/test_ernie_x1_tool_parser.py | 21 --------- tests/input/test_ernie_processor.py | 1 + tests/reasoning/test_reasoning_parser.py | 47 ++++++++++++------- 8 files changed, 48 insertions(+), 59 deletions(-) diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py index c8018e9aa04..b75d2c4fbe1 100644 --- a/fastdeploy/input/ernie4_5_processor.py +++ b/fastdeploy/input/ernie4_5_processor.py @@ -262,7 +262,9 @@ def process_response(self, response_dict, **kwargs): full_text = self.tokenizer.decode(token_ids) if self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict, self.model_status_dict[req_id] + full_text, + response_dict, + self.model_status_dict.get(req_id), ) response_dict.outputs.text = text response_dict.outputs.reasoning_content = reasoning_content @@ -318,7 +320,9 @@ def process_response_dict_normal(self, response_dict, **kwargs): response_dict["outputs"]["text"] = full_text if self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict, self.model_status_dict[req_id] + full_text, + response_dict, + self.model_status_dict.get(req_id), ) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content @@ -362,7 +366,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): previous_token_ids, previous_token_ids + token_ids, token_ids, - self.model_status_dict[req_id], + self.model_status_dict.get(req_id), ) response_dict["outputs"]["delta_message"] = reasoning_delta_message if self.tool_parser_obj: diff --git a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py index 0e73e7eb128..0ab2f26f094 100644 --- a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py @@ -98,22 +98,16 @@ def extract_reasoning_content_streaming( delta_text[response_start_pos + len(self.response_start_token) :] ) return DeltaMessage(reasoning_content=reasoning_content, content=response_content) - elif self.think_end_token_id in previous_token_ids: - if ( - self.response_start_token_id in previous_token_ids - and self.response_end_token_id not in previous_token_ids - ): + elif self.think_end_token in previous_text: + if self.response_start_token in previous_text and self.response_end_token not in previous_text: return DeltaMessage(content=delta_text) else: return DeltaMessage(reasoning_content=delta_text) elif model_status == "think_end": - if ( - 
self.response_start_token_id in previous_token_ids - and self.response_end_token_id not in current_token_ids - ): + if self.response_start_token in previous_text and self.response_end_token not in previous_text: return DeltaMessage(content=delta_text) elif model_status == "response_start": - if self.response_end_token_id not in previous_token_ids: + if self.response_end_token not in previous_text: return DeltaMessage(content=delta_text) return None diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py index 41dd81a0972..e116e8bb9e0 100644 --- a/tests/e2e/test_EB_VL_Lite_serving.py +++ b/tests/e2e/test_EB_VL_Lite_serving.py @@ -532,7 +532,7 @@ def test_chat_with_thinking(openai_client, capsys): max_tokens=10, extra_body={"chat_template_kwargs": {"enable_thinking": False}}, ) - assert response.choices[0].message.reasoning_content is None + assert response.choices[0].message.reasoning_content == "" assert "" not in response.choices[0].message.content # test logic @@ -703,4 +703,4 @@ def test_thinking_logic_flag(openai_client, capsys): "chat_template_kwargs": {"enable_thinking": False}, }, ) - assert response_case_3.choices[0].message.reasoning_content is None + assert response_case_3.choices[0].message.reasoning_content == "" diff --git a/tests/entrypoints/openai/test_max_streaming_tokens.py b/tests/entrypoints/openai/test_max_streaming_tokens.py index 61d5f88d45a..0c8a3f8d223 100644 --- a/tests/entrypoints/openai/test_max_streaming_tokens.py +++ b/tests/entrypoints/openai/test_max_streaming_tokens.py @@ -141,7 +141,7 @@ async def test_integration_with_chat_stream_generator(self, mock_processor_class mock_processor_instance = Mock() - async def mock_process_response_chat_single(response, stream, enable_thinking, include_stop_str_in_output): + async def mock_process_response_chat_single(response, stream, include_stop_str_in_output): yield response mock_processor_instance.process_response_chat = mock_process_response_chat_single diff --git a/tests/entrypoints/openai/test_response_processors.py b/tests/entrypoints/openai/test_response_processors.py index afab163b97e..34cade7cd82 100644 --- a/tests/entrypoints/openai/test_response_processors.py +++ b/tests/entrypoints/openai/test_response_processors.py @@ -48,7 +48,7 @@ async def test_text_only_mode(self): results = [ r async for r in processor.process_response_chat( - request_outputs, stream=False, enable_thinking=False, include_stop_str_in_output=False + request_outputs, stream=False, include_stop_str_in_output=False ) ] @@ -67,7 +67,7 @@ async def test_streaming_text_and_image(self): results = [ r async for r in self.processor_mm.process_response_chat( - request_outputs, stream=True, enable_thinking=False, include_stop_str_in_output=False + request_outputs, stream=True, include_stop_str_in_output=False ) ] @@ -94,7 +94,7 @@ async def test_streaming_buffer_accumulation(self): results = [ r async for r in self.processor_mm.process_response_chat( - request_outputs, stream=True, enable_thinking=False, include_stop_str_in_output=False + request_outputs, stream=True, include_stop_str_in_output=False ) ] @@ -112,7 +112,7 @@ async def test_non_streaming_accumulate_and_emit(self): results = [ r async for r in self.processor_mm.process_response_chat( - request_outputs, stream=False, enable_thinking=False, include_stop_str_in_output=False + request_outputs, stream=False, include_stop_str_in_output=False ) ] diff --git a/tests/entrypoints/openai/tool_parsers/test_ernie_x1_tool_parser.py 
b/tests/entrypoints/openai/tool_parsers/test_ernie_x1_tool_parser.py index e818801d935..1b8b58d1e95 100644 --- a/tests/entrypoints/openai/tool_parsers/test_ernie_x1_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_ernie_x1_tool_parser.py @@ -52,33 +52,12 @@ def test_extract_tool_calls_complete(self): self.assertTrue(result.tools_called) self.assertEqual(result.tool_calls[0].function.name, "get_weather") - def test_extract_tool_calls_partial_arguments(self): - """Test partial extraction when arguments incomplete""" - output = '{"name": "get_weather", "arguments": {"location": "北"' - result = self.parser.extract_tool_calls(output, self.dummy_request) - self.assertFalse(result.tools_called) - self.assertEqual(result.tool_calls[0].function.name, "get_weather") - - def test_extract_tool_calls_invalid_response_before_toolcall(self): - """Test case where before is invalid""" - output = 'hello{"name": "get_weather", "arguments": {}}' - result = self.parser.extract_tool_calls(output, self.dummy_request) - self.assertFalse(result.tools_called) - self.assertIn("", result.content) - def test_extract_tool_calls_no_toolcall(self): """Test when no tool_call tags are present""" output = "no tool call here" result = self.parser.extract_tool_calls(output, self.dummy_request) self.assertFalse(result.tools_called) - def test_extract_tool_calls_invalid_json(self): - """Test tool_call with badly formatted JSON triggers fallback parser""" - output = '"name": "get_weather", "arguments": {' - result = self.parser.extract_tool_calls(output, self.dummy_request) - self.assertFalse(result.tools_called) - self.assertEqual(result.tool_calls[0].function.name, "get_weather") - def test_extract_tool_calls_exception(self): """Force exception to cover error branch""" with patch( diff --git a/tests/input/test_ernie_processor.py b/tests/input/test_ernie_processor.py index b2357eeaa86..506c396fd06 100644 --- a/tests/input/test_ernie_processor.py +++ b/tests/input/test_ernie_processor.py @@ -19,6 +19,7 @@ def setUp(self): self.processor.tool_parser_dict = {} self.processor.generation_config = MagicMock() self.processor.eos_token_ids = [1] + self.processor.reasoning_parser = None # 模拟 ids2tokens 方法 def mock_ids2tokens(token_ids, task_id): diff --git a/tests/reasoning/test_reasoning_parser.py b/tests/reasoning/test_reasoning_parser.py index 90a48c89909..1fa9a35386e 100644 --- a/tests/reasoning/test_reasoning_parser.py +++ b/tests/reasoning/test_reasoning_parser.py @@ -27,10 +27,11 @@ class DummyTokenizer: def __init__(self): self.vocab = { "": 100, - "": 101, - "": 102, - "": 103, - "": 104, + "": 101, + "": 102, + "": 103, + "": 104, + "": 105, } def get_vocab(self): @@ -137,6 +138,7 @@ def test_streaming_thinking_content(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[200], + model_status="think_start", ) self.assertEqual(msg.reasoning_content, "a") @@ -148,6 +150,7 @@ def test_streaming_thinking_newline_preserved(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[201], + model_status="think_start", ) self.assertEqual(msg.reasoning_content, "\n") @@ -159,6 +162,7 @@ def test_streaming_thinking_end_tag(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[self.parser.think_end_token_id], + model_status="think_start", ) self.assertIsNone(msg) @@ -170,6 +174,7 @@ def test_streaming_response_content(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[202], + model_status="think_start", ) self.assertEqual(msg.content, "h") @@ -181,6 +186,7 @@ def 
test_streaming_response_newline_preserved(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[203], + model_status="think_start", ) self.assertEqual(msg.content, "\n") @@ -193,6 +199,7 @@ def test_streaming_response_ignore_tags(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[self.parser.vocab[""]], + model_status="think_start", ) ) @@ -203,6 +210,7 @@ def test_streaming_response_ignore_tags(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[204], + model_status="think_start", ) self.assertIsInstance(msg, DeltaMessage) self.assertEqual(msg.content, "\n") @@ -215,6 +223,7 @@ def test_streaming_response_ignore_tags(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[self.parser.vocab[""]], + model_status="think_start", ) ) @@ -226,39 +235,41 @@ def test_streaming_tool_call(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[self.parser.vocab[""]], + model_status="think_start", ) + print(msg) self.assertIsNone(msg) # ---- Batch parsing ---- def test_batch_reasoning_and_response(self): text = "abc\n\nhello\nworld" - reasoning, response = self.parser.extract_reasoning_content(text, self.request) + reasoning, response = self.parser.extract_reasoning_content(text, self.request, "think_start") self.assertEqual(reasoning, "abc\n") self.assertEqual(response, "hello\nworld") def test_batch_reasoning_and_tool_call(self): text = "abccall_here" - reasoning, response = self.parser.extract_reasoning_content(text, self.request) + reasoning, response = self.parser.extract_reasoning_content(text, self.request, "think_start") self.assertEqual(reasoning, "abc") self.assertEqual(response, "") def test_batch_no_thinking_tag(self): text = "no_thinking_here" - reasoning, response = self.parser.extract_reasoning_content(text, self.request) + reasoning, response = self.parser.extract_reasoning_content(text, self.request, "think_start") self.assertEqual(reasoning, "no_thinking_here") self.assertEqual(response, "") - def test_batch_response_without_end_tag(self): - text = "abcpartial response" - reasoning, response = self.parser.extract_reasoning_content(text, self.request) - self.assertEqual(reasoning, "abc") - self.assertEqual(response, "partial response") - - def test_batch_preserve_all_newlines(self): - text = "abc\n\nline1\nline2\n" - reasoning, response = self.parser.extract_reasoning_content(text, self.request) - self.assertEqual(reasoning, "abc\n") - self.assertEqual(response, "line1\nline2\n") + # def test_batch_response_without_end_tag(self): + # text = "abcpartial response" + # reasoning, response = self.parser.extract_reasoning_content(text, self.request, "think_start") + # self.assertEqual(reasoning, "abc") + # self.assertEqual(response, "partial response") + + # def test_batch_preserve_all_newlines(self): + # text = "abc\n\nline1\nline2\n" + # reasoning, response = self.parser.extract_reasoning_content(text, self.request, "think_start") + # self.assertEqual(reasoning, "abc\n") + # self.assertEqual(response, "line1\nline2\n") if __name__ == "__main__": From 31d639dbb8ea48cbecd969e9d73cad9d707b2b1f Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Mon, 29 Sep 2025 10:28:08 +0800 Subject: [PATCH 16/32] fix unit test --- tests/input/test_ernie_processor.py | 2 ++ tests/input/test_text_processor.py | 1 + 2 files changed, 3 insertions(+) diff --git a/tests/input/test_ernie_processor.py b/tests/input/test_ernie_processor.py index 506c396fd06..7bab78e667d 100644 --- a/tests/input/test_ernie_processor.py +++ 
b/tests/input/test_ernie_processor.py @@ -20,6 +20,7 @@ def setUp(self): self.processor.generation_config = MagicMock() self.processor.eos_token_ids = [1] self.processor.reasoning_parser = None + self.processor.model_status_dict = {} # 模拟 ids2tokens 方法 def mock_ids2tokens(token_ids, task_id): @@ -66,6 +67,7 @@ def test_process_response_dict_streaming_normal_case(self): def test_process_request_dict(self): request_dict = { + "request_id": "123", "messages": [{"role": "user", "content": "Hello!"}], "chat_template_kwargs": {"chat_template": "Hello!"}, "eos_token_ids": [1], diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py index 6ca0178fe89..45dfb2c2a18 100644 --- a/tests/input/test_text_processor.py +++ b/tests/input/test_text_processor.py @@ -20,6 +20,7 @@ def setUp(self): self.processor.tool_parser_dict = {} self.processor.generation_config = MagicMock() self.processor.eos_token_ids = [1] + self.processor.reasoning_parser = None def mock_messages2ids(request, **kwargs): if "chat_template" in kwargs: From 46e3c13883d8a71592d3f0ef34a5476e233fd291 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Mon, 29 Sep 2025 15:43:29 +0800 Subject: [PATCH 17/32] add unit test --- .../reasoning/ernie_vl_reasoning_parsers.py | 4 +- tests/input/test_ernie_processor.py | 19 ++- tests/input/test_text_processor.py | 9 +- tests/reasoning/test_reasoning_parser.py | 22 +-- tests/reasoning/test_vl_reasoning_parser.py | 135 ++++++++++++++++++ 5 files changed, 172 insertions(+), 17 deletions(-) create mode 100644 tests/reasoning/test_vl_reasoning_parser.py diff --git a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py index 89ad7bd274b..5daaa986ce8 100644 --- a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py @@ -100,9 +100,9 @@ def extract_reasoning_content_streaming( return None if model_status == "think_start": if self.think_end_token_id in delta_token_ids: - end_index = delta_text.find(self.end_token) + end_index = delta_text.find(self.think_end_token) reasoning_content = delta_text[:end_index] - content = delta_text[end_index + len(self.end_token) :] + content = delta_text[end_index + len(self.think_end_token) :] return DeltaMessage(reasoning_content=reasoning_content, content=content) elif self.think_end_token_id in previous_token_ids: return DeltaMessage(content=delta_text) diff --git a/tests/input/test_ernie_processor.py b/tests/input/test_ernie_processor.py index 7bab78e667d..75da4786bd9 100644 --- a/tests/input/test_ernie_processor.py +++ b/tests/input/test_ernie_processor.py @@ -4,6 +4,11 @@ from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor +class MockReasoningParser: + def get_model_status(self, prompt_token_ids): + return "think_start" + + class TestErnie4_5ProcessorProcessResponseDictStreaming(unittest.TestCase): def setUp(self): # 创建 Ernie4_5Processor 实例的模拟对象 @@ -14,13 +19,13 @@ def setUp(self): # 设置必要的属性 self.processor.tokenizer = MagicMock() self.processor.tokenizer.eos_token_id = 1 - self.processor.decode_status = {} + self.processor.decode_status = {"test": []} self.processor.reasoning_end_dict = {} self.processor.tool_parser_dict = {} self.processor.generation_config = MagicMock() self.processor.eos_token_ids = [1] - self.processor.reasoning_parser = None - self.processor.model_status_dict = {} + self.processor.reasoning_parser = MockReasoningParser() + self.processor.model_status_dict = {"test": "think_start"} # 模拟 
ids2tokens 方法 def mock_ids2tokens(token_ids, task_id): @@ -65,6 +70,14 @@ def test_process_response_dict_streaming_normal_case(self): # 验证结果 self.assertEqual(result["outputs"]["raw_prediction"], "delta_text") + response_dict = {"finished": True, "request_id": "test", "outputs": {"token_ids": [4, 5]}} + + # 调用方法 + result = self.processor.process_response_dict_streaming(response_dict) + + # 验证结果 + self.assertEqual(result["outputs"]["raw_prediction"], "delta_text") + def test_process_request_dict(self): request_dict = { "request_id": "123", diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py index 45dfb2c2a18..337ad0a0d34 100644 --- a/tests/input/test_text_processor.py +++ b/tests/input/test_text_processor.py @@ -5,6 +5,11 @@ from fastdeploy.input.text_processor import DataProcessor +class MockReasoningParser: + def get_model_status(self, prompt_token_ids): + return "think_start" + + class TestDataProcessorProcess(unittest.TestCase): def setUp(self): # 创建 DataProcessor 实例的模拟对象 @@ -20,7 +25,8 @@ def setUp(self): self.processor.tool_parser_dict = {} self.processor.generation_config = MagicMock() self.processor.eos_token_ids = [1] - self.processor.reasoning_parser = None + self.processor.reasoning_parser = MockReasoningParser() + self.processor.model_status_dict = {} def mock_messages2ids(request, **kwargs): if "chat_template" in kwargs: @@ -50,6 +56,7 @@ def test_process_request(self): def test_process_request_dict(self): request_dict = { + "request_id": "123", "messages": [{"role": "user", "content": "Hello!"}], "chat_template_kwargs": {"chat_template": "Hello!"}, "eos_token_ids": [1], diff --git a/tests/reasoning/test_reasoning_parser.py b/tests/reasoning/test_reasoning_parser.py index 1fa9a35386e..4b938a7a250 100644 --- a/tests/reasoning/test_reasoning_parser.py +++ b/tests/reasoning/test_reasoning_parser.py @@ -259,17 +259,17 @@ def test_batch_no_thinking_tag(self): self.assertEqual(reasoning, "no_thinking_here") self.assertEqual(response, "") - # def test_batch_response_without_end_tag(self): - # text = "abcpartial response" - # reasoning, response = self.parser.extract_reasoning_content(text, self.request, "think_start") - # self.assertEqual(reasoning, "abc") - # self.assertEqual(response, "partial response") - - # def test_batch_preserve_all_newlines(self): - # text = "abc\n\nline1\nline2\n" - # reasoning, response = self.parser.extract_reasoning_content(text, self.request, "think_start") - # self.assertEqual(reasoning, "abc\n") - # self.assertEqual(response, "line1\nline2\n") + def test_batch_response_without_end_tag(self): + text = "abcpartial response" + reasoning, response = self.parser.extract_reasoning_content(text, self.request, "think_start") + self.assertEqual(reasoning, "abc") + self.assertEqual(response, "partial response") + + def test_batch_preserve_all_newlines(self): + text = "abc\n\nline1\nline2\n" + reasoning, response = self.parser.extract_reasoning_content(text, self.request, "think_start") + self.assertEqual(reasoning, "abc\n") + self.assertEqual(response, "line1\nline2\n") if __name__ == "__main__": diff --git a/tests/reasoning/test_vl_reasoning_parser.py b/tests/reasoning/test_vl_reasoning_parser.py new file mode 100644 index 00000000000..7eaa5fb4f89 --- /dev/null +++ b/tests/reasoning/test_vl_reasoning_parser.py @@ -0,0 +1,135 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import unittest + +from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest +from fastdeploy.reasoning.ernie_vl_reasoning_parsers import ErnieVLReasoningParser + + +class MockTokenizer: + """Minimal tokenizer with vocab for testing.""" + + def __init__(self): + self.vocab = { + "": 100, + "": 101, + } + + def get_vocab(self): + """Return vocab dict for testing.""" + return self.vocab + + +class TestErnieVLReasoningParser(unittest.TestCase): + def setUp(self): + self.parser = ErnieVLReasoningParser(MockTokenizer()) + self.request = ChatCompletionRequest(model="test", messages=[{"role": "user", "content": "test message"}]) + self.tokenizer = MockTokenizer() + + def test_get_model_status(self): + status = self.parser.get_model_status([1, 2, 100]) + self.assertEqual(status, "think_start") + status = self.parser.get_model_status([1, 2, 101]) + self.assertEqual(status, "think_end") + status = self.parser.get_model_status([1]) + self.assertEqual(status, "think_start") + + def test_streaming_thinking_content(self): + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="a", + delta_text="a", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[200], + model_status="think_start", + ) + self.assertEqual(msg.reasoning_content, "a") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="ab", + delta_text="ab", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 101, 102], + model_status="think_start", + ) + self.assertEqual(msg.reasoning_content, "a") + self.assertEqual(msg.content, "b") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="a", + current_text="ab", + delta_text="b", + previous_token_ids=[1, 101], + current_token_ids=[], + delta_token_ids=[102], + model_status="think_start", + ) + self.assertEqual(msg.content, "b") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="a", + delta_text="a", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + model_status="think_start", + ) + self.assertEqual(msg.reasoning_content, "a") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="a", + delta_text="a", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[200], + model_status="think_end", + ) + self.assertEqual(msg.content, "a") + + def test_none_streaming_thinking_content(self): + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="a", + request={}, + model_status="think_start", + ) + self.assertEqual(reasoning_content, "a") + self.assertEqual(content, "") + + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="ab", + request={}, + model_status="think_start", + ) + self.assertEqual(reasoning_content, "a") + self.assertEqual(content, "b") + + reasoning_content, content = self.parser.extract_reasoning_content( + 
model_output="a", + request={}, + model_status="think_end", + ) + self.assertEqual(reasoning_content, "") + self.assertEqual(content, "a") + + +if __name__ == "__main__": + unittest.main() From d159f27d7ac84cf25df45a503c416602c5a6f28c Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Mon, 20 Oct 2025 10:25:37 +0800 Subject: [PATCH 18/32] fix --- fastdeploy/input/ernie4_5_processor.py | 8 +++++--- fastdeploy/input/text_processor.py | 6 ++++-- tests/input/test_ernie_processor.py | 8 -------- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py index 7964211f604..5743846c49c 100644 --- a/fastdeploy/input/ernie4_5_processor.py +++ b/fastdeploy/input/ernie4_5_processor.py @@ -154,7 +154,8 @@ def process_request(self, request, max_model_len=None, **kwargs): request.set("top_p", _SAMPLING_EPS) if self.reasoning_parser: real_req_id = request.request_id.split("_")[0] - if real_req_id in self.model_status_dict: + model_status = self.model_status_dict.get(real_req_id) + if model_status is None: model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) self.model_status_dict[real_req_id] = model_status request.enable_thinking = model_status == "think_start" @@ -236,7 +237,8 @@ def process_request_dict(self, request, max_model_len=None): request["top_p"] = _SAMPLING_EPS if self.reasoning_parser: real_req_id = request["request_id"].split("_")[0] - if real_req_id not in self.model_status_dict: + model_status = self.model_status_dict.get(real_req_id) + if model_status is None: model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) self.model_status_dict[real_req_id] = model_status request["enable_thinking"] = model_status == "think_start" @@ -357,7 +359,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): if token_ids[-1] == self.tokenizer.eos_token_id: token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) - response_dict["outputs"]["raw_prediction"] = delta_text + response_dict["outputs"]["completion_tokens"] = delta_text if self.reasoning_parser: reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming( previous_texts, diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index d33453cb36a..d7bf9766e24 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -270,7 +270,8 @@ def process_request(self, request, max_model_len=None, **kwargs): request.set("top_p", _SAMPLING_EPS) if self.reasoning_parser: real_req_id = request.request_id.split("_")[0] - if real_req_id in self.model_status_dict: + model_status = self.model_status_dict.get(real_req_id) + if model_status is None: model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) self.model_status_dict[real_req_id] = model_status request.enable_thinking = model_status == "think_start" @@ -350,7 +351,8 @@ def process_request_dict(self, request, max_model_len=None, **kwargs): request["top_p"] = _SAMPLING_EPS if self.reasoning_parser: real_req_id = request["request_id"].split("_")[0] - if real_req_id not in self.model_status_dict: + model_status = self.model_status_dict.get(real_req_id) + if model_status is None: model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) self.model_status_dict[real_req_id] = model_status request["enable_thinking"] = model_status == "think_start" diff --git 
a/tests/input/test_ernie_processor.py b/tests/input/test_ernie_processor.py index 75da4786bd9..381a819cc21 100644 --- a/tests/input/test_ernie_processor.py +++ b/tests/input/test_ernie_processor.py @@ -70,14 +70,6 @@ def test_process_response_dict_streaming_normal_case(self): # 验证结果 self.assertEqual(result["outputs"]["raw_prediction"], "delta_text") - response_dict = {"finished": True, "request_id": "test", "outputs": {"token_ids": [4, 5]}} - - # 调用方法 - result = self.processor.process_response_dict_streaming(response_dict) - - # 验证结果 - self.assertEqual(result["outputs"]["raw_prediction"], "delta_text") - def test_process_request_dict(self): request_dict = { "request_id": "123", From 21a8d598c713956b85ee3cf790217042e55574f4 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Mon, 20 Oct 2025 14:35:01 +0800 Subject: [PATCH 19/32] fix --- fastdeploy/input/ernie4_5_processor.py | 4 ++-- .../ernie4_5_vl_processor/ernie4_5_vl_processor.py | 4 ++-- .../input/qwen_vl_processor/qwen_vl_processor.py | 7 +++++++ fastdeploy/input/text_processor.py | 10 ++++++---- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py index 72d2b069c4e..a58fb4a9057 100644 --- a/fastdeploy/input/ernie4_5_processor.py +++ b/fastdeploy/input/ernie4_5_processor.py @@ -324,7 +324,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): reasoning_content, text = self.reasoning_parser.extract_reasoning_content( full_text, response_dict, - self.model_status_dict.get(req_id), + self.model_status_dict.get(req_id.split("_")[0]), ) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content @@ -368,7 +368,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): previous_token_ids, previous_token_ids + token_ids, token_ids, - self.model_status_dict.get(req_id), + self.model_status_dict.get(req_id.split("_")[0]), ) response_dict["outputs"]["delta_message"] = reasoning_delta_message if self.tool_parser_obj: diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py index 30237e94cfc..befbd491bed 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py +++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py @@ -261,11 +261,11 @@ def process_request_dict(self, request, max_model_len=None): if self.reasoning_parser: real_req_id = request["request_id"].split("_")[0] - if real_req_id not in self.model_status_dict: + model_status = self.model_status_dict.get(real_req_id) + if model_status is None: model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) self.model_status_dict[real_req_id] = model_status request["enable_thinking"] = model_status == "think_start" - return request def append_completion_tokens(self, multimodal_inputs, completion_token_ids): diff --git a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py index 00856ec01fd..ee0b57b6a63 100644 --- a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py +++ b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py @@ -270,6 +270,13 @@ def process_request_dict(self, request, max_model_len=None): # Set default max_tokens if not specified if request.get("max_tokens") is None: request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"])) # Ensure at least 1 token + if 
self.reasoning_parser: + real_req_id = request["request_id"].split("_")[0] + model_status = self.model_status_dict.get(real_req_id) + if model_status is None: + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + self.model_status_dict[real_req_id] = model_status + request["enable_thinking"] = model_status == "think_start" data_processor_logger.info(f"Processed request {request}") return request diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index a7920f05248..bc56c1974f1 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -382,7 +382,7 @@ def process_response(self, response_dict, **kwargs): response_dict.outputs.text = full_text if self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict, self.model_status_dict[req_id.split("_")[0]] + full_text, response_dict, self.model_status_dict.get(req_id.split("_")[0]) ) response_dict.outputs.text = text response_dict.outputs.reasoning_content = reasoning_content @@ -421,7 +421,9 @@ def process_response_dict_normal(self, response_dict, **kwargs): response_dict["outputs"]["text"] = full_text if self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict, self.model_status_dict[req_id.split("_")[0]] + full_text, + response_dict, + self.model_status_dict.get(req_id.split("_")[0]), ) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content @@ -462,7 +464,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): previous_token_ids, previous_token_ids + token_ids, token_ids, - self.model_status_dict[req_id.split("_")[0]], + self.model_status_dict.get(req_id.split("_")[0]), ) response_dict["outputs"]["delta_message"] = reasoning_delta_message if self.tool_parser_obj: @@ -486,7 +488,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): del self.decode_status[req_id] if req_id in self.tool_parser_dict: del self.tool_parser_dict[req_id] - if req_id in self.model_status_dict: + if req_id.split("_")[0] in self.model_status_dict: del self.model_status_dict[req_id.split("_")[0]] return response_dict From 4a2908bfac6f4e31d8ef2d9e4fd35407ff3da86b Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Mon, 20 Oct 2025 19:23:09 +0800 Subject: [PATCH 20/32] add unit test --- .../reasoning/ernie_vl_reasoning_parsers.py | 3 +- .../reasoning/ernie_x1_reasoning_parsers.py | 4 +- tests/input/test_ernie_vl_processor.py | 94 ++++++++++++++ .../reasoning/test_qwen3_reasoning_parser.py | 119 ++++++++++++++++++ 4 files changed, 216 insertions(+), 4 deletions(-) create mode 100644 tests/input/test_ernie_vl_processor.py create mode 100644 tests/reasoning/test_qwen3_reasoning_parser.py diff --git a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py index 873e043cacc..cafffbb8b08 100644 --- a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py @@ -126,11 +126,10 @@ def extract_reasoning_content( Returns: tuple[Optional[str], Optional[str]]: reasoning content and content """ - # Check if the model output contains the tokens. 
if model_status == "think_start": if self.think_end_token not in model_output: - return model_output, "" + return "", model_output reasoning_content, _, content = model_output.partition(self.think_end_token) final_content = content or "" return reasoning_content, final_content diff --git a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py index 313e2b0cc9e..a341f6a1c81 100644 --- a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py @@ -131,8 +131,8 @@ def extract_reasoning_content( reasoning_content = model_output[:think_end_pos] remaining = model_output[think_end_pos + len(self.think_end_token) :].lstrip("\n") else: - reasoning_content = model_output - remaining = "" + reasoning_content = "" + remaining = model_output else: remaining = model_output.lstrip("\n") diff --git a/tests/input/test_ernie_vl_processor.py b/tests/input/test_ernie_vl_processor.py new file mode 100644 index 00000000000..e0c8ea35d63 --- /dev/null +++ b/tests/input/test_ernie_vl_processor.py @@ -0,0 +1,94 @@ +import unittest +from unittest.mock import MagicMock, patch + +import numpy as np + +from fastdeploy.input.ernie4_5_vl_processor import Ernie4_5_VLProcessor + + +class MockReasoningParser: + def get_model_status(self, prompt_token_ids): + return "think_start" + + +class TestErnie4_5VLProcessorProcessResponseDictStreaming(unittest.TestCase): + def setUp(self): + # 创建 Ernie4_5_VLProcessor 实例的模拟对象 + with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None) as mock_init: + self.processor = Ernie4_5_VLProcessor("model_path") + mock_init.side_effect = lambda *args, **kwargs: print(f"__init__ called with {args}, {kwargs}") + + # 设置必要的属性 + self.processor.tokenizer = MagicMock() + self.processor.tokenizer.eos_token_id = 1 + self.processor.decode_status = {"test": []} + self.processor.reasoning_end_dict = {} + self.processor.tool_parser_dict = {} + self.processor.generation_config = MagicMock() + self.processor.eos_token_ids = [1] + self.processor.reasoning_parser = MockReasoningParser() + self.processor.model_status_dict = {"test": "think_start"} + self.processor.ernie4_5_processor = MagicMock() + + # 模拟 ids2tokens 方法 + def mock_ids2tokens(token_ids, task_id): + return "delta_text", [2, 3], "previous_texts" + + self.processor.ids2tokens = mock_ids2tokens + + def mock_request2ids(request, **kwargs): + return {"input_ids": np.array([1, 2, 3]), "prompt_token_ids": [0]} + + def mock_check_mm_limits(item): + pass + + def mock_apply_default_parameters(request): + return request + + def mock_pack_outputs(outputs): + return outputs + + self.processor._apply_default_parameters = mock_apply_default_parameters + self.processor._check_mm_limits = mock_check_mm_limits + self.processor.ernie4_5_processor.request2ids = mock_request2ids + self.processor.pack_outputs = mock_pack_outputs + + # 模拟推理解析器 + self.mock_reasoning_parser = MagicMock() + self.mock_reasoning_parser.extract_reasoning_content_streaming.return_value = ("reasoning", "text") + self.processor.reasoning_parser = self.mock_reasoning_parser + + # 模拟工具解析器 + self.mock_tool_parser = MagicMock() + self.mock_tool_parser.extract_tool_calls_streaming.return_value = None + self.mock_tool_parser_obj = MagicMock() + self.mock_tool_parser_obj.return_value = self.mock_tool_parser + self.processor.tool_parser_obj = self.mock_tool_parser_obj + + def test_process_response_dict_streaming_normal_case(self): + """测试正常情况下的流式响应处理""" + # 准备输入 + response_dict = {"finished": 
False, "request_id": "req1", "outputs": {"token_ids": [4, 5]}} + kwargs = {"enable_thinking": True} + + # 调用方法 + result = self.processor.process_response_dict_streaming(response_dict, **kwargs) + + # 验证结果 + self.assertEqual(result["outputs"]["completion_tokens"], "delta_text") + + def test_process_request_dict(self): + request_dict = { + "request_id": "123", + "messages": [{"role": "user", "content": "Hello!"}], + "chat_template_kwargs": {"chat_template": "Hello!"}, + "eos_token_ids": [1], + "temperature": 1, + "top_p": 1, + } + result = self.processor.process_request_dict(request_dict, 100) + self.assertEqual(result["prompt_token_ids"], [1, 2, 3]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py new file mode 100644 index 00000000000..9cf3044478b --- /dev/null +++ b/tests/reasoning/test_qwen3_reasoning_parser.py @@ -0,0 +1,119 @@ +import unittest + +from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest +from fastdeploy.reasoning.qwen3_reasoning_parsers import Qwen3ReasoningParser + + +class MockTokenizer: + """Minimal tokenizer with vocab for testing.""" + + def __init__(self): + self.vocab = { + "": 100, + "": 101, + } + + def get_vocab(self): + """Return vocab dict for testing.""" + return self.vocab + + +class TestQwen3ReasoningParser(unittest.TestCase): + def setUp(self): + self.parser = Qwen3ReasoningParser(MockTokenizer()) + self.request = ChatCompletionRequest(model="test", messages=[{"role": "user", "content": "test message"}]) + self.tokenizer = MockTokenizer() + + def test_get_model_status(self): + status = self.parser.get_model_status([1, 2, 100]) + self.assertEqual(status, "think_start") + status = self.parser.get_model_status([1, 2, 101]) + self.assertEqual(status, "think_end") + status = self.parser.get_model_status([1]) + self.assertEqual(status, "think_start") + + def test_streaming_thinking_content(self): + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="a", + delta_text="a", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[200], + model_status="think_start", + ) + self.assertEqual(msg.reasoning_content, "a") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="ab", + delta_text="ab", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[99, 101, 102], + model_status="think_start", + ) + self.assertEqual(msg.reasoning_content, "a") + self.assertEqual(msg.content, "b") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="a", + current_text="ab", + delta_text="b", + previous_token_ids=[1, 101], + current_token_ids=[], + delta_token_ids=[102], + model_status="think_start", + ) + self.assertEqual(msg.content, "b") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="a", + delta_text="a", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + model_status="think_start", + ) + self.assertEqual(msg.reasoning_content, "a") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="a", + delta_text="a", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[200], + model_status="think_end", + ) + self.assertEqual(msg.content, "a") + + def test_none_streaming_thinking_content(self): + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="a", + request={}, + 
model_status="think_start", + ) + self.assertEqual(reasoning_content, None) + self.assertEqual(content, "a") + + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="ab", + request={}, + model_status="think_start", + ) + self.assertEqual(reasoning_content, "a") + self.assertEqual(content, "b") + + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="a", + request={}, + model_status="think_end", + ) + self.assertEqual(reasoning_content, None) + self.assertEqual(content, "a") + + +if __name__ == "__main__": + unittest.main() From 59aaa2c46e0c353e5af0e8bce91847574d42d50f Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Tue, 21 Oct 2025 10:51:08 +0800 Subject: [PATCH 21/32] fix unit test --- fastdeploy/reasoning/ernie_x1_reasoning_parsers.py | 4 ++-- tests/reasoning/test_vl_reasoning_parser.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py index a341f6a1c81..313e2b0cc9e 100644 --- a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py @@ -131,8 +131,8 @@ def extract_reasoning_content( reasoning_content = model_output[:think_end_pos] remaining = model_output[think_end_pos + len(self.think_end_token) :].lstrip("\n") else: - reasoning_content = "" - remaining = model_output + reasoning_content = model_output + remaining = "" else: remaining = model_output.lstrip("\n") diff --git a/tests/reasoning/test_vl_reasoning_parser.py b/tests/reasoning/test_vl_reasoning_parser.py index 7eaa5fb4f89..f9a36dd952e 100644 --- a/tests/reasoning/test_vl_reasoning_parser.py +++ b/tests/reasoning/test_vl_reasoning_parser.py @@ -111,8 +111,8 @@ def test_none_streaming_thinking_content(self): request={}, model_status="think_start", ) - self.assertEqual(reasoning_content, "a") - self.assertEqual(content, "") + self.assertEqual(reasoning_content, "") + self.assertEqual(content, "a") reasoning_content, content = self.parser.extract_reasoning_content( model_output="ab", From 0e2019d1f423a7ee68cf094bc530579019227023 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Tue, 21 Oct 2025 23:08:45 +0800 Subject: [PATCH 22/32] add unit test --- .../reasoning/ernie_x1_reasoning_parsers.py | 3 +- .../reasoning/test_qwen3_reasoning_parser.py | 78 +++++++++++++ tests/reasoning/test_reasoning_parser.py | 105 +++++++++++++++++- 3 files changed, 183 insertions(+), 3 deletions(-) diff --git a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py index 313e2b0cc9e..81448043a7b 100644 --- a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py @@ -89,8 +89,7 @@ def extract_reasoning_content_streaming( return None if model_status == "think_start": - if self.think_end_token_id in delta_token_ids: - reasoning_content = "" + if self.think_end_token in delta_text: response_content = "" end_index = delta_text.find(self.think_end_token) reasoning_content = delta_text[:end_index] diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py index 9cf3044478b..42bd135287f 100644 --- a/tests/reasoning/test_qwen3_reasoning_parser.py +++ b/tests/reasoning/test_qwen3_reasoning_parser.py @@ -18,12 +18,30 @@ def get_vocab(self): return self.vocab +class MissingTokenTokenizer: + def __init__(self): + self.vocab = { + "": 100, + } + + def 
get_vocab(self): + """Return vocab dict for testing.""" + return self.vocab + + class TestQwen3ReasoningParser(unittest.TestCase): def setUp(self): self.parser = Qwen3ReasoningParser(MockTokenizer()) self.request = ChatCompletionRequest(model="test", messages=[{"role": "user", "content": "test message"}]) self.tokenizer = MockTokenizer() + def test_missing_token(self): + with self.assertRaises(RuntimeError) as context: + Qwen3ReasoningParser(MissingTokenTokenizer()) + exception_message = str(context.exception) + expected_message_part = "Qwen3 reasoning parser could not find the following token ids" + self.assertIn(expected_message_part, exception_message) + def test_get_model_status(self): status = self.parser.get_model_status([1, 2, 100]) self.assertEqual(status, "think_start") @@ -89,6 +107,42 @@ def test_streaming_thinking_content(self): ) self.assertEqual(msg.content, "a") + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hello", + current_text="hellohi", + delta_text="hi", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[101, 200], + model_status="think_start", + ) + self.assertEqual(msg.content, "hi") + self.assertEqual(msg.reasoning_content, "") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hello", + current_text="hellohi", + delta_text="hi", + previous_token_ids=[100], + current_token_ids=[], + delta_token_ids=[], + model_status="think_start", + ) + self.assertEqual(msg.content, None) + self.assertEqual(msg.reasoning_content, "hi") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hello", + current_text="hellohi", + delta_text="hi", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 200], + model_status="think_start", + ) + self.assertEqual(msg.content, "") + self.assertEqual(msg.reasoning_content, "hi") + def test_none_streaming_thinking_content(self): reasoning_content, content = self.parser.extract_reasoning_content( model_output="a", @@ -114,6 +168,30 @@ def test_none_streaming_thinking_content(self): self.assertEqual(reasoning_content, None) self.assertEqual(content, "a") + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="a", + request={}, + model_status="think_start", + ) + self.assertEqual(reasoning_content, None) + self.assertEqual(content, "a") + + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="ab", + request={}, + model_status="think_start", + ) + self.assertEqual(reasoning_content, "a") + self.assertEqual(content, "b") + + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="b", + request={}, + model_status="think_start", + ) + self.assertEqual(reasoning_content, "a") + self.assertEqual(content, "b") + if __name__ == "__main__": unittest.main() diff --git a/tests/reasoning/test_reasoning_parser.py b/tests/reasoning/test_reasoning_parser.py index 4b938a7a250..c68de416372 100644 --- a/tests/reasoning/test_reasoning_parser.py +++ b/tests/reasoning/test_reasoning_parser.py @@ -39,6 +39,20 @@ def get_vocab(self): return self.vocab +class MissingTokenTokenizer: + def __init__(self): + self.vocab = { + "": 100, + "": 101, + "": 102, + "": 103, + } + + def get_vocab(self): + """Return vocab dict for testing.""" + return self.vocab + + class TestReasoningParser(ReasoningParser): def is_reasoning_end(self, input_ids): """ @@ -129,6 +143,17 @@ def setUp(self): self.request = ChatCompletionRequest(model="test", messages=[{"role": "user", "content": "test 
message"}]) self.tokenizer = DummyTokenizer() + def test_missing_token(self): + with self.assertRaises(RuntimeError) as context: + ErnieX1ReasoningParser(MissingTokenTokenizer()) + exception_message = str(context.exception) + expected_message_part = "ernie x1 reasoning parser could not find the following token ids" + self.assertIn(expected_message_part, exception_message) + + def test_get_model_status(self): + model_status = self.parser.get_model_status([88, 99, 104]) + self.assertEqual(model_status, "response_start") + # ---- Streaming parsing ---- def test_streaming_thinking_content(self): msg = self.parser.extract_reasoning_content_streaming( @@ -227,6 +252,78 @@ def test_streaming_response_ignore_tags(self): ) ) + def test_extract_reasoning_content_streaming(self): + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hello", + current_text="hello", + delta_text="", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 200], + model_status="think_start", + ) + self.assertEqual(msg.content, "") + self.assertEqual(msg.reasoning_content, "") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hello", + current_text="hellohi", + delta_text="hi", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 200], + model_status="think_start", + ) + self.assertEqual(msg.content, "hi") + self.assertEqual(msg.reasoning_content, "") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="hellohi", + delta_text="hellohi", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 200], + model_status="think_start", + ) + self.assertEqual(msg.content, "hi") + self.assertEqual(msg.reasoning_content, "hello") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hello", + current_text="hellohi", + delta_text="hi", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 200], + model_status="think_end", + ) + self.assertEqual(msg.content, "hi") + self.assertEqual(msg.reasoning_content, None) + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hello", + current_text="hellohi", + delta_text="hi", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 200], + model_status="response_start", + ) + self.assertEqual(msg.content, "hi") + self.assertEqual(msg.reasoning_content, None) + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hellohi", + current_text="hellohiend", + delta_text="end", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 200], + model_status="response_start", + ) + self.assertEqual(msg, None) + def test_streaming_tool_call(self): msg = self.parser.extract_reasoning_content_streaming( previous_text="", @@ -237,7 +334,6 @@ def test_streaming_tool_call(self): delta_token_ids=[self.parser.vocab[""]], model_status="think_start", ) - print(msg) self.assertIsNone(msg) # ---- Batch parsing ---- @@ -271,6 +367,13 @@ def test_batch_preserve_all_newlines(self): self.assertEqual(reasoning, "abc\n") self.assertEqual(response, "line1\nline2\n") + def test_extract_reasoning_content(self): + reasoning_content, response_content = self.parser.extract_reasoning_content( + model_output="hello", request=self.request, model_status="response_start" + ) + self.assertEqual(reasoning_content, "") + self.assertEqual(response_content, "hello") + if __name__ == "__main__": unittest.main() From f0def038abd7ba2d90e459a96313e993ab4f5521 Mon Sep 17 00:00:00 2001 From: 
luukunn <981429396@qq.com> Date: Wed, 22 Oct 2025 00:38:20 +0800 Subject: [PATCH 23/32] add unit test --- tests/reasoning/test_qwen3_reasoning_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py index 42bd135287f..cde56601608 100644 --- a/tests/reasoning/test_qwen3_reasoning_parser.py +++ b/tests/reasoning/test_qwen3_reasoning_parser.py @@ -189,7 +189,7 @@ def test_none_streaming_thinking_content(self): request={}, model_status="think_start", ) - self.assertEqual(reasoning_content, "a") + self.assertEqual(reasoning_content, "") self.assertEqual(content, "b") From ea2d987f3ad92328c285395e56cc0f483b2b3066 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Wed, 22 Oct 2025 11:17:16 +0800 Subject: [PATCH 24/32] fix unit test --- tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index 6acefb1334f..a1e4c235fb6 100644 --- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -513,7 +513,7 @@ def test_chat_with_thinking(openai_client, capsys): max_tokens=10, extra_body={"chat_template_kwargs": {"enable_thinking": False}}, ) - assert response.choices[0].message.reasoning_content is None + assert response.choices[0].message.reasoning_content == "" assert "" not in response.choices[0].message.content # test logic From b8794cb21354c795077cf49bc5f77568c1ab55d4 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Wed, 22 Oct 2025 11:54:02 +0800 Subject: [PATCH 25/32] fix unit test --- tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index a1e4c235fb6..d93ad3dbc0d 100644 --- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -988,4 +988,4 @@ def test_thinking_logic_flag(openai_client, capsys): "chat_template_kwargs": {"enable_thinking": False}, }, ) - assert response_case_3.choices[0].message.reasoning_content is None + assert response_case_3.choices[0].message.reasoning_content == "" From 37b320e7155164f2852dc8a760fc6128f27fb9f4 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Wed, 22 Oct 2025 17:15:45 +0800 Subject: [PATCH 26/32] fix bug --- fastdeploy/input/ernie4_5_processor.py | 34 +++++++++---------- .../ernie4_5_vl_processor.py | 8 ++--- .../qwen_vl_processor/qwen_vl_processor.py | 8 ++--- fastdeploy/input/text_processor.py | 30 ++++++++-------- 4 files changed, 40 insertions(+), 40 deletions(-) diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py index a58fb4a9057..13bc8e085ef 100644 --- a/fastdeploy/input/ernie4_5_processor.py +++ b/fastdeploy/input/ernie4_5_processor.py @@ -154,10 +154,10 @@ def process_request(self, request, max_model_len=None, **kwargs): request.set("top_p", _SAMPLING_EPS) if self.reasoning_parser: real_req_id = request.request_id.split("_")[0] - model_status = self.model_status_dict.get(real_req_id) - if model_status is None: - model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) - self.model_status_dict[real_req_id] = model_status + n = request.get("n", 1) + model_status = 
self.reasoning_parser.get_model_status(request.prompt_token_ids) + for idx in range(n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status request.enable_thinking = model_status == "think_start" data_processor_logger.info(f"Processed request: {request}") @@ -237,10 +237,10 @@ def process_request_dict(self, request, max_model_len=None): request["top_p"] = _SAMPLING_EPS if self.reasoning_parser: real_req_id = request["request_id"].split("_")[0] - model_status = self.model_status_dict.get(real_req_id) - if model_status is None: - model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) - self.model_status_dict[real_req_id] = model_status + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + n = request.get("n", 1) + for idx in range(n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status request["enable_thinking"] = model_status == "think_start" data_processor_logger.info(f"Processed request dict: {request}") return request @@ -266,7 +266,7 @@ def process_response(self, response_dict, **kwargs): reasoning_content, text = self.reasoning_parser.extract_reasoning_content( full_text, response_dict, - self.model_status_dict[req_id.split("_")[0]], + self.model_status_dict[req_id], ) response_dict.outputs.text = text response_dict.outputs.reasoning_content = reasoning_content @@ -278,8 +278,8 @@ def process_response(self, response_dict, **kwargs): if tool_call_info.tools_called: response_dict.outputs.tool_calls = tool_call_info.tool_calls response_dict.outputs.text = tool_call_info.content - if req_id.split("_")[0] in self.model_status_dict: - del self.model_status_dict[req_id.split("_")[0]] + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}") if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "": return None @@ -324,7 +324,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): reasoning_content, text = self.reasoning_parser.extract_reasoning_content( full_text, response_dict, - self.model_status_dict.get(req_id.split("_")[0]), + self.model_status_dict[req_id], ) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content @@ -337,8 +337,8 @@ def process_response_dict_normal(self, response_dict, **kwargs): response_dict["outputs"]["completion_tokens"] = full_text data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") del self.decode_status[req_id] - if req_id.split("_")[0] in self.model_status_dict: - del self.model_status_dict[req_id.split("_")[0]] + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] return response_dict def process_response_dict_streaming(self, response_dict, **kwargs): @@ -368,7 +368,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): previous_token_ids, previous_token_ids + token_ids, token_ids, - self.model_status_dict.get(req_id.split("_")[0]), + self.model_status_dict[req_id], ) response_dict["outputs"]["delta_message"] = reasoning_delta_message if self.tool_parser_obj: @@ -392,8 +392,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs): del self.decode_status[req_id] if req_id in self.tool_parser_dict: del self.tool_parser_dict[req_id] - if req_id.split("_")[0] in self.model_status_dict: - del self.model_status_dict[req_id.split("_")[0]] + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] 
return response_dict def messages2ids(self, request_or_messages, **kwargs): diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py index befbd491bed..7cb1c553857 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py +++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py @@ -261,10 +261,10 @@ def process_request_dict(self, request, max_model_len=None): if self.reasoning_parser: real_req_id = request["request_id"].split("_")[0] - model_status = self.model_status_dict.get(real_req_id) - if model_status is None: - model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) - self.model_status_dict[real_req_id] = model_status + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + n = request.get("n", 1) + for idx in range(n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status request["enable_thinking"] = model_status == "think_start" return request diff --git a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py index ee0b57b6a63..0c9edc23f79 100644 --- a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py +++ b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py @@ -272,10 +272,10 @@ def process_request_dict(self, request, max_model_len=None): request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"])) # Ensure at least 1 token if self.reasoning_parser: real_req_id = request["request_id"].split("_")[0] - model_status = self.model_status_dict.get(real_req_id) - if model_status is None: - model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) - self.model_status_dict[real_req_id] = model_status + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + n = request.get("n", 1) + for idx in range(n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status request["enable_thinking"] = model_status == "think_start" data_processor_logger.info(f"Processed request {request}") diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index bc56c1974f1..cc8e041cd83 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -270,10 +270,10 @@ def process_request(self, request, max_model_len=None, **kwargs): request.set("top_p", _SAMPLING_EPS) if self.reasoning_parser: real_req_id = request.request_id.split("_")[0] - model_status = self.model_status_dict.get(real_req_id) - if model_status is None: - model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) - self.model_status_dict[real_req_id] = model_status + n = request.get("n", 1) + model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) + for idx in range(n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status request.enable_thinking = model_status == "think_start" data_processor_logger.info(f"Processed request: {request}") @@ -351,10 +351,10 @@ def process_request_dict(self, request, max_model_len=None, **kwargs): request["top_p"] = _SAMPLING_EPS if self.reasoning_parser: real_req_id = request["request_id"].split("_")[0] - model_status = self.model_status_dict.get(real_req_id) - if model_status is None: - model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) - self.model_status_dict[real_req_id] = model_status + model_status = 
self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + n = request.get("n", 1) + for idx in range(n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status request["enable_thinking"] = model_status == "think_start" data_processor_logger.info(f"Processed request dict: {request}") @@ -382,7 +382,7 @@ def process_response(self, response_dict, **kwargs): response_dict.outputs.text = full_text if self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict, self.model_status_dict.get(req_id.split("_")[0]) + full_text, response_dict, self.model_status_dict[req_id] ) response_dict.outputs.text = text response_dict.outputs.reasoning_content = reasoning_content @@ -392,8 +392,8 @@ def process_response(self, response_dict, **kwargs): if tool_call_info.tools_called: response_dict.outputs.tool_calls = tool_call_info.tool_calls response_dict.outputs.text = tool_call_info.content - if req_id.split("_")[0] in self.model_status_dict: - del self.model_status_dict[req_id.split("_")[0]] + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}") return response_dict @@ -423,7 +423,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): reasoning_content, text = self.reasoning_parser.extract_reasoning_content( full_text, response_dict, - self.model_status_dict.get(req_id.split("_")[0]), + self.model_status_dict[req_id], ) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content @@ -464,7 +464,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): previous_token_ids, previous_token_ids + token_ids, token_ids, - self.model_status_dict.get(req_id.split("_")[0]), + self.model_status_dict[req_id], ) response_dict["outputs"]["delta_message"] = reasoning_delta_message if self.tool_parser_obj: @@ -488,8 +488,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs): del self.decode_status[req_id] if req_id in self.tool_parser_dict: del self.tool_parser_dict[req_id] - if req_id.split("_")[0] in self.model_status_dict: - del self.model_status_dict[req_id.split("_")[0]] + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] return response_dict def process_response_dict(self, response_dict, **kwargs): From 34ac21a5ef2a7a7e53c7cf69397ed25cd5db08f5 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Wed, 22 Oct 2025 19:28:55 +0800 Subject: [PATCH 27/32] fix unit test --- fastdeploy/input/text_processor.py | 2 ++ tests/input/test_ernie_processor.py | 2 +- tests/input/test_ernie_vl_processor.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index cc8e041cd83..6c245fa36df 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -435,6 +435,8 @@ def process_response_dict_normal(self, response_dict, **kwargs): response_dict["outputs"]["text"] = tool_call_info.content data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") del self.decode_status[req_id] + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] return response_dict def process_response_dict_streaming(self, response_dict, **kwargs): diff --git a/tests/input/test_ernie_processor.py b/tests/input/test_ernie_processor.py index 2ede666351c..2d6b9e60bf0 100644 --- 
a/tests/input/test_ernie_processor.py
+++ b/tests/input/test_ernie_processor.py
@@ -61,7 +61,7 @@ def test_process_response_dict_streaming_normal_case(self):
         """测试正常情况下的流式响应处理"""
         # 准备输入
-        response_dict = {"finished": False, "request_id": "req1", "outputs": {"token_ids": [4, 5]}}
+        response_dict = {"finished": False, "request_id": "test", "outputs": {"token_ids": [4, 5]}}
         kwargs = {"enable_thinking": True}
 
         # 调用方法
diff --git a/tests/input/test_ernie_vl_processor.py b/tests/input/test_ernie_vl_processor.py
index e0c8ea35d63..1414439c49a 100644
--- a/tests/input/test_ernie_vl_processor.py
+++ b/tests/input/test_ernie_vl_processor.py
@@ -68,7 +68,7 @@ def mock_pack_outputs(outputs):
     def test_process_response_dict_streaming_normal_case(self):
         """测试正常情况下的流式响应处理"""
         # 准备输入
-        response_dict = {"finished": False, "request_id": "req1", "outputs": {"token_ids": [4, 5]}}
+        response_dict = {"finished": False, "request_id": "test", "outputs": {"token_ids": [4, 5]}}
         kwargs = {"enable_thinking": True}
 
         # 调用方法

From 1cb6205f78315eade8a44cf0b715da24edc5d615 Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Wed, 22 Oct 2025 19:52:04 +0800
Subject: [PATCH 28/32] x1 tool parser

---
 .../tool_parsers/ernie_x1_tool_parser.py      | 172 +++++++++++++++---
 1 file changed, 148 insertions(+), 24 deletions(-)

diff --git a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py
index ec3ff9ce146..14a784f174e 100644
--- a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py
+++ b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py
@@ -16,10 +16,18 @@
 import json
 import re
+import uuid
 from collections.abc import Sequence
 from typing import Union
 
-from fastdeploy.entrypoints.chat_utils import random_tool_call_id
+import partial_json_parser
+
+
+def random_tool_call_id() -> str:
+    """Generate a random tool call ID"""
+    return f"chatcmpl-tool-{str(uuid.uuid4().hex)}"
+
+
 from fastdeploy.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     DeltaFunctionCall,
@@ -55,12 +63,12 @@ def __init__(self, tokenizer):
         self.tool_call_start_token: str = "<tool_call>"
         self.tool_call_end_token: str = "</tool_call>"
 
-        self.tool_call_regex = re.compile(r"<tool_call>(.*?)</tool_call>|<tool_call>(.*)", re.DOTALL)
-
         self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
         self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
         if self.tool_call_start_token_id is None or self.tool_call_end_token_id is None:
-            raise RuntimeError("Ernie x1 Tool parser could not locate tool call start/end tokens in the tokenizer!")
+            raise RuntimeError(
+                "Ernie X1 Tool parser could not locate tool call start/end " "tokens in the tokenizer!"
+            )
 
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ReasoningParser " "constructor during construction."
             )
 
@@ -80,27 +88,143 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest)
         """
         try:
-            if self.tool_call_start_token not in model_output:
-                return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output)
-            function_call_tuples = self.tool_call_regex.findall(model_output)
-
-            raw_function_calls = [json.loads(match[0] if match[0] else match[1]) for match in function_call_tuples]
-
-            tool_calls = [
-                ToolCall(
-                    type="function",
-                    function=FunctionCall(
-                        name=function_call["name"],
-                        # function call args are JSON but as a string
-                        arguments=json.dumps(function_call["arguments"], ensure_ascii=False),
-                    ),
+            tool_calls = []
+
+            # Check for invalid tags before tool calls
+            if re.search(r"<response>[\s\S]*?</response>\s*(?=<tool_call>)", model_output):
+                data_processor_logger.error("Invalid format: <response> tags found before <tool_call>")
+                return ExtractedToolCallInformation(tools_called=False, content=model_output)
+
+            function_call_arr = []
+            remaining_text = model_output
+
+            while True:
+                # Find the next <tool_call>
+                tool_call_pos = remaining_text.find("<tool_call>")
+                if tool_call_pos == -1:
+                    break
+
+                # Extract content after <tool_call>
+                tool_content_start = tool_call_pos + len("<tool_call>")
+                tool_content_end = remaining_text.find("</tool_call>", tool_content_start)
+
+                tool_json = ""
+                if tool_content_end == -1:
+                    # Processing unclosed tool_call block (truncated case)
+                    tool_json = remaining_text[tool_content_start:].strip()
+                    remaining_text = ""  # No more content to process
+                else:
+                    # Processing closed block
+                    tool_json = remaining_text[tool_content_start:tool_content_end].strip()
+                    remaining_text = remaining_text[tool_content_end + len("</tool_call>") :]
+
+                if not tool_json:
+                    continue
+
+                # Process tool_json
+                tool_json = tool_json.strip()
+                if not tool_json.startswith("{"):
+                    tool_json = "{" + tool_json
+                if not tool_json.endswith("}"):
+                    tool_json = tool_json + "}"
+
+                try:
+                    # Parsing strategy: First try standard json.loads
+                    try:
+                        tool_data = json.loads(tool_json)
+
+                        if isinstance(tool_data, dict) and "name" in tool_data and "arguments" in tool_data:
+                            function_call_arr.append(
+                                {
+                                    "name": tool_data["name"],
+                                    "arguments": tool_data["arguments"],
+                                    "_is_complete": True,  # Mark as complete
+                                }
+                            )
+                            continue
+                    except json.JSONDecodeError:
+                        pass
+
+                    # Try partial_json_parser when standard parsing fails
+                    from partial_json_parser.core.options import Allow
+
+                    try:
+                        tool_data = {}
+                        flags = Allow.ALL & ~Allow.STR
+
+                        # Parse the name field
+                        name_match = re.search(r'"name"\s*:\s*"([^"]*)"', tool_json)
+                        if name_match:
+                            tool_data["name"] = name_match.group(1)
+
+                        # Parse the arguments field
+                        args_match = re.search(r'"arguments"\s*:\s*(\{.*)', tool_json)
+                        if args_match:
+                            try:
+                                tool_data["arguments"] = partial_json_parser.loads(args_match.group(1), flags=flags)
+                            except:
+                                tool_data["arguments"] = None
+
+                        if isinstance(tool_data, dict):
+                            function_call_arr.append(
+                                {
+                                    "name": tool_data.get("name", ""),
+                                    "arguments": tool_data.get("arguments", {}),
+                                    "_is_partial": True,  # Mark as partial
+                                }
+                            )
+                    except Exception as e:
+                        data_processor_logger.debug(f"Failed to parse tool call: {str(e)}")
+                        continue
+                except Exception as e:
+                    data_processor_logger.debug(f"Failed to parse tool call: {str(e)}")
+                    continue
+
+            if not function_call_arr:
+                data_processor_logger.error("No valid tool calls found")
+                return ExtractedToolCallInformation(tools_called=False, content=model_output)
+
+            tool_calls = []
+            all_complete = True  # Initialize as all complete
+
+            for tool_call in function_call_arr:
+                # 
Set flags + is_complete = tool_call.get("_is_complete", False) + is_partial = tool_call.get("_is_partial", False) + + # If any tool call is incomplete or partial, mark all_complete as False + if not is_complete or is_partial: + all_complete = False + + # Process arguments + tool_args = tool_call.get("arguments", {}) + if not isinstance(tool_args, dict): + tool_args = {} + + try: + args_str = json.dumps(tool_args, ensure_ascii=False) if tool_args else "{}" + except: + args_str = "{}" + + tool_calls.append( + ToolCall( + type="function", + id=random_tool_call_id(), + function=FunctionCall( + name=tool_call.get("name", ""), + arguments=args_str, + ), + ) ) - for function_call in raw_function_calls - ] - return ExtractedToolCallInformation(tools_called=True, tool_calls=tool_calls, content="") - except Exception: - data_processor_logger.error("Error in extracting tool call from response.") - return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) + + # Only return tools_called=True if all tool calls are complete + return ExtractedToolCallInformation( + tools_called=all_complete, tool_calls=tool_calls if tool_calls else None, content="" + ) + + except Exception as e: + data_processor_logger.error(f"Error in extracting tool call from response: {str(e)}") + return ExtractedToolCallInformation(tools_called=False, tool_calls=None, content=model_output) def extract_tool_calls_streaming( self, From 4ef4df1adebb2e2b1bd97558f165dfa292d73a3b Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Mon, 3 Nov 2025 16:12:23 +0800 Subject: [PATCH 29/32] fix unit test --- tests/input/test_ernie_processor.py | 2 +- tests/input/test_ernie_vl_processor.py | 60 +------------------------- tests/input/test_text_processor.py | 2 +- 3 files changed, 4 insertions(+), 60 deletions(-) diff --git a/tests/input/test_ernie_processor.py b/tests/input/test_ernie_processor.py index 7d6afe83294..6f5fad89403 100644 --- a/tests/input/test_ernie_processor.py +++ b/tests/input/test_ernie_processor.py @@ -25,7 +25,7 @@ def setUp(self): self.processor.generation_config = MagicMock() self.processor.eos_token_ids = [1] self.processor.reasoning_parser = MockReasoningParser() - self.processor.model_status_dict = {} + self.processor.model_status_dict = {"request-id_0": "think_start", "test": "think_start"} # 模拟 ids2tokens 方法 def mock_ids2tokens(token_ids, task_id): diff --git a/tests/input/test_ernie_vl_processor.py b/tests/input/test_ernie_vl_processor.py index afe3bd7e500..facc8c30cfa 100644 --- a/tests/input/test_ernie_vl_processor.py +++ b/tests/input/test_ernie_vl_processor.py @@ -27,7 +27,7 @@ def setUp(self): self.processor.generation_config = MagicMock() self.processor.eos_token_ids = [1] self.processor.reasoning_parser = MockReasoningParser() - self.processor.model_status_dict = {} + self.processor.model_status_dict = {"test": "think_start"} self.processor.ernie4_5_processor = MagicMock() # 模拟 ids2tokens 方法 @@ -55,7 +55,7 @@ def mock_pack_outputs(outputs): # 模拟推理解析器 self.mock_reasoning_parser = MagicMock() - self.mock_reasoning_parser.extract_reasoning_content_streaming.return_value = ("reasoning", "text") + self.mock_reasoning_parser.extract_reasoning_content_streaming.return_value = None self.processor.reasoning_parser = self.mock_reasoning_parser # 模拟工具解析器 @@ -89,62 +89,6 @@ def test_process_request_dict(self): result = self.processor.process_request_dict(request_dict, 100) self.assertEqual(result["prompt_token_ids"], [1, 2, 3]) - def test_process_request_dict_with_options(self): - 
request_dict = { - "messages": [{"role": "user", "content": "Hello"}], - "prompt_token_ids": [1, 1, 1], - } - self.processor.process_request_dict(request_dict, 100) - self.assertEqual(request_dict["enable_thinking"], True) - - request_dict = { - "messages": [{"role": "user", "content": "Hello"}], - "chat_template_kwargs": {"enable_thinking": True}, - "prompt_token_ids": [1, 1, 1], - } - self.processor.process_request_dict(request_dict, 100) - self.assertEqual(request_dict["enable_thinking"], True) - - request_dict = { - "messages": [{"role": "user", "content": "Hello"}], - "chat_template_kwargs": {"enable_thinking": False}, - "prompt_token_ids": [1, 1, 1], - } - self.processor.process_request_dict(request_dict, 100) - self.assertEqual(request_dict["enable_thinking"], False) - - request_dict = { - "messages": [{"role": "user", "content": "Hello"}], - "chat_template_kwargs": {"options": {"thinking_mode": "open"}}, - "prompt_token_ids": [1, 1, 1], - } - self.processor.process_request_dict(request_dict, 100) - self.assertEqual(request_dict["enable_thinking"], True) - - request_dict = { - "messages": [{"role": "user", "content": "Hello"}], - "chat_template_kwargs": {"options": {"thinking_mode": "close"}}, - "prompt_token_ids": [1, 1, 1], - } - self.processor.process_request_dict(request_dict, 100) - self.assertEqual(request_dict["enable_thinking"], False) - - request_dict = { - "messages": [{"role": "user", "content": "Hello"}], - "chat_template_kwargs": {"options": {"thinking_mode": "false"}}, - "prompt_token_ids": [1, 1, 1], - } - self.processor.process_request_dict(request_dict, 100) - self.assertEqual(request_dict["enable_thinking"], False) - - request_dict = { - "messages": [{"role": "user", "content": "Hello"}], - "chat_template_kwargs": {"options": {"thinking_mode": "123"}}, - "prompt_token_ids": [1, 1, 1], - } - self.processor.process_request_dict(request_dict, 100) - self.assertEqual(request_dict["enable_thinking"], True) - if __name__ == "__main__": unittest.main() diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py index acf53eb72b9..b22b2d5a0ad 100644 --- a/tests/input/test_text_processor.py +++ b/tests/input/test_text_processor.py @@ -20,7 +20,7 @@ def setUp(self): self.processor.tool_parser_dict = {} self.processor.generation_config = MagicMock() self.processor.eos_token_ids = [1] - self.processor.model_status_dict = {} + self.processor.model_status_dict = {"request-id_0": "think_start"} self.processor.reasoning_parser = MagicMock() def mock_messages2ids(request, **kwargs): From 7c1781290d4cf2d0b04b90aec5b2e80eb15f8778 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Thu, 20 Nov 2025 19:35:52 +0800 Subject: [PATCH 30/32] fix unit test --- .../ernie_45_vl_thinking_reasoning_parser.py | 142 ++++++++++++------ tests/e2e/test_EB_VL_Lite_sot_serving.py | 4 +- .../entrypoints/openai/test_finish_reason.py | 6 +- .../openai/test_max_streaming_tokens.py | 2 +- tests/entrypoints/openai/test_serving_chat.py | 71 --------- tests/input/test_ernie4_5_processor.py | 1 + tests/reasoning/test_reasoning_parser.py | 102 ++++++++++++- 7 files changed, 196 insertions(+), 132 deletions(-) delete mode 100644 tests/entrypoints/openai/test_serving_chat.py diff --git a/fastdeploy/reasoning/ernie_45_vl_thinking_reasoning_parser.py b/fastdeploy/reasoning/ernie_45_vl_thinking_reasoning_parser.py index 939a0a4348b..fa394545802 100644 --- a/fastdeploy/reasoning/ernie_45_vl_thinking_reasoning_parser.py +++ 
b/fastdeploy/reasoning/ernie_45_vl_thinking_reasoning_parser.py @@ -35,25 +35,53 @@ class Ernie45VLThinkingReasoningParser(ReasoningParser): def __init__(self, tokenizer): super().__init__(tokenizer) - self.think_end_token = "" - self.tool_begin_token = "" + token_definitions = { + "think_start_token": "", + "think_end_token": "", + "tool_call_start_token": "", + "tool_call_end_token": "", + } if not self.model_tokenizer: raise ValueError( "The model tokenizer must be passed to the ReasoningParser " "constructor during construction." ) - - self.think_end_token_id = self.vocab.get(self.think_end_token) - self.tool_begin_token_id = self.vocab.get(self.tool_begin_token) - if self.tool_begin_token_id is None: - self.tool_begin_token_id = -1 - - if self.think_end_token_id is None: - raise RuntimeError("Test reasoning parser could not locate think end tokens in the tokenizer!") + missing_tokens = [] + for name, token_value in token_definitions.items(): + setattr(self, name, token_value) + token_id = self.vocab.get(token_value) + setattr(self, f"{name}_id", token_id) + if token_id is None: + missing_tokens.append(f"{name.replace('_', ' ')} token") + + if missing_tokens: + raise RuntimeError( + f"ernie vl reasoning parser could not find the following token ids in tokenizer vocabulary: {', '.join(missing_tokens)}" + ) + self.token_status_mapping = { + self.think_start_token_id: "think_start", + self.think_end_token_id: "think_end", + self.tool_call_start_token_id: "tool_call_start", + self.tool_call_end_token_id: "tool_call_end", + } def is_reasoning_end(self, input_ids: list[int]) -> bool: return self.think_end_token_id in input_ids + def find_last_special_token(self, prompt_token_ids: list[int]) -> int: + for i in range(len(prompt_token_ids) - 1, -1, -1): + if prompt_token_ids[i] in self.token_status_mapping: + return prompt_token_ids[i] + return -1 + + def get_model_status(self, prompt_token_ids: list[int]): + special_token_id = self.find_last_special_token(prompt_token_ids) + + if special_token_id == -1: + return "think_start" + + return self.token_status_mapping[special_token_id] + def extract_reasoning_content_streaming( self, previous_text: str, @@ -62,6 +90,7 @@ def extract_reasoning_content_streaming( previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], + model_status: str, ) -> Union[DeltaMessage, None]: """ Extract reasoning content from a delta message. 
@@ -71,36 +100,46 @@ def extract_reasoning_content_streaming( - 'abc' goes to reasoning_content - 'xyz' goes to content """ - if self.think_end_token not in current_text: - return DeltaMessage(reasoning_content=delta_text) - # Skip single special tokens - if len(delta_token_ids) == 1 and delta_token_ids[0] == self.think_end_token_id: - return None - if self._is_with_tool(current_text=current_text, current_token_ids=current_token_ids): + if model_status == "think_start": + if self.think_end_token not in current_text: + return DeltaMessage(reasoning_content=delta_text) + # Skip single special tokens + if len(delta_token_ids) == 1 and delta_token_ids[0] == self.think_end_token_id: + return None + if self._is_with_tool(current_text=current_text, current_token_ids=current_token_ids): + if self.think_end_token in delta_text: + think_begin = delta_text.find(self.think_end_token) + reasoning_content = delta_text[:think_begin] + return DeltaMessage(reasoning_content=reasoning_content) + return None if self.think_end_token in delta_text: - think_begin = delta_text.find(self.think_end_token) - reasoning_content = delta_text[:think_begin] - return DeltaMessage(reasoning_content=reasoning_content) + reasoning_content, _, content = delta_text.partition(self.think_end_token) + striped_content = content.strip("\n") + if len(striped_content) == 0: + return DeltaMessage(reasoning_content=reasoning_content) if reasoning_content else None + return ( + DeltaMessage(reasoning_content=reasoning_content, content=content) + if reasoning_content + else DeltaMessage(content=content) + ) + think_end = current_text.find(self.think_end_token) + len(self.think_end_token) + suffix = current_text[think_end:] + striped_suffix = suffix.strip("\n") + if len(striped_suffix) == 0: + return None + return DeltaMessage(content=delta_text) + elif model_status == "think_end": + if current_text.lstrip("\n").startswith(self.tool_call_start_token): + return None + return DeltaMessage(content=delta_text) + else: return None - if self.think_end_token in delta_text: - reasoning_content, _, content = delta_text.partition(self.think_end_token) - striped_content = content.strip("\n") - if len(striped_content) == 0: - return DeltaMessage(reasoning_content=reasoning_content) if reasoning_content else None - return ( - DeltaMessage(reasoning_content=reasoning_content, content=content) - if reasoning_content - else DeltaMessage(content=content) - ) - think_end = current_text.find(self.think_end_token) + len(self.think_end_token) - suffix = current_text[think_end:] - striped_suffix = suffix.strip("\n") - if len(striped_suffix) == 0: - return None - return DeltaMessage(content=delta_text) def extract_reasoning_content( - self, model_output: str, request: ChatCompletionRequest + self, + model_output: str, + request: ChatCompletionRequest, + model_status: str, ) -> tuple[Optional[str], Optional[str]]: """ Extract reasoning content from the model output. @@ -114,23 +153,30 @@ def extract_reasoning_content( """ # Check if the model output contains the tokens. 
- if self.think_end_token not in model_output: - return model_output, "" - reasoning_content, _, content = model_output.partition(self.think_end_token) - if self.tool_begin_token in content: - prefix, _, _ = content.partition(self.tool_begin_token) - prefix_strip = prefix.lstrip("\n") - if len(prefix_strip) > 0: - return reasoning_content, content - return reasoning_content, "" - return reasoning_content, content + if model_status == "think_start": + if self.think_end_token not in model_output: + return model_output, "" + reasoning_content, _, content = model_output.partition(self.think_end_token) + if self.tool_call_start_token in content: + prefix, _, _ = content.partition(self.tool_call_start_token) + prefix_strip = prefix.lstrip("\n") + if len(prefix_strip) > 0: + return reasoning_content, content + return reasoning_content, "" + return reasoning_content, content + elif model_status == "think_end": + if model_output.lstrip("\n").startswith(self.tool_call_start_token): + return "", "" + return "", model_output + else: + return "", "" def _is_with_tool(self, current_text: str, current_token_ids: Sequence[int]) -> bool: think_end_index = current_text.find(self.think_end_token) think_end = think_end_index + len(self.think_end_token) middle_str = current_text[think_end:] - if self.tool_begin_token_id in current_token_ids: - prefix, _, _ = middle_str.partition(self.tool_begin_token) + if self.tool_call_start_token_id in current_token_ids: + prefix, _, _ = middle_str.partition(self.tool_call_start_token) striped_prefix = prefix.strip("\n") if len(striped_prefix) > 0: return False diff --git a/tests/e2e/test_EB_VL_Lite_sot_serving.py b/tests/e2e/test_EB_VL_Lite_sot_serving.py index b2d8add1b0e..b21c99329a5 100644 --- a/tests/e2e/test_EB_VL_Lite_sot_serving.py +++ b/tests/e2e/test_EB_VL_Lite_sot_serving.py @@ -312,7 +312,7 @@ def test_chat_with_thinking(openai_client, capsys): max_tokens=10, extra_body={"chat_template_kwargs": {"enable_thinking": False}}, ) - assert response.choices[0].message.reasoning_content is None + assert response.choices[0].message.reasoning_content == "" assert "" not in response.choices[0].message.content # test logic @@ -404,4 +404,4 @@ def test_thinking_logic_flag(openai_client, capsys): "chat_template_kwargs": {"enable_thinking": False}, }, ) - assert response_case_3.choices[0].message.reasoning_content is None + assert response_case_3.choices[0].message.reasoning_content == "" diff --git a/tests/entrypoints/openai/test_finish_reason.py b/tests/entrypoints/openai/test_finish_reason.py index 4bdb3feefc8..d39cf917208 100644 --- a/tests/entrypoints/openai/test_finish_reason.py +++ b/tests/entrypoints/openai/test_finish_reason.py @@ -43,6 +43,8 @@ async def asyncSetUp(self): self.multi_modal_processor._check_mm_limits = Mock() self.multi_modal_processor.append_completion_tokens = Mock() self.multi_modal_processor.pack_outputs = lambda x: x + self.multi_modal_processor.reasoning_parser = None + self.multi_modal_processor.model_status_dict = {} self.engine_client = Mock() self.engine_client.connection_initialized = False @@ -242,7 +244,7 @@ async def test_chat_full_max_tokens(self, mock_data_logger, mock_processor_class mock_processor_instance = Mock() mock_processor_instance.enable_multimodal_content.return_value = True - async def mock_process_response_chat_async(response, stream, enable_thinking, include_stop_str_in_output): + async def mock_process_response_chat_async(response, stream, include_stop_str_in_output): yield response 
mock_processor_instance.process_response_chat = mock_process_response_chat_async @@ -423,7 +425,7 @@ async def test_chat_stream_max_tokens(self, mock_api_logger, mock_processor_clas mock_processor_instance = Mock() mock_processor_instance.enable_multimodal_content.return_value = False - async def mock_process_response_chat_async(response, stream, enable_thinking, include_stop_str_in_output): + async def mock_process_response_chat_async(response, stream, include_stop_str_in_output): if isinstance(response, list): for res in response: yield res diff --git a/tests/entrypoints/openai/test_max_streaming_tokens.py b/tests/entrypoints/openai/test_max_streaming_tokens.py index 3396c96431b..ab950e2b5ae 100644 --- a/tests/entrypoints/openai/test_max_streaming_tokens.py +++ b/tests/entrypoints/openai/test_max_streaming_tokens.py @@ -515,7 +515,7 @@ async def test_chat_stream_usage_fields(self, mock_response_processor, api_serve mock_processor_instance = Mock() - async def mock_process_response_chat(response, stream, enable_thinking, include_stop_str_in_output): + async def mock_process_response_chat(response, stream, include_stop_str_in_output): delta_msg_mock = Mock() delta_msg_mock.content = response["outputs"]["text"] if response["outputs"]["text"] == "a": diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py deleted file mode 100644 index 394a23f0f4e..00000000000 --- a/tests/entrypoints/openai/test_serving_chat.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import unittest -from unittest.mock import MagicMock - -from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest -from fastdeploy.entrypoints.openai.serving_chat import OpenAIServingChat - - -class TestOpenAIServingCompletion(unittest.TestCase): - - def setUp(self): - """ - Set up the test environment by creating an instance of the OpenAIServingChat class using Mock. 
- """ - self.mock_engine = MagicMock() - self.chat_completion_handler = OpenAIServingChat( - self.mock_engine, - models=None, - pid=123, - ips=None, - max_waiting_time=10, - chat_template=None, - ) - - def test_enable_thinking(self): - request = ChatCompletionRequest(messages=[], chat_template_kwargs={}) - enable_thinking = self.chat_completion_handler._get_thinking_status(request) - self.assertEqual(enable_thinking, None) - - request = ChatCompletionRequest(messages=[], chat_template_kwargs={"enable_thinking": True}) - enable_thinking = self.chat_completion_handler._get_thinking_status(request) - self.assertEqual(enable_thinking, True) - - request = ChatCompletionRequest(messages=[], chat_template_kwargs={"enable_thinking": False}) - enable_thinking = self.chat_completion_handler._get_thinking_status(request) - self.assertEqual(enable_thinking, False) - - request = ChatCompletionRequest(messages=[], chat_template_kwargs={"options": {"thinking_mode": "close"}}) - enable_thinking = self.chat_completion_handler._get_thinking_status(request) - self.assertEqual(enable_thinking, False) - - request = ChatCompletionRequest(messages=[], chat_template_kwargs={"options": {"thinking_mode": "false"}}) - enable_thinking = self.chat_completion_handler._get_thinking_status(request) - self.assertEqual(enable_thinking, False) - - request = ChatCompletionRequest(messages=[], chat_template_kwargs={"options": {"thinking_mode": "open"}}) - enable_thinking = self.chat_completion_handler._get_thinking_status(request) - self.assertEqual(enable_thinking, True) - - request = ChatCompletionRequest(messages=[], chat_template_kwargs={"options": {"thinking_mode": "123"}}) - enable_thinking = self.chat_completion_handler._get_thinking_status(request) - self.assertEqual(enable_thinking, True) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/input/test_ernie4_5_processor.py b/tests/input/test_ernie4_5_processor.py index 8c7386fef85..ebe4daf744a 100644 --- a/tests/input/test_ernie4_5_processor.py +++ b/tests/input/test_ernie4_5_processor.py @@ -145,6 +145,7 @@ def _make_processor(self, reasoning=False, tool=False): tool_cls = MockToolParser if tool else None proc = Ernie4_5Processor("dummy-model", reasoning_parser_obj=reasoning_cls, tool_parser_obj=tool_cls) proc._apply_default_parameters = lambda req: req + proc.model_status_dict = {"req-1": "think_start"} return proc def test_update_bad_words(self): diff --git a/tests/reasoning/test_reasoning_parser.py b/tests/reasoning/test_reasoning_parser.py index 1f3fb696dac..b4899e46bf6 100644 --- a/tests/reasoning/test_reasoning_parser.py +++ b/tests/reasoning/test_reasoning_parser.py @@ -395,6 +395,7 @@ def test_streaming_non_reasoning(self): previous_token_ids=[], current_token_ids=[200], delta_token_ids=[200], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertEqual(result.reasoning_content, "a") @@ -408,6 +409,7 @@ def test_streaming_with_reasoning(self): previous_token_ids=[200, 201], current_token_ids=[200, 201, 100], delta_token_ids=[100], + model_status="think_start", ) self.assertIsNone(result) @@ -419,6 +421,7 @@ def test_streaming_with_reasoning_and_content(self): previous_token_ids=[200, 201], current_token_ids=[200, 201, 100, 300, 400], delta_token_ids=[100, 300, 400], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertIsNone(result.reasoning_content) @@ -432,6 +435,7 @@ def test_streaming_with_reasoning_new_line(self): previous_token_ids=[200, 201, 202], current_token_ids=[200, 
201, 202, 100], delta_token_ids=[100], + model_status="think_start", ) self.assertIsNone(result) @@ -443,9 +447,10 @@ def test_streaming_with_reasoning_and_tool(self): previous_token_ids=[200, 201, 202], current_token_ids=[200, 201, 202, 100, 200, 101], delta_token_ids=[100, 200, 101], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) - self.assertEqual(result.reasoning_content, "") + self.assertEqual(result.reasoning_content, None) def test_streaming_with_reasoning_and_illegal_tool(self): result = self.parser.extract_reasoning_content_streaming( @@ -455,6 +460,7 @@ def test_streaming_with_reasoning_and_illegal_tool(self): previous_token_ids=[200, 201, 202], current_token_ids=[200, 201, 202, 100, 200, 101], delta_token_ids=[109, 200, 101], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertEqual(result.content, "\n\nhello") @@ -467,6 +473,7 @@ def test_streaming_with_reasoning_no_tool(self): previous_token_ids=[200, 201, 202], current_token_ids=[200, 201, 202, 100, 200, 110], delta_token_ids=[100, 200, 110], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertEqual(result.reasoning_content, "hello") @@ -480,6 +487,7 @@ def test_streaming_reasoning_previous_no_tool(self): previous_token_ids=[100], current_token_ids=[100, 110, 111], delta_token_ids=[110, 111], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertIsNone(result.reasoning_content) @@ -493,52 +501,127 @@ def test_streaming_no_reasoning_previous_tool(self): previous_token_ids=[101], current_token_ids=[101, 110], delta_token_ids=[110], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertEqual(result.reasoning_content, "hello") + def test_think_end_status_streaming(self): + result = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="hello", + delta_text="hello", + previous_token_ids=[101], + current_token_ids=[101, 110], + delta_token_ids=[110], + model_status="think_end", + ) + self.assertIs(result, None) + + result = self.parser.extract_reasoning_content_streaming( + previous_text="hello, ", + current_text="hello, hi", + delta_text="hi", + previous_token_ids=[101], + current_token_ids=[101, 110], + delta_token_ids=[110], + model_status="think_end", + ) + self.assertIsInstance(result, DeltaMessage) + self.assertEqual(result.content, "hi") + + def test_other_status_streaming(self): + result = self.parser.extract_reasoning_content_streaming( + previous_text="hello, ", + current_text="hello, hi", + delta_text="hi", + previous_token_ids=[101], + current_token_ids=[101, 110], + delta_token_ids=[110], + model_status="tool_call_start", + ) + self.assertIs(result, None) + def test_batch_no_think_end(self): reasoning, content = self.parser.extract_reasoning_content( - model_output="direct response", request=self.test_request + model_output="direct response", request=self.test_request, model_status="think_start" ) self.assertEqual(reasoning, "direct response") self.assertEqual(content, "") def test_batch_no_think_end_with_tool(self): reasoning, content = self.parser.extract_reasoning_content( - model_output="direct responseabc", request=self.test_request + model_output="direct responseabc", request=self.test_request, model_status="think_start" ) self.assertEqual(reasoning, "direct responseabc") self.assertEqual(content, "") def test_batch_think_end_normal_content(self): reasoning, content = self.parser.extract_reasoning_content( - 
model_output="reasoning\nresponse", request=self.test_request + model_output="reasoning\nresponse", request=self.test_request, model_status="think_start" ) self.assertEqual(reasoning, "reasoning") self.assertEqual(content, "\nresponse") def test_batch_think_end_with_tool(self): reasoning, content = self.parser.extract_reasoning_content( - model_output="reasoning\ntool params", request=self.test_request + model_output="reasoning\ntool params", + request=self.test_request, + model_status="think_start", ) self.assertEqual(reasoning, "reasoning") self.assertEqual(content, "") def test_batch_think_end_with_illegal_tool(self): reasoning, content = self.parser.extract_reasoning_content( - model_output="reasoning\nABC\ntool params", request=self.test_request + model_output="reasoning\nABC\ntool params", + request=self.test_request, + model_status="think_start", ) self.assertEqual(reasoning, "reasoning") self.assertEqual(content, "\nABC\ntool params") def test_batch_think_end_content_with_newline(self): reasoning, content = self.parser.extract_reasoning_content( - model_output="reasoning\n\n actual response", request=self.test_request + model_output="reasoning\n\n actual response", + request=self.test_request, + model_status="think_start", ) self.assertEqual(reasoning, "reasoning") self.assertEqual(content, "\n\n actual response") + def test_think_end_status_non_streaming(self): + reasoning, content = self.parser.extract_reasoning_content( + model_output="response", request=self.test_request, model_status="think_end" + ) + self.assertEqual(reasoning, "") + self.assertEqual(content, "response") + + reasoning, content = self.parser.extract_reasoning_content( + model_output="response", request=self.test_request, model_status="think_end" + ) + self.assertEqual(reasoning, "") + self.assertEqual(content, "") + + reasoning, content = self.parser.extract_reasoning_content( + model_output="\n 1response", request=self.test_request, model_status="think_end" + ) + self.assertEqual(reasoning, "") + self.assertEqual(content, "\n 1response") + + def test_other_status_non_streaming(self): + reasoning, content = self.parser.extract_reasoning_content( + model_output="response", request=self.test_request, model_status="tool_call_start" + ) + self.assertEqual(reasoning, "") + self.assertEqual(content, "") + + reasoning, content = self.parser.extract_reasoning_content( + model_output="response", request=self.test_request, model_status="tool_call_end" + ) + self.assertEqual(reasoning, "") + self.assertEqual(content, "") + class TestErnieVLReasoningParser(unittest.TestCase): def setUp(self): @@ -556,6 +639,7 @@ def test_extract_reasoning_content_stream(self): previous_token_ids=[200, 201, 202], current_token_ids=[200, 201, 202, 100, 110, 120, 130], delta_token_ids=[100, 110, 120, 130], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertEqual(result.reasoning_content, "") @@ -569,6 +653,7 @@ def test_extract_reasoning_content_stream_think_in_previous(self): previous_token_ids=[200, 201, 202, 100], current_token_ids=[200, 201, 202, 100, 110, 120, 130], delta_token_ids=[110, 120, 130], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertIsNone(result.reasoning_content) @@ -582,6 +667,7 @@ def test_extract_reasoning_content_stream_no_think_token(self): previous_token_ids=[200, 201, 202], current_token_ids=[200, 201, 202, 110, 120, 130], delta_token_ids=[110, 120, 130], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) 
self.assertIsNone(result.content) @@ -589,7 +675,7 @@ def test_extract_reasoning_content_stream_no_think_token(self): def test_extract_reasoning_content(self): reasoning, content = self.parser.extract_reasoning_content( - model_output="reasoning\nactual response", request=self.test_request + model_output="reasoning\nactual response", request=self.test_request, model_status="think_start" ) self.assertEqual(reasoning, "reasoning") self.assertEqual(content, "\nactual response") From d3171a2fb7c4666f4a79d3eab515d52e47262728 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Thu, 20 Nov 2025 20:55:58 +0800 Subject: [PATCH 31/32] fix unit test --- tests/input/test_ernie4_5_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/input/test_ernie4_5_processor.py b/tests/input/test_ernie4_5_processor.py index ebe4daf744a..8ccb4e60f9c 100644 --- a/tests/input/test_ernie4_5_processor.py +++ b/tests/input/test_ernie4_5_processor.py @@ -73,6 +73,7 @@ def extract_reasoning_content_streaming( previous_token_ids, all_token_ids, delta_token_ids, + model_status, ): """Return a simple object with reasoning_content to cover reasoning branch.""" From 4317e15ad035c35bf522e022fe01ecaaf1344f06 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Wed, 26 Nov 2025 19:04:16 +0800 Subject: [PATCH 32/32] fix n --- fastdeploy/input/ernie4_5_processor.py | 29 ++++++++++++++----- .../ernie4_5_vl_processor.py | 13 ++++++--- .../paddleocr_vl_processor.py | 13 +++++++++ .../qwen_vl_processor/qwen_vl_processor.py | 13 ++++++--- fastdeploy/input/text_processor.py | 26 ++++++++++++----- 5 files changed, 70 insertions(+), 24 deletions(-) diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py index edd21796bc2..a095e5af6ef 100644 --- a/fastdeploy/input/ernie4_5_processor.py +++ b/fastdeploy/input/ernie4_5_processor.py @@ -153,11 +153,16 @@ def process_request(self, request, max_model_len=None, **kwargs): if request.get("top_p") < _SAMPLING_EPS: request.set("top_p", _SAMPLING_EPS) if self.reasoning_parser: - real_req_id = request.request_id.split("_")[0] - n = request.get("n", 1) model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) - for idx in range(n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + parts = request.request_id.split("_") + if len(parts) > 1: + real_req_id = parts[0] + index = int(parts[1]) + n = request.get("n", 1) + for idx in range(index * n, (index + 1) * n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + else: + self.model_status_dict[request.request_id] = model_status request.enable_thinking = model_status == "think_start" data_processor_logger.info(f"Processed request: {request}") @@ -235,12 +240,18 @@ def process_request_dict(self, request, max_model_len=None): request["temperature"] = 1 if request.get("top_p") < _SAMPLING_EPS: request["top_p"] = _SAMPLING_EPS + if self.reasoning_parser: - real_req_id = request["request_id"].split("_")[0] model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) - n = request.get("n", 1) - for idx in range(n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + parts = request["request_id"].split("_") + if len(parts) > 1: + real_req_id = parts[0] + index = int(parts[1]) + n = request.get("n", 1) + for idx in range(index * n, (index + 1) * n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + else: + self.model_status_dict[request["request_id"]] = model_status 
request["enable_thinking"] = model_status == "think_start" data_processor_logger.info(f"Processed request dict: {request}") return request @@ -341,6 +352,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): del self.decode_status[req_id] if req_id in self.model_status_dict: del self.model_status_dict[req_id] + print(self.model_status_dict) return response_dict def process_response_dict_streaming(self, response_dict, **kwargs): @@ -399,6 +411,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): del self.tool_parser_dict[req_id] if req_id in self.model_status_dict: del self.model_status_dict[req_id] + print(self.model_status_dict) return response_dict def messages2ids(self, request_or_messages, **kwargs): diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py index f164d095fcd..133bc1576e3 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py +++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py @@ -274,11 +274,16 @@ def process_request_dict(self, request, max_model_len=None): data_processor_logger.info(f"Processed request {request}") if self.reasoning_parser: - real_req_id = request["request_id"].split("_")[0] model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) - n = request.get("n", 1) - for idx in range(n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + parts = request["request_id"].split("_") + if len(parts) > 1: + real_req_id = parts[0] + index = int(parts[1]) + n = request.get("n", 1) + for idx in range(index * n, (index + 1) * n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + else: + self.model_status_dict[request["request_id"]] = model_status request["enable_thinking"] = model_status == "think_start" if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS: request["top_p"] = _SAMPLING_EPS diff --git a/fastdeploy/input/paddleocr_vl_processor/paddleocr_vl_processor.py b/fastdeploy/input/paddleocr_vl_processor/paddleocr_vl_processor.py index a5335fd0c39..5dfdce976de 100644 --- a/fastdeploy/input/paddleocr_vl_processor/paddleocr_vl_processor.py +++ b/fastdeploy/input/paddleocr_vl_processor/paddleocr_vl_processor.py @@ -256,6 +256,19 @@ def process_request_dict(self, request, max_model_len=None): if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS: request["top_p"] = _SAMPLING_EPS + if self.reasoning_parser: + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + parts = request["request_id"].split("_") + if len(parts) > 1: + real_req_id = parts[0] + index = int(parts[1]) + n = request.get("n", 1) + for idx in range(index * n, (index + 1) * n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + else: + self.model_status_dict[request["request_id"]] = model_status + request["enable_thinking"] = model_status == "think_start" + return request def append_generated_tokens(self, multimodal_inputs, generated_token_ids): diff --git a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py index cda49092c86..af965b1dc62 100644 --- a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py +++ b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py @@ -270,11 +270,16 @@ def process_request_dict(self, request, max_model_len=None): if request.get("max_tokens") is None: request["max_tokens"] = max(1, max_model_len - 
len(request["prompt_token_ids"])) # Ensure at least 1 token if self.reasoning_parser: - real_req_id = request["request_id"].split("_")[0] model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) - n = request.get("n", 1) - for idx in range(n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + parts = request["request_id"].split("_") + if len(parts) > 1: + real_req_id = parts[0] + index = int(parts[1]) + n = request.get("n", 1) + for idx in range(index * n, (index + 1) * n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + else: + self.model_status_dict[request["request_id"]] = model_status request["enable_thinking"] = model_status == "think_start" data_processor_logger.info(f"Processed request {request}") diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index 138eb59f171..ae85bddc8e8 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -269,11 +269,16 @@ def process_request(self, request, max_model_len=None, **kwargs): if request.get("top_p") < _SAMPLING_EPS: request.set("top_p", _SAMPLING_EPS) if self.reasoning_parser: - real_req_id = request.request_id.split("_")[0] - n = request.get("n", 1) model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) - for idx in range(n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + parts = request.request_id.split("_") + if len(parts) > 1: + real_req_id = parts[0] + index = int(parts[1]) + n = request.get("n", 1) + for idx in range(index * n, (index + 1) * n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + else: + self.model_status_dict[request.request_id] = model_status request.enable_thinking = model_status == "think_start" data_processor_logger.info(f"Processed request: {request}") @@ -350,11 +355,16 @@ def process_request_dict(self, request, max_model_len=None, **kwargs): if request.get("top_p") < _SAMPLING_EPS: request["top_p"] = _SAMPLING_EPS if self.reasoning_parser: - real_req_id = request["request_id"].split("_")[0] model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) - n = request.get("n", 1) - for idx in range(n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + parts = request["request_id"].split("_") + if len(parts) > 1: + real_req_id = parts[0] + index = int(parts[1]) + n = request.get("n", 1) + for idx in range(index * n, (index + 1) * n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + else: + self.model_status_dict[request["request_id"]] = model_status request["enable_thinking"] = model_status == "think_start" data_processor_logger.info(f"Processed request dict: {request}")