diff --git a/fastdeploy/entrypoints/openai/response_processors.py b/fastdeploy/entrypoints/openai/response_processors.py index 95a5e3ec404..469f31dda4c 100644 --- a/fastdeploy/entrypoints/openai/response_processors.py +++ b/fastdeploy/entrypoints/openai/response_processors.py @@ -68,13 +68,12 @@ def accumulate_token_ids(self, request_output): else: self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output}) - async def process_response_chat(self, request_outputs, stream, enable_thinking, include_stop_str_in_output): + async def process_response_chat(self, request_outputs, stream, include_stop_str_in_output): """ Process a list of responses into a generator that yields each processed response as it's generated. Args: request_outputs: The list of outputs to be processed. stream: Whether or not to stream the output. - enable_thinking: Whether or not to show thinking messages. include_stop_str_in_output: Whether or not to include stop strings in the output. """ for request_output in request_outputs: @@ -83,7 +82,6 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking, yield self.data_processor.process_response_dict( response_dict=request_output, stream=stream, - enable_thinking=enable_thinking, include_stop_str_in_output=include_stop_str_in_output, ) elif stream: @@ -111,7 +109,6 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking, self.data_processor.process_response_dict( response_dict=request_output, stream=stream, - enable_thinking=enable_thinking, include_stop_str_in_output=include_stop_str_in_output, ) text = {"type": "text", "text": request_output["outputs"]["text"]} @@ -132,7 +129,6 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking, self.data_processor.process_response_dict( response_dict=part["request_output"], stream=False, - enable_thinking=enable_thinking, include_stop_str_in_output=include_stop_str_in_output, ) text = {"type": "text", "text": part["request_output"]["outputs"]["text"]} diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 20846131233..aa761130c24 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -210,8 +210,6 @@ async def chat_completion_stream_generator( max_streaming_response_tokens = max(1, max_streaming_response_tokens) - enable_thinking = self._get_thinking_status(request) - include_stop_str_in_output = request.include_stop_str_in_output stream_options = request.stream_options @@ -267,7 +265,6 @@ async def chat_completion_stream_generator( generator = response_processor.process_response_chat( response, stream=True, - enable_thinking=enable_thinking, include_stop_str_in_output=include_stop_str_in_output, ) @@ -469,7 +466,6 @@ async def chat_completion_full_generator( """ created_time = int(time.time()) num_choices = 1 if request.n is None else request.n - enable_thinking = self._get_thinking_status(request) include_stop_str_in_output = request.include_stop_str_in_output try: @@ -523,7 +519,6 @@ async def chat_completion_full_generator( generator = response_processor.process_response_chat( response, stream=False, - enable_thinking=enable_thinking, include_stop_str_in_output=include_stop_str_in_output, ) async for data in generator: @@ -757,20 +752,3 @@ def _build_logprobs_response( error_msg = f"Error in _build_logprobs_response: {e}, {str(traceback.format_exc())}" api_server_logger.error(error_msg) return None - - def 
_get_thinking_status(self, request: ChatCompletionRequest) -> bool: - """ - Get the thinking status from the request. - """ - enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None - if enable_thinking is None: - enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None - options = request.chat_template_kwargs.get("options") if request.chat_template_kwargs else None - if options: - thinking_mode = options.get("thinking_mode") - if thinking_mode: - if thinking_mode == "close" or thinking_mode == "false": - enable_thinking = False - else: - enable_thinking = True - return enable_thinking diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py index a151dbfdd6d..edd21796bc2 100644 --- a/fastdeploy/input/ernie4_5_processor.py +++ b/fastdeploy/input/ernie4_5_processor.py @@ -60,6 +60,7 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob self.decode_status = dict() self.tool_parser_dict = dict() self.thinking_parser_dict = dict() + self.model_status_dict = dict() self._load_tokenizer() data_processor_logger.info( f"tokenizer information: bos_token is {self.tokenizer.bos_token} \ @@ -151,8 +152,13 @@ def process_request(self, request, max_model_len=None, **kwargs): request.set("temperature", 1) if request.get("top_p") < _SAMPLING_EPS: request.set("top_p", _SAMPLING_EPS) - if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser": - request.enable_thinking = True + if self.reasoning_parser: + real_req_id = request.request_id.split("_")[0] + n = request.get("n", 1) + model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) + for idx in range(n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + request.enable_thinking = model_status == "think_start" data_processor_logger.info(f"Processed request: {request}") return request @@ -229,9 +235,13 @@ def process_request_dict(self, request, max_model_len=None): request["temperature"] = 1 if request.get("top_p") < _SAMPLING_EPS: request["top_p"] = _SAMPLING_EPS - if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser": - request["enable_thinking"] = True - + if self.reasoning_parser: + real_req_id = request["request_id"].split("_")[0] + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + n = request.get("n", 1) + for idx in range(n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + request["enable_thinking"] = model_status == "think_start" data_processor_logger.info(f"Processed request dict: {request}") return request @@ -253,7 +263,11 @@ def process_response(self, response_dict, **kwargs): token_ids = token_ids[:-1] full_text = self.tokenizer.decode(token_ids) if self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) + reasoning_content, text = self.reasoning_parser.extract_reasoning_content( + full_text, + response_dict, + self.model_status_dict[req_id], + ) response_dict.outputs.text = text response_dict.outputs.reasoning_content = reasoning_content else: @@ -264,6 +278,8 @@ def process_response(self, response_dict, **kwargs): if tool_call_info.tools_called: response_dict.outputs.tool_calls = tool_call_info.tool_calls response_dict.outputs.text = tool_call_info.content + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] 
data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}") if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "": return None @@ -294,7 +310,6 @@ def process_response_dict_normal(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") token_ids = response_dict["outputs"]["token_ids"] is_end = response_dict["finished"] req_id = response_dict["request_id"] @@ -304,16 +319,17 @@ def process_response_dict_normal(self, response_dict, **kwargs): delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id) if is_end: full_text = previous_texts + delta_text - if self.reasoning_parser and ( - enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser" - ): - reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) + response_dict["outputs"]["text"] = full_text + if self.reasoning_parser: + reasoning_content, text = self.reasoning_parser.extract_reasoning_content( + full_text, + response_dict, + self.model_status_dict[req_id], + ) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content reasoning_tokens = self.tokenizer.tokenize(reasoning_content) response_dict["outputs"]["reasoning_token_num"] = len(reasoning_tokens) - else: - response_dict["outputs"]["text"] = full_text if self.tool_parser_obj: tool_parser = self.tool_parser_obj(self.tokenizer) tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict) @@ -323,6 +339,8 @@ def process_response_dict_normal(self, response_dict, **kwargs): response_dict["outputs"]["completion_tokens"] = full_text data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") del self.decode_status[req_id] + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] return response_dict def process_response_dict_streaming(self, response_dict, **kwargs): @@ -335,7 +353,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") is_end = response_dict["finished"] req_id = response_dict["request_id"] token_ids = response_dict["outputs"]["token_ids"] @@ -345,9 +362,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) response_dict["outputs"]["completion_tokens"] = delta_text - if self.reasoning_parser and ( - enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser" - ): + if self.reasoning_parser: reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming( previous_texts, previous_texts + delta_text, @@ -355,6 +370,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): previous_token_ids, previous_token_ids + token_ids, token_ids, + self.model_status_dict[req_id], ) response_dict["outputs"]["delta_message"] = reasoning_delta_message reasoning_content = reasoning_delta_message.reasoning_content if reasoning_delta_message else None @@ -381,6 +397,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs): del self.decode_status[req_id] if req_id in self.tool_parser_dict: del self.tool_parser_dict[req_id] + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] return response_dict def messages2ids(self, request_or_messages, **kwargs): 
diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py index 0fe724af53f..f164d095fcd 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py +++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py @@ -58,6 +58,7 @@ def __init__( self.tool_parser_dict = dict() self.decode_status = dict() + self.model_status_dict = dict() self._load_tokenizer() # Generation config @@ -272,6 +273,13 @@ def process_request_dict(self, request, max_model_len=None): request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) data_processor_logger.info(f"Processed request {request}") + if self.reasoning_parser: + real_req_id = request["request_id"].split("_")[0] + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + n = request.get("n", 1) + for idx in range(n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + request["enable_thinking"] = model_status == "think_start" if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS: request["top_p"] = _SAMPLING_EPS @@ -307,21 +315,3 @@ def pack_outputs(self, outs): outs["position_ids"] = np.array(outs["position_ids"], dtype=np.int64) return outs - - def process_response_dict(self, response_dict, stream, **kwargs): - """ - Preprocess the response - - Args: - response_dict (Dict): response for engine, contain ids fields - - Returns: - Dict: response contain text fields - """ - enable_thinking = kwargs.pop("enable_thinking", True) - if enable_thinking is None: - enable_thinking = True - if stream: - return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) - else: - return self.process_response_dict_normal(response_dict, enable_thinking=enable_thinking, **kwargs) diff --git a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py index 06f43f335ae..cda49092c86 100644 --- a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py +++ b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py @@ -269,6 +269,13 @@ def process_request_dict(self, request, max_model_len=None): # Set default max_tokens if not specified if request.get("max_tokens") is None: request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"])) # Ensure at least 1 token + if self.reasoning_parser: + real_req_id = request["request_id"].split("_")[0] + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + n = request.get("n", 1) + for idx in range(n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + request["enable_thinking"] = model_status == "think_start" data_processor_logger.info(f"Processed request {request}") return request diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index 066909f361e..138eb59f171 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -175,6 +175,7 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob self.generation_config = None self.decode_status = dict() + self.model_status_dict = dict() self.tool_parser_dict = dict() self.tokenizer = self._load_tokenizer() data_processor_logger.info( @@ -267,6 +268,13 @@ def process_request(self, request, max_model_len=None, **kwargs): request.set("temperature", 1) if request.get("top_p") < _SAMPLING_EPS: request.set("top_p", _SAMPLING_EPS) + if self.reasoning_parser: + 
real_req_id = request.request_id.split("_")[0] + n = request.get("n", 1) + model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) + for idx in range(n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + request.enable_thinking = model_status == "think_start" data_processor_logger.info(f"Processed request: {request}") return request @@ -341,6 +349,13 @@ def process_request_dict(self, request, max_model_len=None, **kwargs): request["temperature"] = 1 if request.get("top_p") < _SAMPLING_EPS: request["top_p"] = _SAMPLING_EPS + if self.reasoning_parser: + real_req_id = request["request_id"].split("_")[0] + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + n = request.get("n", 1) + for idx in range(n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + request["enable_thinking"] = model_status == "think_start" data_processor_logger.info(f"Processed request dict: {request}") return request @@ -364,21 +379,21 @@ def process_response(self, response_dict, **kwargs): if token_ids[-1] == self.tokenizer.eos_token_id: token_ids = token_ids[:-1] full_text = self.tokenizer.decode(token_ids) - - # 模型支持思考,并且支持思考 + response_dict.outputs.text = full_text if self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) + reasoning_content, text = self.reasoning_parser.extract_reasoning_content( + full_text, response_dict, self.model_status_dict[req_id] + ) response_dict.outputs.text = text response_dict.outputs.reasoning_content = reasoning_content - else: - # 模型不支持思考,并且没单独设置enable_thinking为false - response_dict.outputs.text = full_text if self.tool_parser_obj: tool_parser = self.tool_parser_obj(self.tokenizer) tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict) if tool_call_info.tools_called: response_dict.outputs.tool_calls = tool_call_info.tool_calls response_dict.outputs.text = tool_call_info.content + if req_id in self.model_status_dict: + del self.model_status_dict[req_id] data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}") return response_dict @@ -393,7 +408,6 @@ def process_response_dict_normal(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") token_ids = response_dict["outputs"]["token_ids"] is_end = response_dict["finished"] req_id = response_dict["request_id"] @@ -404,14 +418,17 @@ def process_response_dict_normal(self, response_dict, **kwargs): if is_end: full_text = previous_texts + delta_text response_dict["outputs"]["completion_tokens"] = full_text - if enable_thinking and self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) + response_dict["outputs"]["text"] = full_text + if self.reasoning_parser: + reasoning_content, text = self.reasoning_parser.extract_reasoning_content( + full_text, + response_dict, + self.model_status_dict[req_id], + ) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content reasoning_tokens = self.tokenizer.tokenize(reasoning_content) response_dict["outputs"]["reasoning_token_num"] = len(reasoning_tokens) - else: - response_dict["outputs"]["text"] = full_text if self.tool_parser_obj: tool_parser = self.tool_parser_obj(self.tokenizer) tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict) @@ -420,6 +437,8 @@ def process_response_dict_normal(self, response_dict, 
**kwargs):
                     response_dict["outputs"]["text"] = tool_call_info.content
             data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
             del self.decode_status[req_id]
+            if req_id in self.model_status_dict:
+                del self.model_status_dict[req_id]
         return response_dict
 
     def process_response_dict_streaming(self, response_dict, **kwargs):
@@ -432,7 +451,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
         Returns:
             Dict: response contain text fields
         """
-        enable_thinking = kwargs.get("enable_thinking")
         is_end = response_dict["finished"]
         req_id = response_dict["request_id"]
         token_ids = response_dict["outputs"]["token_ids"]
@@ -442,9 +460,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
             token_ids = token_ids[:-1]
         delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
         response_dict["outputs"]["completion_tokens"] = delta_text
-        if self.reasoning_parser and (
-            enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
-        ):
+        if self.reasoning_parser:
             reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
                 previous_texts,
                 previous_texts + delta_text,
@@ -452,6 +468,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
                 previous_token_ids,
                 previous_token_ids + token_ids,
                 token_ids,
+                self.model_status_dict[req_id],
             )
             response_dict["outputs"]["delta_message"] = reasoning_delta_message
             reasoning_content = reasoning_delta_message.reasoning_content if reasoning_delta_message else None
@@ -478,6 +495,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
             del self.decode_status[req_id]
             if req_id in self.tool_parser_dict:
                 del self.tool_parser_dict[req_id]
+            if req_id in self.model_status_dict:
+                del self.model_status_dict[req_id]
         return response_dict
 
     def process_response_dict(self, response_dict, **kwargs):
@@ -490,16 +509,12 @@ def process_response_dict(self, response_dict, **kwargs):
         Returns:
             Dict: response contain text fields
         """
-        enable_thinking = kwargs.pop("enable_thinking", True)
-        if enable_thinking is None:
-            enable_thinking = True
         stream = kwargs.get("stream", True)
         if stream:
-            return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs)
+            return self.process_response_dict_streaming(response_dict, **kwargs)
         else:
             return self.process_response_dict_normal(
                 response_dict=response_dict,
-                enable_thinking=enable_thinking,
                 **kwargs,
             )
diff --git a/fastdeploy/reasoning/ernie_45_vl_thinking_reasoning_parser.py b/fastdeploy/reasoning/ernie_45_vl_thinking_reasoning_parser.py
index 939a0a4348b..fa394545802 100644
--- a/fastdeploy/reasoning/ernie_45_vl_thinking_reasoning_parser.py
+++ b/fastdeploy/reasoning/ernie_45_vl_thinking_reasoning_parser.py
@@ -35,25 +35,53 @@ class Ernie45VLThinkingReasoningParser(ReasoningParser):
 
     def __init__(self, tokenizer):
         super().__init__(tokenizer)
-        self.think_end_token = "</think>"
-        self.tool_begin_token = "<tool_call>"
+        token_definitions = {
+            "think_start_token": "<think>",
+            "think_end_token": "</think>",
+            "tool_call_start_token": "<tool_call>",
+            "tool_call_end_token": "</tool_call>",
+        }
 
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ReasoningParser " "constructor during construction."
) - - self.think_end_token_id = self.vocab.get(self.think_end_token) - self.tool_begin_token_id = self.vocab.get(self.tool_begin_token) - if self.tool_begin_token_id is None: - self.tool_begin_token_id = -1 - - if self.think_end_token_id is None: - raise RuntimeError("Test reasoning parser could not locate think end tokens in the tokenizer!") + missing_tokens = [] + for name, token_value in token_definitions.items(): + setattr(self, name, token_value) + token_id = self.vocab.get(token_value) + setattr(self, f"{name}_id", token_id) + if token_id is None: + missing_tokens.append(f"{name.replace('_', ' ')} token") + + if missing_tokens: + raise RuntimeError( + f"ernie vl reasoning parser could not find the following token ids in tokenizer vocabulary: {', '.join(missing_tokens)}" + ) + self.token_status_mapping = { + self.think_start_token_id: "think_start", + self.think_end_token_id: "think_end", + self.tool_call_start_token_id: "tool_call_start", + self.tool_call_end_token_id: "tool_call_end", + } def is_reasoning_end(self, input_ids: list[int]) -> bool: return self.think_end_token_id in input_ids + def find_last_special_token(self, prompt_token_ids: list[int]) -> int: + for i in range(len(prompt_token_ids) - 1, -1, -1): + if prompt_token_ids[i] in self.token_status_mapping: + return prompt_token_ids[i] + return -1 + + def get_model_status(self, prompt_token_ids: list[int]): + special_token_id = self.find_last_special_token(prompt_token_ids) + + if special_token_id == -1: + return "think_start" + + return self.token_status_mapping[special_token_id] + def extract_reasoning_content_streaming( self, previous_text: str, @@ -62,6 +90,7 @@ def extract_reasoning_content_streaming( previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], + model_status: str, ) -> Union[DeltaMessage, None]: """ Extract reasoning content from a delta message. 
@@ -71,36 +100,46 @@ def extract_reasoning_content_streaming( - 'abc' goes to reasoning_content - 'xyz' goes to content """ - if self.think_end_token not in current_text: - return DeltaMessage(reasoning_content=delta_text) - # Skip single special tokens - if len(delta_token_ids) == 1 and delta_token_ids[0] == self.think_end_token_id: - return None - if self._is_with_tool(current_text=current_text, current_token_ids=current_token_ids): + if model_status == "think_start": + if self.think_end_token not in current_text: + return DeltaMessage(reasoning_content=delta_text) + # Skip single special tokens + if len(delta_token_ids) == 1 and delta_token_ids[0] == self.think_end_token_id: + return None + if self._is_with_tool(current_text=current_text, current_token_ids=current_token_ids): + if self.think_end_token in delta_text: + think_begin = delta_text.find(self.think_end_token) + reasoning_content = delta_text[:think_begin] + return DeltaMessage(reasoning_content=reasoning_content) + return None if self.think_end_token in delta_text: - think_begin = delta_text.find(self.think_end_token) - reasoning_content = delta_text[:think_begin] - return DeltaMessage(reasoning_content=reasoning_content) + reasoning_content, _, content = delta_text.partition(self.think_end_token) + striped_content = content.strip("\n") + if len(striped_content) == 0: + return DeltaMessage(reasoning_content=reasoning_content) if reasoning_content else None + return ( + DeltaMessage(reasoning_content=reasoning_content, content=content) + if reasoning_content + else DeltaMessage(content=content) + ) + think_end = current_text.find(self.think_end_token) + len(self.think_end_token) + suffix = current_text[think_end:] + striped_suffix = suffix.strip("\n") + if len(striped_suffix) == 0: + return None + return DeltaMessage(content=delta_text) + elif model_status == "think_end": + if current_text.lstrip("\n").startswith(self.tool_call_start_token): + return None + return DeltaMessage(content=delta_text) + else: return None - if self.think_end_token in delta_text: - reasoning_content, _, content = delta_text.partition(self.think_end_token) - striped_content = content.strip("\n") - if len(striped_content) == 0: - return DeltaMessage(reasoning_content=reasoning_content) if reasoning_content else None - return ( - DeltaMessage(reasoning_content=reasoning_content, content=content) - if reasoning_content - else DeltaMessage(content=content) - ) - think_end = current_text.find(self.think_end_token) + len(self.think_end_token) - suffix = current_text[think_end:] - striped_suffix = suffix.strip("\n") - if len(striped_suffix) == 0: - return None - return DeltaMessage(content=delta_text) def extract_reasoning_content( - self, model_output: str, request: ChatCompletionRequest + self, + model_output: str, + request: ChatCompletionRequest, + model_status: str, ) -> tuple[Optional[str], Optional[str]]: """ Extract reasoning content from the model output. @@ -114,23 +153,30 @@ def extract_reasoning_content( """ # Check if the model output contains the tokens. 
-        if self.think_end_token not in model_output:
-            return model_output, ""
-        reasoning_content, _, content = model_output.partition(self.think_end_token)
-        if self.tool_begin_token in content:
-            prefix, _, _ = content.partition(self.tool_begin_token)
-            prefix_strip = prefix.lstrip("\n")
-            if len(prefix_strip) > 0:
-                return reasoning_content, content
-            return reasoning_content, ""
-        return reasoning_content, content
+        if model_status == "think_start":
+            if self.think_end_token not in model_output:
+                return model_output, ""
+            reasoning_content, _, content = model_output.partition(self.think_end_token)
+            if self.tool_call_start_token in content:
+                prefix, _, _ = content.partition(self.tool_call_start_token)
+                prefix_strip = prefix.lstrip("\n")
+                if len(prefix_strip) > 0:
+                    return reasoning_content, content
+                return reasoning_content, ""
+            return reasoning_content, content
+        elif model_status == "think_end":
+            if model_output.lstrip("\n").startswith(self.tool_call_start_token):
+                return "", ""
+            return "", model_output
+        else:
+            return "", ""
 
     def _is_with_tool(self, current_text: str, current_token_ids: Sequence[int]) -> bool:
         think_end_index = current_text.find(self.think_end_token)
         think_end = think_end_index + len(self.think_end_token)
         middle_str = current_text[think_end:]
-        if self.tool_begin_token_id in current_token_ids:
-            prefix, _, _ = middle_str.partition(self.tool_begin_token)
+        if self.tool_call_start_token_id in current_token_ids:
+            prefix, _, _ = middle_str.partition(self.tool_call_start_token)
             striped_prefix = prefix.strip("\n")
             if len(striped_prefix) > 0:
                 return False
diff --git a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py
index 18e15f18aa3..cafffbb8b08 100644
--- a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py
+++ b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py
@@ -35,20 +35,48 @@ class ErnieVLReasoningParser(ReasoningParser):
 
     def __init__(self, tokenizer):
         super().__init__(tokenizer)
-        self.think_end_token = "</think>"
+        token_definitions = {
+            "think_start_token": "<think>",
+            "think_end_token": "</think>",
+        }
 
         if not self.model_tokenizer:
-            raise ValueError(
-                "The model tokenizer must be passed to the ReasoningParser " "constructor during construction."
- ) + raise ValueError("The model tokenizer must be passed to the ReasoningParser constructor.") + + missing_tokens = [] + for name, token_value in token_definitions.items(): + setattr(self, name, token_value) + token_id = self.vocab.get(token_value) + setattr(self, f"{name}_id", token_id) + if token_id is None: + missing_tokens.append(f"{name.replace('_', ' ')} token") - self.think_end_token_id = self.vocab.get(self.think_end_token) - if self.think_end_token_id is None: - raise RuntimeError("Ernie VL reasoning parser could not locate think end " "tokens in the tokenizer!") + if missing_tokens: + raise RuntimeError( + f"ernie vl reasoning parser could not find the following token ids in tokenizer vocabulary: {', '.join(missing_tokens)}" + ) + self.token_status_mapping = { + self.think_start_token_id: "think_start", + self.think_end_token_id: "think_end", + } def is_reasoning_end(self, input_ids: list[int]) -> bool: return self.think_end_token_id in input_ids + def find_last_special_token(self, prompt_token_ids: list[int]) -> int: + for i in range(len(prompt_token_ids) - 1, -1, -1): + if prompt_token_ids[i] in self.token_status_mapping: + return prompt_token_ids[i] + return -1 + + def get_model_status(self, prompt_token_ids: list[int]): + special_token_id = self.find_last_special_token(prompt_token_ids) + + if special_token_id == -1: + return "think_start" + + return self.token_status_mapping[special_token_id] + def extract_reasoning_content_streaming( self, previous_text: str, @@ -57,6 +85,7 @@ def extract_reasoning_content_streaming( previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], + model_status: str, ) -> Union[DeltaMessage, None]: """ Extract reasoning content from a delta message. @@ -69,18 +98,23 @@ def extract_reasoning_content_streaming( # Skip single special tokens if len(delta_token_ids) == 1 and delta_token_ids[0] == self.think_end_token_id: return None - if self.think_end_token_id in delta_token_ids: - end_index = delta_text.find(self.think_end_token) - reasoning_content = delta_text[:end_index] - content = delta_text[end_index + len(self.think_end_token) :] - return DeltaMessage(reasoning_content=reasoning_content, content=content) - elif self.think_end_token_id in previous_token_ids: - return DeltaMessage(content=delta_text) - else: + if model_status == "think_start": + if self.think_end_token_id in delta_token_ids: + end_index = delta_text.find(self.think_end_token) + reasoning_content = delta_text[:end_index] + content = delta_text[end_index + len(self.think_end_token) :] + return DeltaMessage(reasoning_content=reasoning_content, content=content) + if self.think_end_token_id in previous_token_ids: + return DeltaMessage(content=delta_text) return DeltaMessage(reasoning_content=delta_text) + else: + return DeltaMessage(content=delta_text) def extract_reasoning_content( - self, model_output: str, request: ChatCompletionRequest + self, + model_output: str, + request: ChatCompletionRequest, + model_status: str, ) -> tuple[Optional[str], Optional[str]]: """ Extract reasoning content from the model output. @@ -92,11 +126,12 @@ def extract_reasoning_content( Returns: tuple[Optional[str], Optional[str]]: reasoning content and content """ - # Check if the model output contains the tokens. 
-        if self.think_end_token not in model_output:
+        if model_status == "think_start":
+            if self.think_end_token not in model_output:
+                return "", model_output
+            reasoning_content, _, content = model_output.partition(self.think_end_token)
+            final_content = content or ""
+            return reasoning_content, final_content
+        else:
             return "", model_output
-        reasoning_content, _, content = model_output.partition(self.think_end_token)
-
-        final_content = content or ""
-        return reasoning_content, final_content
diff --git a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py
index 77fc1d5ada8..3bb0cd4fb46 100644
--- a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py
+++ b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py
@@ -18,19 +18,55 @@ class ErnieX1ReasoningParser(ReasoningParser):
 
     def __init__(self, tokenizer):
         super().__init__(tokenizer)
-        self.think_end_token = "</think>"
-        self.response_start_token = "<response>"
-        self.response_end_token = "</response>"
-        self.tool_call_start_token = "<tool_call>"
-        self.tool_call_end_token = "</tool_call>"
+
+        # Define all tokens that need to be checked
+        token_definitions = {
+            "think_start_token": "<think>",
+            "think_end_token": "</think>",
+            "response_start_token": "<response>",
+            "response_end_token": "</response>",
+            "tool_call_start_token": "<tool_call>",
+            "tool_call_end_token": "</tool_call>",
+        }
 
         if not self.model_tokenizer:
             raise ValueError("The model tokenizer must be passed to the ReasoningParser constructor.")
 
-        self.think_end_token_id = self.vocab.get("</think>")
-        if self.think_end_token_id is None:
-            raise RuntimeError("Could not find think end token id in tokenizer vocabulary")
-        self.tool_call_start_token_id = self.vocab.get("<tool_call>")
+        missing_tokens = []
+        for name, token_value in token_definitions.items():
+            setattr(self, name, token_value)
+            token_id = self.vocab.get(token_value)
+            setattr(self, f"{name}_id", token_id)
+            if token_id is None:
+                missing_tokens.append(token_value)
+
+        if missing_tokens:
+            raise RuntimeError(
+                f"ernie x1 reasoning parser could not find the following token ids in tokenizer vocabulary: {', '.join(missing_tokens)}"
+            )
+
+        self.token_status_mapping = {
+            self.think_start_token_id: "think_start",
+            self.think_end_token_id: "think_end",
+            self.response_start_token_id: "response_start",
+            self.response_end_token_id: "response_end",
+            self.tool_call_start_token_id: "tool_call_start",
+            self.tool_call_end_token_id: "tool_call_end",
+        }
+
+    def find_last_special_token(self, prompt_token_ids: list[int]) -> int:
+        for i in range(len(prompt_token_ids) - 1, -1, -1):
+            if prompt_token_ids[i] in self.token_status_mapping:
+                return prompt_token_ids[i]
+        return -1
+
+    def get_model_status(self, prompt_token_ids: list[int]):
+        special_token_id = self.find_last_special_token(prompt_token_ids)
+
+        if special_token_id == -1:
+            return "think_start"
+
+        return self.token_status_mapping[special_token_id]
 
     def extract_reasoning_content_streaming(
         self,
@@ -40,64 +76,82 @@ def extract_reasoning_content_streaming(
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
+        model_status: str,
     ) -> Union[DeltaMessage, None]:
-        # Ignore the single </think> token
-        if len(delta_token_ids) == 1 and delta_token_ids[0] == self.think_end_token_id:
-            return None
-        # --- Thinking stage handling ---
-        if not previous_text.endswith(self.think_end_token) and self.think_end_token not in previous_text:
-            # If delta is </think>, stop thinking, do not return
-            if delta_text.startswith(self.think_end_token):
-                return None
-            # Otherwise, return thinking content (keep \n as-is)
-            return DeltaMessage(reasoning_content=delta_text)
-
-        # --- After thinking ends, check tool_call or response ---
-        remaining_text = previous_text + delta_text
-        after_think = remaining_text[remaining_text.find(self.think_end_token) + len(self.think_end_token) :]
-        after_think = after_think.lstrip("\n")
-
-        # Handle tool_call case: skip it
-        if after_think.startswith(self.tool_call_start_token):
+        if len(delta_token_ids) == 1 and delta_token_ids[0] in [
+            self.think_end_token_id,
+            self.response_start_token_id,
+            self.response_end_token_id,
+            self.tool_call_start_token_id,
+            self.tool_call_end_token_id,
+        ]:
             return None
 
-        # Handle response case
-        if after_think.startswith(self.response_start_token) and self.response_end_token not in after_think:
-            # Do not return when the <response> or </response> tag itself appears
-            if delta_text == self.response_start_token or delta_text == self.response_end_token:
-                return None
-            return DeltaMessage(content=delta_text)
+        if model_status == "think_start":
+            if self.think_end_token in delta_text:
+                response_content = ""
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[:end_index]
+                response_start_pos = delta_text.find(self.response_start_token)
+                if response_start_pos != -1:
+                    response_content = self._extract_response_content(
+                        delta_text[response_start_pos + len(self.response_start_token) :]
+                    )
+                return DeltaMessage(reasoning_content=reasoning_content, content=response_content)
+            elif self.think_end_token in previous_text:
+                if self.response_start_token in previous_text and self.response_end_token not in previous_text:
+                    return DeltaMessage(content=delta_text)
+            else:
+                return DeltaMessage(reasoning_content=delta_text)
+        elif model_status == "think_end":
+            if self.response_start_token in previous_text and self.response_end_token not in previous_text:
+                return DeltaMessage(content=delta_text)
+        elif model_status == "response_start":
+            if self.response_end_token not in previous_text:
+                return DeltaMessage(content=delta_text)
 
-        # Default case: return nothing
         return None
 
-    def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest) -> Tuple[str, str]:
+    def extract_reasoning_content(
+        self, model_output: str, request: ChatCompletionRequest, model_status: str
+    ) -> Tuple[str, str]:
+        """
+        Optimized parser: newlines inside reasoning and response content are preserved,
+        and only the single newline before a closing tag is removed.
+        """
         reasoning_content = ""
         response_content = ""
 
-        think_end_pos = model_output.find(self.think_end_token)
-        if think_end_pos != -1:
-            reasoning_content = model_output[:think_end_pos]
-
-            remaining = model_output[think_end_pos + len(self.think_end_token) :]
+        if model_status in ["think_start", "think_end"]:
+            if model_status == "think_start":
+                think_end_pos = model_output.find(self.think_end_token)
+                if think_end_pos != -1:
+                    reasoning_content = model_output[:think_end_pos]
+                    remaining = model_output[think_end_pos + len(self.think_end_token) :].lstrip("\n")
+                else:
+                    reasoning_content = model_output
+                    remaining = ""
+            else:
+                remaining = model_output.lstrip("\n")
 
-            # find <response> or <tool_call>
-            response_pos = remaining.find(self.response_start_token)
-            tool_pos = remaining.find(self.tool_call_start_token)
+            response_start_pos = remaining.find(self.response_start_token)
+            if response_start_pos != -1:
+                response_content = self._extract_response_content(
+                    remaining[response_start_pos + len(self.response_start_token) :]
+                )
 
-            # <response> first
-            if response_pos != -1 and (tool_pos == -1 or response_pos < tool_pos):
-                # The content after the response_start position
-                remaining_response = remaining[response_pos + len(self.response_start_token) :]
-                response_end_pos = remaining_response.find(self.response_end_token)
-                if response_end_pos != -1:
-                    response_content = remaining_response[:response_end_pos]
-                else:
-                    response_content = remaining_response
-            # The content after the response_start position is tool_call
-        else:
-            reasoning_content = model_output
-            response_content = ""
+        elif model_status == "response_start":
+            response_content = self._extract_response_content(model_output)
 
         return reasoning_content, response_content
+
+    def _extract_response_content(self, remaining: str) -> str:
+        """
+        Extracts response content, ensuring that the last newline before
+        the </response> tag is removed.
+        """
+        response_end_pos = remaining.find(self.response_end_token)
+        if response_end_pos != -1:
+            return remaining[:response_end_pos]
+        return remaining
diff --git a/fastdeploy/reasoning/qwen3_reasoning_parsers.py b/fastdeploy/reasoning/qwen3_reasoning_parsers.py
index 463cab83df3..b01cdf0d692 100644
--- a/fastdeploy/reasoning/qwen3_reasoning_parsers.py
+++ b/fastdeploy/reasoning/qwen3_reasoning_parsers.py
@@ -35,22 +35,50 @@ class Qwen3ReasoningParser(ReasoningParser):
 
     def __init__(self, tokenizer):
         super().__init__(tokenizer)
-        self.think_start_token = "<think>"
-        self.think_end_token = "</think>"
+
+        # Define all tokens that need to be checked
+        token_definitions = {
+            "think_start_token": "<think>",
+            "think_end_token": "</think>",
+        }
 
         if not self.model_tokenizer:
-            raise ValueError(
-                "The model tokenizer must be passed to the ReasoningParser " "constructor during construction."
+            raise ValueError("The model tokenizer must be passed to the ReasoningParser constructor.")
+
+        missing_tokens = []
+        for name, token_value in token_definitions.items():
+            setattr(self, name, token_value)
+            token_id = self.vocab.get(token_value)
+            setattr(self, f"{name}_id", token_id)
+            if token_id is None:
+                missing_tokens.append(token_value)
+
+        if missing_tokens:
+            raise RuntimeError(
+                f"Qwen3 reasoning parser could not find the following token ids in tokenizer vocabulary: {', '.join(missing_tokens)}"
             )
-
-        self.think_start_token_id = self.vocab.get(self.think_start_token)
-        self.think_end_token_id = self.vocab.get(self.think_end_token)
-        if self.think_end_token_id is None:
-            raise RuntimeError("Qwen3 reasoning parser could not locate think end " "tokens in the tokenizer!")
+        self.token_status_mapping = {
+            self.think_start_token_id: "think_start",
+            self.think_end_token_id: "think_end",
+        }
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         return self.think_end_token_id in input_ids
 
+    def find_last_special_token(self, prompt_token_ids: list[int]) -> int:
+        for i in range(len(prompt_token_ids) - 1, -1, -1):
+            if prompt_token_ids[i] in self.token_status_mapping:
+                return prompt_token_ids[i]
+        return -1
+
+    def get_model_status(self, prompt_token_ids: list[int]):
+        special_token_id = self.find_last_special_token(prompt_token_ids)
+
+        if special_token_id == -1:
+            return "think_start"
+
+        return self.token_status_mapping[special_token_id]
+
     def extract_reasoning_content_streaming(
         self,
         previous_text: str,
@@ -59,6 +87,7 @@ def extract_reasoning_content_streaming(
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
+        model_status: str,
     ) -> Union[DeltaMessage, None]:
         """
         Extract reasoning content from a delta message.
@@ -71,39 +100,42 @@ def extract_reasoning_content_streaming(
         if len(delta_token_ids) == 1 and (delta_token_ids[0] in [self.think_start_token_id, self.think_end_token_id]):
             return None
 
-        # </think> in delta
-        if self.think_end_token_id in delta_token_ids:
-            # <think> in delta, </think> in delta, extract reasoning content
-            if self.think_start_token_id in delta_token_ids:
-                start_index = delta_text.find(self.think_start_token)
-                end_index = delta_token_ids.find(self.think_end_token)
-                reasoning_content = delta_text[start_index + len(self.think_start_token) : end_index]
-                content = delta_text[end_index + len(self.think_end_token) :]
-                return DeltaMessage(reasoning_content=reasoning_content, content=content)
-            # <think> in previous, </think> in delta,
-            else:
-                end_index = delta_text.find(self.think_end_token)
-                reasoning_content = delta_text[:end_index]
-                content = delta_text[end_index + len(self.think_end_token) :]
-                content = content if content else None
-                return DeltaMessage(reasoning_content=reasoning_content, content=content)
-        # </think> in previous reasoning content continues
-        elif self.think_end_token_id in previous_token_ids:
-            return DeltaMessage(content=delta_text)
-        # <think> in previous
-        elif self.think_start_token_id in previous_token_ids:
-            return DeltaMessage(reasoning_content=delta_text)
-        # <think> in delta
-        elif self.think_start_token_id in delta_token_ids:
-            start_index = delta_text.find(self.think_start_token)
-            reasoning_content = delta_text[start_index + len(self.think_start_token) :]
-            content = ""
-            return DeltaMessage(reasoning_content=reasoning_content, content=content)
-        else:
-            return DeltaMessage(reasoning_content=delta_text)
+        if model_status == "think_start":
+            # </think> in delta
+            if self.think_end_token_id in delta_token_ids:
+                # <think> in delta, </think> in delta, extract reasoning content
+                if self.think_start_token_id in delta_token_ids:
+                    start_index = delta_text.find(self.think_start_token)
+                    end_index = delta_token_ids.find(self.think_end_token)
+                    reasoning_content = delta_text[start_index + len(self.think_start_token) : end_index]
+                    content = delta_text[end_index + len(self.think_end_token) :]
+                    return DeltaMessage(reasoning_content=reasoning_content, content=content)
+                # <think> in previous, </think> in delta,
+                else:
+                    end_index = delta_text.find(self.think_end_token)
+                    reasoning_content = delta_text[:end_index]
+                    content = delta_text[end_index + len(self.think_end_token) :]
+                    content = content if content else None
+                    return DeltaMessage(reasoning_content=reasoning_content, content=content)
+            # </think> in previous reasoning content continues
+            elif self.think_end_token_id in previous_token_ids:
+                return DeltaMessage(content=delta_text)
+            # <think> in previous
+            elif self.think_start_token_id in previous_token_ids:
+                return DeltaMessage(reasoning_content=delta_text)
+            # <think> in delta
+            elif self.think_start_token_id in delta_token_ids:
+                start_index = delta_text.find(self.think_start_token)
+                reasoning_content = delta_text[start_index + len(self.think_start_token) :]
+                content = ""
+                return DeltaMessage(reasoning_content=reasoning_content, content=content)
+            else:
+                return DeltaMessage(reasoning_content=delta_text)
+        else:
+            return DeltaMessage(content=delta_text)
 
     def extract_reasoning_content(
-        self, model_output: str, request: ChatCompletionRequest
+        self, model_output: str, request: ChatCompletionRequest, model_status: str
     ) -> tuple[Optional[str], Optional[str]]:
         """
         Extract reasoning content from the model output.
@@ -116,36 +148,39 @@ def extract_reasoning_content(
             tuple[Optional[str], Optional[str]]: reasoning content and content
         """
-        # 检查是否包含结束标签
-        if self.think_end_token not in model_output:
-            return None, model_output
-
-        # 检查是否有起始标签
-        if self.think_start_token in model_output:
-            # 标准格式:<think>content</think>answer
-            if self.think_start_token not in model_output or self.think_end_token not in model_output:
-                return None, model_output
-            # Check if the <think> is present in the model output, remove it
-            # if it is present.
-            model_output_parts = model_output.partition(self.think_start_token)
-            model_output = model_output_parts[2] if model_output_parts[1] else model_output_parts[0]
-            # Check if the model output contains the </think> tokens.
-            # If the end token is not found, return the model output as is.
-            if self.think_end_token not in model_output:
-                return None, model_output
-
-            # Extract reasoning content from the model output.
-            reasoning_content, _, content = model_output.partition(self.think_end_token)
-
-            final_content = content or None
-            return reasoning_content, final_content
-        else:
-            # 缺少起始标签的格式:content</think>answer
-            parts = model_output.split(self.think_end_token, 1)
-
-            if len(parts) == 2:
-                reasoning_content = parts[0].strip()
-                final_content = parts[1].strip() if parts[1].strip() else None
-                return reasoning_content, final_content
-
-            return None, model_output
+        if model_status == "think_start":
+            # 检查是否包含结束标签
+            if self.think_end_token not in model_output:
+                return None, model_output
+
+            # 检查是否有起始标签
+            if self.think_start_token in model_output:
+                # 标准格式:<think>content</think>answer
+                if self.think_start_token not in model_output or self.think_end_token not in model_output:
+                    return None, model_output
+                # Check if the <think> is present in the model output, remove it
+                # if it is present.
+                model_output_parts = model_output.partition(self.think_start_token)
+                model_output = model_output_parts[2] if model_output_parts[1] else model_output_parts[0]
+                # Check if the model output contains the </think> tokens.
+                # If the end token is not found, return the model output as is.
+                if self.think_end_token not in model_output:
+                    return None, model_output
+
+                # Extract reasoning content from the model output.
+                reasoning_content, _, content = model_output.partition(self.think_end_token)
+
+                final_content = content or None
+                return reasoning_content, final_content
+            else:
+                # 缺少起始标签的格式:content</think>answer
+                parts = model_output.split(self.think_end_token, 1)
+
+                if len(parts) == 2:
+                    reasoning_content = parts[0].strip()
+                    final_content = parts[1].strip() if parts[1].strip() else None
+                    return reasoning_content, final_content
+
+                return None, model_output
+        else:
+            return None, model_output
diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
index e51018f201e..52c845071d0 100644
--- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
+++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
@@ -482,7 +482,7 @@ def test_chat_with_thinking(openai_client, capsys):
         max_tokens=10,
         extra_body={"chat_template_kwargs": {"enable_thinking": False}},
     )
-    assert response.choices[0].message.reasoning_content is None
+    assert response.choices[0].message.reasoning_content == ""
     assert "</think>" not in response.choices[0].message.content
 
     # test logic
@@ -957,4 +957,4 @@ def test_thinking_logic_flag(openai_client, capsys):
             "chat_template_kwargs": {"enable_thinking": False},
         },
     )
-    assert response_case_3.choices[0].message.reasoning_content is None
+    assert response_case_3.choices[0].message.reasoning_content == ""
diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py
index f93f355a754..e2b4e4e60f8 100644
--- a/tests/e2e/test_EB_VL_Lite_serving.py
+++ b/tests/e2e/test_EB_VL_Lite_serving.py
@@ -545,7 +545,7 @@ def test_chat_with_thinking(openai_client, capsys):
         max_tokens=10,
         extra_body={"chat_template_kwargs": {"enable_thinking": False}},
     )
-    assert response.choices[0].message.reasoning_content is None
+    assert response.choices[0].message.reasoning_content == ""
     assert "</think>" not in response.choices[0].message.content
 
     # test logic
@@ -716,4 +716,4 @@ def test_thinking_logic_flag(openai_client, capsys):
             "chat_template_kwargs": {"enable_thinking": False},
         },
    )
-    assert response_case_3.choices[0].message.reasoning_content is None
+    assert response_case_3.choices[0].message.reasoning_content == ""
diff --git a/tests/e2e/test_EB_VL_Lite_sot_serving.py b/tests/e2e/test_EB_VL_Lite_sot_serving.py
index b2d8add1b0e..b21c99329a5 100644
--- a/tests/e2e/test_EB_VL_Lite_sot_serving.py
+++ b/tests/e2e/test_EB_VL_Lite_sot_serving.py
@@ -312,7 +312,7 @@ def test_chat_with_thinking(openai_client, capsys):
         max_tokens=10,
         extra_body={"chat_template_kwargs": {"enable_thinking": False}},
     )
-    assert response.choices[0].message.reasoning_content is None
+    assert response.choices[0].message.reasoning_content == ""
     assert "</think>" not in response.choices[0].message.content
 
     # test logic
@@ -404,4 +404,4 @@ def test_thinking_logic_flag(openai_client, capsys):
             "chat_template_kwargs": {"enable_thinking": False},
         },
     )
-    assert response_case_3.choices[0].message.reasoning_content is None
+    assert response_case_3.choices[0].message.reasoning_content == ""
diff --git a/tests/entrypoints/openai/test_finish_reason.py b/tests/entrypoints/openai/test_finish_reason.py
index 4bdb3feefc8..d39cf917208 100644
--- a/tests/entrypoints/openai/test_finish_reason.py
+++ b/tests/entrypoints/openai/test_finish_reason.py
@@ -43,6 +43,8 @@ async def asyncSetUp(self):
         self.multi_modal_processor._check_mm_limits = Mock()
         self.multi_modal_processor.append_completion_tokens = Mock()
         self.multi_modal_processor.pack_outputs = lambda x: x
+        
self.multi_modal_processor.reasoning_parser = None + self.multi_modal_processor.model_status_dict = {} self.engine_client = Mock() self.engine_client.connection_initialized = False @@ -242,7 +244,7 @@ async def test_chat_full_max_tokens(self, mock_data_logger, mock_processor_class mock_processor_instance = Mock() mock_processor_instance.enable_multimodal_content.return_value = True - async def mock_process_response_chat_async(response, stream, enable_thinking, include_stop_str_in_output): + async def mock_process_response_chat_async(response, stream, include_stop_str_in_output): yield response mock_processor_instance.process_response_chat = mock_process_response_chat_async @@ -423,7 +425,7 @@ async def test_chat_stream_max_tokens(self, mock_api_logger, mock_processor_clas mock_processor_instance = Mock() mock_processor_instance.enable_multimodal_content.return_value = False - async def mock_process_response_chat_async(response, stream, enable_thinking, include_stop_str_in_output): + async def mock_process_response_chat_async(response, stream, include_stop_str_in_output): if isinstance(response, list): for res in response: yield res diff --git a/tests/entrypoints/openai/test_max_streaming_tokens.py b/tests/entrypoints/openai/test_max_streaming_tokens.py index 1e728aa3b8d..ab950e2b5ae 100644 --- a/tests/entrypoints/openai/test_max_streaming_tokens.py +++ b/tests/entrypoints/openai/test_max_streaming_tokens.py @@ -149,7 +149,7 @@ async def test_integration_with_chat_stream_generator(self, mock_processor_class mock_processor_instance = Mock() - async def mock_process_response_chat_single(response, stream, enable_thinking, include_stop_str_in_output): + async def mock_process_response_chat_single(response, stream, include_stop_str_in_output): yield response mock_processor_instance.process_response_chat = mock_process_response_chat_single @@ -515,7 +515,7 @@ async def test_chat_stream_usage_fields(self, mock_response_processor, api_serve mock_processor_instance = Mock() - async def mock_process_response_chat(response, stream, enable_thinking, include_stop_str_in_output): + async def mock_process_response_chat(response, stream, include_stop_str_in_output): delta_msg_mock = Mock() delta_msg_mock.content = response["outputs"]["text"] if response["outputs"]["text"] == "a": diff --git a/tests/entrypoints/openai/test_response_processors.py b/tests/entrypoints/openai/test_response_processors.py index afab163b97e..34cade7cd82 100644 --- a/tests/entrypoints/openai/test_response_processors.py +++ b/tests/entrypoints/openai/test_response_processors.py @@ -48,7 +48,7 @@ async def test_text_only_mode(self): results = [ r async for r in processor.process_response_chat( - request_outputs, stream=False, enable_thinking=False, include_stop_str_in_output=False + request_outputs, stream=False, include_stop_str_in_output=False ) ] @@ -67,7 +67,7 @@ async def test_streaming_text_and_image(self): results = [ r async for r in self.processor_mm.process_response_chat( - request_outputs, stream=True, enable_thinking=False, include_stop_str_in_output=False + request_outputs, stream=True, include_stop_str_in_output=False ) ] @@ -94,7 +94,7 @@ async def test_streaming_buffer_accumulation(self): results = [ r async for r in self.processor_mm.process_response_chat( - request_outputs, stream=True, enable_thinking=False, include_stop_str_in_output=False + request_outputs, stream=True, include_stop_str_in_output=False ) ] @@ -112,7 +112,7 @@ async def test_non_streaming_accumulate_and_emit(self): results = [ r async for r in 
self.processor_mm.process_response_chat( - request_outputs, stream=False, enable_thinking=False, include_stop_str_in_output=False + request_outputs, stream=False, include_stop_str_in_output=False ) ] diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py deleted file mode 100644 index 394a23f0f4e..00000000000 --- a/tests/entrypoints/openai/test_serving_chat.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import unittest -from unittest.mock import MagicMock - -from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest -from fastdeploy.entrypoints.openai.serving_chat import OpenAIServingChat - - -class TestOpenAIServingCompletion(unittest.TestCase): - - def setUp(self): - """ - Set up the test environment by creating an instance of the OpenAIServingChat class using Mock. - """ - self.mock_engine = MagicMock() - self.chat_completion_handler = OpenAIServingChat( - self.mock_engine, - models=None, - pid=123, - ips=None, - max_waiting_time=10, - chat_template=None, - ) - - def test_enable_thinking(self): - request = ChatCompletionRequest(messages=[], chat_template_kwargs={}) - enable_thinking = self.chat_completion_handler._get_thinking_status(request) - self.assertEqual(enable_thinking, None) - - request = ChatCompletionRequest(messages=[], chat_template_kwargs={"enable_thinking": True}) - enable_thinking = self.chat_completion_handler._get_thinking_status(request) - self.assertEqual(enable_thinking, True) - - request = ChatCompletionRequest(messages=[], chat_template_kwargs={"enable_thinking": False}) - enable_thinking = self.chat_completion_handler._get_thinking_status(request) - self.assertEqual(enable_thinking, False) - - request = ChatCompletionRequest(messages=[], chat_template_kwargs={"options": {"thinking_mode": "close"}}) - enable_thinking = self.chat_completion_handler._get_thinking_status(request) - self.assertEqual(enable_thinking, False) - - request = ChatCompletionRequest(messages=[], chat_template_kwargs={"options": {"thinking_mode": "false"}}) - enable_thinking = self.chat_completion_handler._get_thinking_status(request) - self.assertEqual(enable_thinking, False) - - request = ChatCompletionRequest(messages=[], chat_template_kwargs={"options": {"thinking_mode": "open"}}) - enable_thinking = self.chat_completion_handler._get_thinking_status(request) - self.assertEqual(enable_thinking, True) - - request = ChatCompletionRequest(messages=[], chat_template_kwargs={"options": {"thinking_mode": "123"}}) - enable_thinking = self.chat_completion_handler._get_thinking_status(request) - self.assertEqual(enable_thinking, True) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/entrypoints/openai/tool_parsers/test_ernie_x1_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_ernie_x1_tool_parser.py index e818801d935..1b8b58d1e95 100644 --- 
a/tests/entrypoints/openai/tool_parsers/test_ernie_x1_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_ernie_x1_tool_parser.py @@ -52,33 +52,12 @@ def test_extract_tool_calls_complete(self): self.assertTrue(result.tools_called) self.assertEqual(result.tool_calls[0].function.name, "get_weather") - def test_extract_tool_calls_partial_arguments(self): - """Test partial extraction when arguments incomplete""" - output = '{"name": "get_weather", "arguments": {"location": "北"' - result = self.parser.extract_tool_calls(output, self.dummy_request) - self.assertFalse(result.tools_called) - self.assertEqual(result.tool_calls[0].function.name, "get_weather") - - def test_extract_tool_calls_invalid_response_before_toolcall(self): - """Test case where before is invalid""" - output = 'hello{"name": "get_weather", "arguments": {}}' - result = self.parser.extract_tool_calls(output, self.dummy_request) - self.assertFalse(result.tools_called) - self.assertIn("", result.content) - def test_extract_tool_calls_no_toolcall(self): """Test when no tool_call tags are present""" output = "no tool call here" result = self.parser.extract_tool_calls(output, self.dummy_request) self.assertFalse(result.tools_called) - def test_extract_tool_calls_invalid_json(self): - """Test tool_call with badly formatted JSON triggers fallback parser""" - output = '"name": "get_weather", "arguments": {' - result = self.parser.extract_tool_calls(output, self.dummy_request) - self.assertFalse(result.tools_called) - self.assertEqual(result.tool_calls[0].function.name, "get_weather") - def test_extract_tool_calls_exception(self): """Force exception to cover error branch""" with patch( diff --git a/tests/input/test_ernie4_5_processor.py b/tests/input/test_ernie4_5_processor.py index 8c7386fef85..8ccb4e60f9c 100644 --- a/tests/input/test_ernie4_5_processor.py +++ b/tests/input/test_ernie4_5_processor.py @@ -73,6 +73,7 @@ def extract_reasoning_content_streaming( previous_token_ids, all_token_ids, delta_token_ids, + model_status, ): """Return a simple object with reasoning_content to cover reasoning branch.""" @@ -145,6 +146,7 @@ def _make_processor(self, reasoning=False, tool=False): tool_cls = MockToolParser if tool else None proc = Ernie4_5Processor("dummy-model", reasoning_parser_obj=reasoning_cls, tool_parser_obj=tool_cls) proc._apply_default_parameters = lambda req: req + proc.model_status_dict = {"req-1": "think_start"} return proc def test_update_bad_words(self): diff --git a/tests/input/test_ernie_processor.py b/tests/input/test_ernie_processor.py index 2a9dcb23cf8..6f5fad89403 100644 --- a/tests/input/test_ernie_processor.py +++ b/tests/input/test_ernie_processor.py @@ -4,6 +4,11 @@ from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor +class MockReasoningParser: + def get_model_status(self, prompt_token_ids): + return "think_start" + + class TestErnie4_5ProcessorProcessResponseDictStreaming(unittest.TestCase): def setUp(self): # 创建 Ernie4_5Processor 实例的模拟对象 @@ -14,12 +19,13 @@ def setUp(self): # 设置必要的属性 self.processor.tokenizer = MagicMock() self.processor.tokenizer.eos_token_id = 1 - self.processor.decode_status = {} + self.processor.decode_status = {"test": []} self.processor.reasoning_end_dict = {} self.processor.tool_parser_dict = {} self.processor.generation_config = MagicMock() self.processor.eos_token_ids = [1] - self.processor.reasoning_parser = MagicMock() + self.processor.reasoning_parser = MockReasoningParser() + self.processor.model_status_dict = {"request-id_0": "think_start", "test": 
"think_start"} # 模拟 ids2tokens 方法 def mock_ids2tokens(token_ids, task_id): @@ -56,7 +62,7 @@ def mock_apply_default_parameters(request): def test_process_response_dict_streaming_normal_case(self): """测试正常情况下的流式响应处理""" # 准备输入 - response_dict = {"finished": False, "request_id": "req1", "outputs": {"token_ids": [4, 5]}} + response_dict = {"finished": False, "request_id": "test", "outputs": {"token_ids": [4, 5]}} kwargs = {"enable_thinking": True} # 调用方法 @@ -67,6 +73,7 @@ def test_process_response_dict_streaming_normal_case(self): def test_process_request_dict(self): request_dict = { + "request_id": "123", "messages": [{"role": "user", "content": "Hello!"}], "chat_template_kwargs": {"chat_template": "Hello!"}, "eos_token_ids": [1], diff --git a/tests/input/test_ernie_vl_processor.py b/tests/input/test_ernie_vl_processor.py index 92d24d5b96f..facc8c30cfa 100644 --- a/tests/input/test_ernie_vl_processor.py +++ b/tests/input/test_ernie_vl_processor.py @@ -1,12 +1,19 @@ import unittest from unittest.mock import MagicMock, patch +import numpy as np + from fastdeploy.input.ernie4_5_vl_processor import Ernie4_5_VLProcessor -class TestErnie4_5_vl_ProcessorProcessResponseDictStreaming(unittest.TestCase): +class MockReasoningParser: + def get_model_status(self, prompt_token_ids): + return "think_start" + + +class TestErnie4_5VLProcessorProcessResponseDictStreaming(unittest.TestCase): def setUp(self): - # 创建 Ernie4_5Processor 实例的模拟对象 + # 创建 Ernie4_5_VLProcessor 实例的模拟对象 with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None) as mock_init: self.processor = Ernie4_5_VLProcessor("model_path") mock_init.side_effect = lambda *args, **kwargs: print(f"__init__ called with {args}, {kwargs}") @@ -14,38 +21,41 @@ def setUp(self): # 设置必要的属性 self.processor.tokenizer = MagicMock() self.processor.tokenizer.eos_token_id = 1 - self.processor.decode_status = {} + self.processor.decode_status = {"test": []} self.processor.reasoning_end_dict = {} self.processor.tool_parser_dict = {} self.processor.generation_config = MagicMock() self.processor.eos_token_ids = [1] - self.processor.reasoning_parser = MagicMock() - self.processor._check_mm_limits = MagicMock() + self.processor.reasoning_parser = MockReasoningParser() + self.processor.model_status_dict = {"test": "think_start"} self.processor.ernie4_5_processor = MagicMock() - self.processor.pack_outputs = MagicMock() # 模拟 ids2tokens 方法 def mock_ids2tokens(token_ids, task_id): - self.processor.decode_status[task_id] = "mock_decode_status" return "delta_text", [2, 3], "previous_texts" self.processor.ids2tokens = mock_ids2tokens - def mock_messages2ids(request, **kwargs): - if "chat_template" in kwargs: - return [1] - else: - return [0] + def mock_request2ids(request, **kwargs): + return {"input_ids": np.array([1, 2, 3]), "prompt_token_ids": [0]} + + def mock_check_mm_limits(item): + pass def mock_apply_default_parameters(request): return request + def mock_pack_outputs(outputs): + return outputs + self.processor._apply_default_parameters = mock_apply_default_parameters + self.processor._check_mm_limits = mock_check_mm_limits + self.processor.ernie4_5_processor.request2ids = mock_request2ids + self.processor.pack_outputs = mock_pack_outputs # 模拟推理解析器 self.mock_reasoning_parser = MagicMock() - self.mock_reasoning_parser.__class__.__name__ = "ErnieX1ReasoningParser" - # self.mock_reasoning_parser.extract_reasoning_content_streaming.return_value = ("reasoning", "text") + self.mock_reasoning_parser.extract_reasoning_content_streaming.return_value = None 
self.processor.reasoning_parser = self.mock_reasoning_parser # 模拟工具解析器 @@ -55,61 +65,29 @@ def mock_apply_default_parameters(request): self.mock_tool_parser_obj.return_value = self.mock_tool_parser self.processor.tool_parser_obj = self.mock_tool_parser_obj - def test_process_request_dict_with_options(self): - request_dict = { - "messages": [{"role": "user", "content": "Hello"}], - "prompt_token_ids": [1, 1, 1], - } - self.processor.process_request_dict(request_dict, 100) - self.assertEqual(request_dict["enable_thinking"], True) - - request_dict = { - "messages": [{"role": "user", "content": "Hello"}], - "chat_template_kwargs": {"enable_thinking": True}, - "prompt_token_ids": [1, 1, 1], - } - self.processor.process_request_dict(request_dict, 100) - self.assertEqual(request_dict["enable_thinking"], True) - - request_dict = { - "messages": [{"role": "user", "content": "Hello"}], - "chat_template_kwargs": {"enable_thinking": False}, - "prompt_token_ids": [1, 1, 1], - } - self.processor.process_request_dict(request_dict, 100) - self.assertEqual(request_dict["enable_thinking"], False) - - request_dict = { - "messages": [{"role": "user", "content": "Hello"}], - "chat_template_kwargs": {"options": {"thinking_mode": "open"}}, - "prompt_token_ids": [1, 1, 1], - } - self.processor.process_request_dict(request_dict, 100) - self.assertEqual(request_dict["enable_thinking"], True) + def test_process_response_dict_streaming_normal_case(self): + """测试正常情况下的流式响应处理""" + # 准备输入 + response_dict = {"finished": False, "request_id": "test", "outputs": {"token_ids": [4, 5]}} + kwargs = {"enable_thinking": True} - request_dict = { - "messages": [{"role": "user", "content": "Hello"}], - "chat_template_kwargs": {"options": {"thinking_mode": "close"}}, - "prompt_token_ids": [1, 1, 1], - } - self.processor.process_request_dict(request_dict, 100) - self.assertEqual(request_dict["enable_thinking"], False) + # 调用方法 + result = self.processor.process_response_dict_streaming(response_dict, **kwargs) - request_dict = { - "messages": [{"role": "user", "content": "Hello"}], - "chat_template_kwargs": {"options": {"thinking_mode": "false"}}, - "prompt_token_ids": [1, 1, 1], - } - self.processor.process_request_dict(request_dict, 100) - self.assertEqual(request_dict["enable_thinking"], False) + # 验证结果 + self.assertEqual(result["outputs"]["completion_tokens"], "delta_text") + def test_process_request_dict(self): request_dict = { - "messages": [{"role": "user", "content": "Hello"}], - "chat_template_kwargs": {"options": {"thinking_mode": "123"}}, - "prompt_token_ids": [1, 1, 1], + "request_id": "123", + "messages": [{"role": "user", "content": "Hello!"}], + "chat_template_kwargs": {"chat_template": "Hello!"}, + "eos_token_ids": [1], + "temperature": 1, + "top_p": 1, } - self.processor.process_request_dict(request_dict, 100) - self.assertEqual(request_dict["enable_thinking"], True) + result = self.processor.process_request_dict(request_dict, 100) + self.assertEqual(result["prompt_token_ids"], [1, 2, 3]) if __name__ == "__main__": diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py index 794d81895d7..b22b2d5a0ad 100644 --- a/tests/input/test_text_processor.py +++ b/tests/input/test_text_processor.py @@ -20,6 +20,7 @@ def setUp(self): self.processor.tool_parser_dict = {} self.processor.generation_config = MagicMock() self.processor.eos_token_ids = [1] + self.processor.model_status_dict = {"request-id_0": "think_start"} self.processor.reasoning_parser = MagicMock() def mock_messages2ids(request, 
**kwargs): @@ -50,6 +51,7 @@ def test_process_request(self): def test_process_request_dict(self): request_dict = { + "request_id": "123", "messages": [{"role": "user", "content": "Hello!"}], "chat_template_kwargs": {"chat_template": "Hello!"}, "eos_token_ids": [1], diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py new file mode 100644 index 00000000000..cde56601608 --- /dev/null +++ b/tests/reasoning/test_qwen3_reasoning_parser.py @@ -0,0 +1,197 @@ +import unittest + +from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest +from fastdeploy.reasoning.qwen3_reasoning_parsers import Qwen3ReasoningParser + + +class MockTokenizer: + """Minimal tokenizer with vocab for testing.""" + + def __init__(self): + self.vocab = { + "": 100, + "": 101, + } + + def get_vocab(self): + """Return vocab dict for testing.""" + return self.vocab + + +class MissingTokenTokenizer: + def __init__(self): + self.vocab = { + "": 100, + } + + def get_vocab(self): + """Return vocab dict for testing.""" + return self.vocab + + +class TestQwen3ReasoningParser(unittest.TestCase): + def setUp(self): + self.parser = Qwen3ReasoningParser(MockTokenizer()) + self.request = ChatCompletionRequest(model="test", messages=[{"role": "user", "content": "test message"}]) + self.tokenizer = MockTokenizer() + + def test_missing_token(self): + with self.assertRaises(RuntimeError) as context: + Qwen3ReasoningParser(MissingTokenTokenizer()) + exception_message = str(context.exception) + expected_message_part = "Qwen3 reasoning parser could not find the following token ids" + self.assertIn(expected_message_part, exception_message) + + def test_get_model_status(self): + status = self.parser.get_model_status([1, 2, 100]) + self.assertEqual(status, "think_start") + status = self.parser.get_model_status([1, 2, 101]) + self.assertEqual(status, "think_end") + status = self.parser.get_model_status([1]) + self.assertEqual(status, "think_start") + + def test_streaming_thinking_content(self): + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="a", + delta_text="a", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[200], + model_status="think_start", + ) + self.assertEqual(msg.reasoning_content, "a") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="ab", + delta_text="ab", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[99, 101, 102], + model_status="think_start", + ) + self.assertEqual(msg.reasoning_content, "a") + self.assertEqual(msg.content, "b") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="a", + current_text="ab", + delta_text="b", + previous_token_ids=[1, 101], + current_token_ids=[], + delta_token_ids=[102], + model_status="think_start", + ) + self.assertEqual(msg.content, "b") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="a", + delta_text="a", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + model_status="think_start", + ) + self.assertEqual(msg.reasoning_content, "a") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="a", + delta_text="a", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[200], + model_status="think_end", + ) + self.assertEqual(msg.content, "a") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hello", + current_text="hellohi", + 
delta_text="hi", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[101, 200], + model_status="think_start", + ) + self.assertEqual(msg.content, "hi") + self.assertEqual(msg.reasoning_content, "") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hello", + current_text="hellohi", + delta_text="hi", + previous_token_ids=[100], + current_token_ids=[], + delta_token_ids=[], + model_status="think_start", + ) + self.assertEqual(msg.content, None) + self.assertEqual(msg.reasoning_content, "hi") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hello", + current_text="hellohi", + delta_text="hi", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 200], + model_status="think_start", + ) + self.assertEqual(msg.content, "") + self.assertEqual(msg.reasoning_content, "hi") + + def test_none_streaming_thinking_content(self): + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="a", + request={}, + model_status="think_start", + ) + self.assertEqual(reasoning_content, None) + self.assertEqual(content, "a") + + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="ab", + request={}, + model_status="think_start", + ) + self.assertEqual(reasoning_content, "a") + self.assertEqual(content, "b") + + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="a", + request={}, + model_status="think_end", + ) + self.assertEqual(reasoning_content, None) + self.assertEqual(content, "a") + + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="a", + request={}, + model_status="think_start", + ) + self.assertEqual(reasoning_content, None) + self.assertEqual(content, "a") + + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="ab", + request={}, + model_status="think_start", + ) + self.assertEqual(reasoning_content, "a") + self.assertEqual(content, "b") + + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="b", + request={}, + model_status="think_start", + ) + self.assertEqual(reasoning_content, "") + self.assertEqual(content, "b") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/reasoning/test_reasoning_parser.py b/tests/reasoning/test_reasoning_parser.py index e6deded445d..b4899e46bf6 100644 --- a/tests/reasoning/test_reasoning_parser.py +++ b/tests/reasoning/test_reasoning_parser.py @@ -31,10 +31,25 @@ class DummyTokenizer: def __init__(self): self.vocab = { "": 100, - "": 101, - "": 102, - "": 103, - "": 104, + "": 101, + "": 102, + "": 103, + "": 104, + "": 105, + } + + def get_vocab(self): + """Return vocab dict for testing.""" + return self.vocab + + +class MissingTokenTokenizer: + def __init__(self): + self.vocab = { + "": 100, + "": 101, + "": 102, + "": 103, } def get_vocab(self): @@ -132,6 +147,17 @@ def setUp(self): self.request = ChatCompletionRequest(model="test", messages=[{"role": "user", "content": "test message"}]) self.tokenizer = DummyTokenizer() + def test_missing_token(self): + with self.assertRaises(RuntimeError) as context: + ErnieX1ReasoningParser(MissingTokenTokenizer()) + exception_message = str(context.exception) + expected_message_part = "ernie x1 reasoning parser could not find the following token ids" + self.assertIn(expected_message_part, exception_message) + + def test_get_model_status(self): + model_status = self.parser.get_model_status([88, 99, 104]) + self.assertEqual(model_status, 
"response_start") + # ---- Streaming parsing ---- def test_streaming_thinking_content(self): msg = self.parser.extract_reasoning_content_streaming( @@ -141,6 +167,7 @@ def test_streaming_thinking_content(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[200], + model_status="think_start", ) self.assertEqual(msg.reasoning_content, "a") @@ -152,6 +179,7 @@ def test_streaming_thinking_newline_preserved(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[201], + model_status="think_start", ) self.assertEqual(msg.reasoning_content, "\n") @@ -163,6 +191,7 @@ def test_streaming_thinking_end_tag(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[self.parser.think_end_token_id], + model_status="think_start", ) self.assertIsNone(msg) @@ -174,6 +203,7 @@ def test_streaming_response_content(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[202], + model_status="think_start", ) self.assertEqual(msg.content, "h") @@ -185,6 +215,7 @@ def test_streaming_response_newline_preserved(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[203], + model_status="think_start", ) self.assertEqual(msg.content, "\n") @@ -197,6 +228,7 @@ def test_streaming_response_ignore_tags(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[self.parser.vocab[""]], + model_status="think_start", ) ) @@ -207,6 +239,7 @@ def test_streaming_response_ignore_tags(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[204], + model_status="think_start", ) self.assertIsInstance(msg, DeltaMessage) self.assertEqual(msg.content, "\n") @@ -219,9 +252,82 @@ def test_streaming_response_ignore_tags(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[self.parser.vocab[""]], + model_status="think_start", ) ) + def test_extract_reasoning_content_streaming(self): + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hello", + current_text="hello", + delta_text="", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 200], + model_status="think_start", + ) + self.assertEqual(msg.content, "") + self.assertEqual(msg.reasoning_content, "") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hello", + current_text="hellohi", + delta_text="hi", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 200], + model_status="think_start", + ) + self.assertEqual(msg.content, "hi") + self.assertEqual(msg.reasoning_content, "") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="hellohi", + delta_text="hellohi", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 200], + model_status="think_start", + ) + self.assertEqual(msg.content, "hi") + self.assertEqual(msg.reasoning_content, "hello") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hello", + current_text="hellohi", + delta_text="hi", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 200], + model_status="think_end", + ) + self.assertEqual(msg.content, "hi") + self.assertEqual(msg.reasoning_content, None) + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="hello", + current_text="hellohi", + delta_text="hi", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 200], + model_status="response_start", + ) + self.assertEqual(msg.content, "hi") + self.assertEqual(msg.reasoning_content, None) + + msg = 
self.parser.extract_reasoning_content_streaming( + previous_text="hellohi", + current_text="hellohiend", + delta_text="end", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 200], + model_status="response_start", + ) + self.assertEqual(msg, None) + def test_streaming_tool_call(self): msg = self.parser.extract_reasoning_content_streaming( previous_text="", @@ -230,40 +336,48 @@ def test_streaming_tool_call(self): previous_token_ids=[], current_token_ids=[], delta_token_ids=[self.parser.vocab[""]], + model_status="think_start", ) self.assertIsNone(msg) # ---- Batch parsing ---- def test_batch_reasoning_and_response(self): text = "abc\n\nhello\nworld" - reasoning, response = self.parser.extract_reasoning_content(text, self.request) + reasoning, response = self.parser.extract_reasoning_content(text, self.request, "think_start") self.assertEqual(reasoning, "abc\n") self.assertEqual(response, "hello\nworld") def test_batch_reasoning_and_tool_call(self): text = "abccall_here" - reasoning, response = self.parser.extract_reasoning_content(text, self.request) + reasoning, response = self.parser.extract_reasoning_content(text, self.request, "think_start") self.assertEqual(reasoning, "abc") self.assertEqual(response, "") def test_batch_no_thinking_tag(self): text = "no_thinking_here" - reasoning, response = self.parser.extract_reasoning_content(text, self.request) + reasoning, response = self.parser.extract_reasoning_content(text, self.request, "think_start") self.assertEqual(reasoning, "no_thinking_here") self.assertEqual(response, "") def test_batch_response_without_end_tag(self): text = "abcpartial response" - reasoning, response = self.parser.extract_reasoning_content(text, self.request) + reasoning, response = self.parser.extract_reasoning_content(text, self.request, "think_start") self.assertEqual(reasoning, "abc") self.assertEqual(response, "partial response") def test_batch_preserve_all_newlines(self): text = "abc\n\nline1\nline2\n" - reasoning, response = self.parser.extract_reasoning_content(text, self.request) + reasoning, response = self.parser.extract_reasoning_content(text, self.request, "think_start") self.assertEqual(reasoning, "abc\n") self.assertEqual(response, "line1\nline2\n") + def test_extract_reasoning_content(self): + reasoning_content, response_content = self.parser.extract_reasoning_content( + model_output="hello", request=self.request, model_status="response_start" + ) + self.assertEqual(reasoning_content, "") + self.assertEqual(response_content, "hello") + class TestErnie45VLThinkingReasoningParser(unittest.TestCase): def setUp(self): @@ -281,6 +395,7 @@ def test_streaming_non_reasoning(self): previous_token_ids=[], current_token_ids=[200], delta_token_ids=[200], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertEqual(result.reasoning_content, "a") @@ -294,6 +409,7 @@ def test_streaming_with_reasoning(self): previous_token_ids=[200, 201], current_token_ids=[200, 201, 100], delta_token_ids=[100], + model_status="think_start", ) self.assertIsNone(result) @@ -305,6 +421,7 @@ def test_streaming_with_reasoning_and_content(self): previous_token_ids=[200, 201], current_token_ids=[200, 201, 100, 300, 400], delta_token_ids=[100, 300, 400], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertIsNone(result.reasoning_content) @@ -318,6 +435,7 @@ def test_streaming_with_reasoning_new_line(self): previous_token_ids=[200, 201, 202], current_token_ids=[200, 201, 202, 100], delta_token_ids=[100], 
+ model_status="think_start", ) self.assertIsNone(result) @@ -329,9 +447,10 @@ def test_streaming_with_reasoning_and_tool(self): previous_token_ids=[200, 201, 202], current_token_ids=[200, 201, 202, 100, 200, 101], delta_token_ids=[100, 200, 101], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) - self.assertEqual(result.reasoning_content, "") + self.assertEqual(result.reasoning_content, None) def test_streaming_with_reasoning_and_illegal_tool(self): result = self.parser.extract_reasoning_content_streaming( @@ -341,6 +460,7 @@ def test_streaming_with_reasoning_and_illegal_tool(self): previous_token_ids=[200, 201, 202], current_token_ids=[200, 201, 202, 100, 200, 101], delta_token_ids=[109, 200, 101], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertEqual(result.content, "\n\nhello") @@ -353,6 +473,7 @@ def test_streaming_with_reasoning_no_tool(self): previous_token_ids=[200, 201, 202], current_token_ids=[200, 201, 202, 100, 200, 110], delta_token_ids=[100, 200, 110], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertEqual(result.reasoning_content, "hello") @@ -366,6 +487,7 @@ def test_streaming_reasoning_previous_no_tool(self): previous_token_ids=[100], current_token_ids=[100, 110, 111], delta_token_ids=[110, 111], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertIsNone(result.reasoning_content) @@ -379,52 +501,127 @@ def test_streaming_no_reasoning_previous_tool(self): previous_token_ids=[101], current_token_ids=[101, 110], delta_token_ids=[110], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertEqual(result.reasoning_content, "hello") + def test_think_end_status_streaming(self): + result = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="hello", + delta_text="hello", + previous_token_ids=[101], + current_token_ids=[101, 110], + delta_token_ids=[110], + model_status="think_end", + ) + self.assertIs(result, None) + + result = self.parser.extract_reasoning_content_streaming( + previous_text="hello, ", + current_text="hello, hi", + delta_text="hi", + previous_token_ids=[101], + current_token_ids=[101, 110], + delta_token_ids=[110], + model_status="think_end", + ) + self.assertIsInstance(result, DeltaMessage) + self.assertEqual(result.content, "hi") + + def test_other_status_streaming(self): + result = self.parser.extract_reasoning_content_streaming( + previous_text="hello, ", + current_text="hello, hi", + delta_text="hi", + previous_token_ids=[101], + current_token_ids=[101, 110], + delta_token_ids=[110], + model_status="tool_call_start", + ) + self.assertIs(result, None) + def test_batch_no_think_end(self): reasoning, content = self.parser.extract_reasoning_content( - model_output="direct response", request=self.test_request + model_output="direct response", request=self.test_request, model_status="think_start" ) self.assertEqual(reasoning, "direct response") self.assertEqual(content, "") def test_batch_no_think_end_with_tool(self): reasoning, content = self.parser.extract_reasoning_content( - model_output="direct responseabc", request=self.test_request + model_output="direct responseabc", request=self.test_request, model_status="think_start" ) self.assertEqual(reasoning, "direct responseabc") self.assertEqual(content, "") def test_batch_think_end_normal_content(self): reasoning, content = self.parser.extract_reasoning_content( - model_output="reasoning\nresponse", 
request=self.test_request + model_output="reasoning\nresponse", request=self.test_request, model_status="think_start" ) self.assertEqual(reasoning, "reasoning") self.assertEqual(content, "\nresponse") def test_batch_think_end_with_tool(self): reasoning, content = self.parser.extract_reasoning_content( - model_output="reasoning\ntool params", request=self.test_request + model_output="reasoning\ntool params", + request=self.test_request, + model_status="think_start", ) self.assertEqual(reasoning, "reasoning") self.assertEqual(content, "") def test_batch_think_end_with_illegal_tool(self): reasoning, content = self.parser.extract_reasoning_content( - model_output="reasoning\nABC\ntool params", request=self.test_request + model_output="reasoning\nABC\ntool params", + request=self.test_request, + model_status="think_start", ) self.assertEqual(reasoning, "reasoning") self.assertEqual(content, "\nABC\ntool params") def test_batch_think_end_content_with_newline(self): reasoning, content = self.parser.extract_reasoning_content( - model_output="reasoning\n\n actual response", request=self.test_request + model_output="reasoning\n\n actual response", + request=self.test_request, + model_status="think_start", ) self.assertEqual(reasoning, "reasoning") self.assertEqual(content, "\n\n actual response") + def test_think_end_status_non_streaming(self): + reasoning, content = self.parser.extract_reasoning_content( + model_output="response", request=self.test_request, model_status="think_end" + ) + self.assertEqual(reasoning, "") + self.assertEqual(content, "response") + + reasoning, content = self.parser.extract_reasoning_content( + model_output="response", request=self.test_request, model_status="think_end" + ) + self.assertEqual(reasoning, "") + self.assertEqual(content, "") + + reasoning, content = self.parser.extract_reasoning_content( + model_output="\n 1response", request=self.test_request, model_status="think_end" + ) + self.assertEqual(reasoning, "") + self.assertEqual(content, "\n 1response") + + def test_other_status_non_streaming(self): + reasoning, content = self.parser.extract_reasoning_content( + model_output="response", request=self.test_request, model_status="tool_call_start" + ) + self.assertEqual(reasoning, "") + self.assertEqual(content, "") + + reasoning, content = self.parser.extract_reasoning_content( + model_output="response", request=self.test_request, model_status="tool_call_end" + ) + self.assertEqual(reasoning, "") + self.assertEqual(content, "") + class TestErnieVLReasoningParser(unittest.TestCase): def setUp(self): @@ -442,6 +639,7 @@ def test_extract_reasoning_content_stream(self): previous_token_ids=[200, 201, 202], current_token_ids=[200, 201, 202, 100, 110, 120, 130], delta_token_ids=[100, 110, 120, 130], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertEqual(result.reasoning_content, "") @@ -455,6 +653,7 @@ def test_extract_reasoning_content_stream_think_in_previous(self): previous_token_ids=[200, 201, 202, 100], current_token_ids=[200, 201, 202, 100, 110, 120, 130], delta_token_ids=[110, 120, 130], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertIsNone(result.reasoning_content) @@ -468,6 +667,7 @@ def test_extract_reasoning_content_stream_no_think_token(self): previous_token_ids=[200, 201, 202], current_token_ids=[200, 201, 202, 110, 120, 130], delta_token_ids=[110, 120, 130], + model_status="think_start", ) self.assertIsInstance(result, DeltaMessage) self.assertIsNone(result.content) @@ -475,7 
+675,7 @@ def test_extract_reasoning_content_stream_no_think_token(self): def test_extract_reasoning_content(self): reasoning, content = self.parser.extract_reasoning_content( - model_output="reasoning\nactual response", request=self.test_request + model_output="reasoning\nactual response", request=self.test_request, model_status="think_start" ) self.assertEqual(reasoning, "reasoning") self.assertEqual(content, "\nactual response") diff --git a/tests/reasoning/test_vl_reasoning_parser.py b/tests/reasoning/test_vl_reasoning_parser.py new file mode 100644 index 00000000000..f9a36dd952e --- /dev/null +++ b/tests/reasoning/test_vl_reasoning_parser.py @@ -0,0 +1,135 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import unittest + +from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest +from fastdeploy.reasoning.ernie_vl_reasoning_parsers import ErnieVLReasoningParser + + +class MockTokenizer: + """Minimal tokenizer with vocab for testing.""" + + def __init__(self): + self.vocab = { + "": 100, + "": 101, + } + + def get_vocab(self): + """Return vocab dict for testing.""" + return self.vocab + + +class TestErnieVLReasoningParser(unittest.TestCase): + def setUp(self): + self.parser = ErnieVLReasoningParser(MockTokenizer()) + self.request = ChatCompletionRequest(model="test", messages=[{"role": "user", "content": "test message"}]) + self.tokenizer = MockTokenizer() + + def test_get_model_status(self): + status = self.parser.get_model_status([1, 2, 100]) + self.assertEqual(status, "think_start") + status = self.parser.get_model_status([1, 2, 101]) + self.assertEqual(status, "think_end") + status = self.parser.get_model_status([1]) + self.assertEqual(status, "think_start") + + def test_streaming_thinking_content(self): + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="a", + delta_text="a", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[200], + model_status="think_start", + ) + self.assertEqual(msg.reasoning_content, "a") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="ab", + delta_text="ab", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[100, 101, 102], + model_status="think_start", + ) + self.assertEqual(msg.reasoning_content, "a") + self.assertEqual(msg.content, "b") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="a", + current_text="ab", + delta_text="b", + previous_token_ids=[1, 101], + current_token_ids=[], + delta_token_ids=[102], + model_status="think_start", + ) + self.assertEqual(msg.content, "b") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + current_text="a", + delta_text="a", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + model_status="think_start", + ) + self.assertEqual(msg.reasoning_content, "a") + + msg = self.parser.extract_reasoning_content_streaming( + previous_text="", + 
current_text="a", + delta_text="a", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[200], + model_status="think_end", + ) + self.assertEqual(msg.content, "a") + + def test_none_streaming_thinking_content(self): + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="a", + request={}, + model_status="think_start", + ) + self.assertEqual(reasoning_content, "") + self.assertEqual(content, "a") + + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="ab", + request={}, + model_status="think_start", + ) + self.assertEqual(reasoning_content, "a") + self.assertEqual(content, "b") + + reasoning_content, content = self.parser.extract_reasoning_content( + model_output="a", + request={}, + model_status="think_end", + ) + self.assertEqual(reasoning_content, "") + self.assertEqual(content, "a") + + +if __name__ == "__main__": + unittest.main()