Changes from all commits (40 commits)
234ef92  add model status in vl (luukunn, Sep 23, 2025)
671a4dc  add x1 parser (luukunn, Sep 24, 2025)
8bbe39d  add model_status (luukunn, Sep 24, 2025)
d087afb  fix parser (luukunn, Sep 25, 2025)
2f6f063  fix parser (luukunn, Sep 25, 2025)
41f1418  fix parser (luukunn, Sep 25, 2025)
300f446  fix parser (luukunn, Sep 25, 2025)
3b93672  Revert "fix parser" (luukunn, Sep 25, 2025)
dae8419  fix parser (luukunn, Sep 25, 2025)
e49676c  fix (luukunn, Sep 26, 2025)
2c92f6f  fix (luukunn, Sep 26, 2025)
c433e05  fix (luukunn, Sep 26, 2025)
bfdec9f  fix (luukunn, Sep 26, 2025)
bd192b2  fix parser (luukunn, Sep 28, 2025)
ae24353  fix parser (luukunn, Sep 28, 2025)
dd30110  fix unit test (luukunn, Sep 28, 2025)
31d639d  fix unit test (luukunn, Sep 29, 2025)
13ab4d4  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (luukunn, Sep 29, 2025)
46e3c13  add unit test (luukunn, Sep 29, 2025)
bc4a71e  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (luukunn, Oct 14, 2025)
ede57ff  pull (luukunn, Oct 16, 2025)
d159f27  fix (luukunn, Oct 20, 2025)
cff5840  fix (luukunn, Oct 20, 2025)
21a8d59  fix (luukunn, Oct 20, 2025)
4a2908b  add unit test (luukunn, Oct 20, 2025)
fd2a343  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (luukunn, Oct 20, 2025)
59aaa2c  fix unit test (luukunn, Oct 21, 2025)
0e2019d  add unit test (luukunn, Oct 21, 2025)
f0def03  add unit test (luukunn, Oct 21, 2025)
ea2d987  fix unit test (luukunn, Oct 22, 2025)
b8794cb  fix unit test (luukunn, Oct 22, 2025)
e32c352  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (luukunn, Oct 22, 2025)
37b320e  fix bug (luukunn, Oct 22, 2025)
34ac21a  fix unit test (luukunn, Oct 22, 2025)
1cb6205  x1 tool parser (luukunn, Oct 22, 2025)
f19c727  update (luukunn, Nov 3, 2025)
4ef4df1  fix unit test (luukunn, Nov 3, 2025)
13ba41f  Resolved merge conflicts (luukunn, Nov 20, 2025)
7c17812  fix unit test (luukunn, Nov 20, 2025)
d3171a2  fix unit test (luukunn, Nov 20, 2025)
6 changes: 1 addition & 5 deletions fastdeploy/entrypoints/openai/response_processors.py
@@ -68,13 +68,12 @@ def accumulate_token_ids(self, request_output):
else:
self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output})

async def process_response_chat(self, request_outputs, stream, enable_thinking, include_stop_str_in_output):
async def process_response_chat(self, request_outputs, stream, include_stop_str_in_output):
"""
Process a list of responses into a generator that yields each processed response as it's generated.
Args:
request_outputs: The list of outputs to be processed.
stream: Whether or not to stream the output.
enable_thinking: Whether or not to show thinking messages.
include_stop_str_in_output: Whether or not to include stop strings in the output.
"""
for request_output in request_outputs:
@@ -83,7 +82,6 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
yield self.data_processor.process_response_dict(
response_dict=request_output,
stream=stream,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
elif stream:
@@ -111,7 +109,6 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
self.data_processor.process_response_dict(
response_dict=request_output,
stream=stream,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
text = {"type": "text", "text": request_output["outputs"]["text"]}
@@ -132,7 +129,6 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
self.data_processor.process_response_dict(
response_dict=part["request_output"],
stream=False,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
text = {"type": "text", "text": part["request_output"]["outputs"]["text"]}
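The hunks above drop `enable_thinking` from `process_response_chat`, so callers pass only `stream` and `include_stop_str_in_output`; whether reasoning content is extracted is now decided inside the data processors from the prompt token ids. A minimal sketch of the new call shape (the processor class below is a stand-in for illustration, not the FastDeploy implementation):

```python
import asyncio


class StubResponseProcessor:
    """Stand-in mimicking the trimmed process_response_chat signature."""

    async def process_response_chat(self, request_outputs, stream, include_stop_str_in_output):
        # No enable_thinking argument anymore; the data processor resolves it
        # from the request's prompt token ids before generation starts.
        for request_output in request_outputs:
            yield {"stream": stream, "include_stop": include_stop_str_in_output, "output": request_output}


async def main():
    processor = StubResponseProcessor()
    outputs = [{"outputs": {"text": "hello"}}]
    async for chunk in processor.process_response_chat(outputs, stream=True, include_stop_str_in_output=False):
        print(chunk)


asyncio.run(main())
```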
22 changes: 0 additions & 22 deletions fastdeploy/entrypoints/openai/serving_chat.py
@@ -210,8 +210,6 @@ async def chat_completion_stream_generator(

max_streaming_response_tokens = max(1, max_streaming_response_tokens)

enable_thinking = self._get_thinking_status(request)

include_stop_str_in_output = request.include_stop_str_in_output

stream_options = request.stream_options
@@ -267,7 +265,6 @@ async def chat_completion_stream_generator(
generator = response_processor.process_response_chat(
response,
stream=True,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)

@@ -469,7 +466,6 @@ async def chat_completion_full_generator(
"""
created_time = int(time.time())
num_choices = 1 if request.n is None else request.n
enable_thinking = self._get_thinking_status(request)

include_stop_str_in_output = request.include_stop_str_in_output
try:
@@ -523,7 +519,6 @@ async def chat_completion_full_generator(
generator = response_processor.process_response_chat(
response,
stream=False,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
async for data in generator:
@@ -757,20 +752,3 @@ def _build_logprobs_response(
error_msg = f"Error in _build_logprobs_response: {e}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
return None

def _get_thinking_status(self, request: ChatCompletionRequest) -> bool:
"""
Get the thinking status from the request.
"""
enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
if enable_thinking is None:
enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
options = request.chat_template_kwargs.get("options") if request.chat_template_kwargs else None
if options:
thinking_mode = options.get("thinking_mode")
if thinking_mode:
if thinking_mode == "close" or thinking_mode == "false":
enable_thinking = False
else:
enable_thinking = True
return enable_thinking
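With `_get_thinking_status` deleted, the serving layer no longer inspects `chat_template_kwargs`, `metadata`, or the `options.thinking_mode` field. For reference, the retired resolution logic reduces to the following standalone function (reconstructed from the removed lines above; the dict arguments stand in for the request fields):

```python
def get_thinking_status(chat_template_kwargs=None, metadata=None):
    """Replicates the deleted _get_thinking_status resolution order."""
    enable_thinking = (chat_template_kwargs or {}).get("enable_thinking")
    if enable_thinking is None:
        enable_thinking = (metadata or {}).get("enable_thinking")
    options = (chat_template_kwargs or {}).get("options")
    if options:
        thinking_mode = options.get("thinking_mode")
        if thinking_mode:
            # "close" / "false" disable thinking, any other value enables it
            enable_thinking = thinking_mode not in ("close", "false")
    return enable_thinking


print(get_thinking_status({"options": {"thinking_mode": "close"}}))  # False
print(get_thinking_status(metadata={"enable_thinking": True}))       # True
```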
52 changes: 35 additions & 17 deletions fastdeploy/input/ernie4_5_processor.py
@@ -60,6 +60,7 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob
self.decode_status = dict()
self.tool_parser_dict = dict()
self.thinking_parser_dict = dict()
self.model_status_dict = dict()
self._load_tokenizer()
data_processor_logger.info(
f"tokenizer information: bos_token is {self.tokenizer.bos_token} \
@@ -151,8 +152,13 @@ def process_request(self, request, max_model_len=None, **kwargs):
request.set("temperature", 1)
if request.get("top_p") < _SAMPLING_EPS:
request.set("top_p", _SAMPLING_EPS)
if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser":
request.enable_thinking = True
if self.reasoning_parser:
real_req_id = request.request_id.split("_")[0]
n = request.get("n", 1)
model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
for idx in range(n):
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
request.enable_thinking = model_status == "think_start"

data_processor_logger.info(f"Processed request: {request}")
return request
@@ -229,9 +235,13 @@ def process_request_dict(self, request, max_model_len=None):
request["temperature"] = 1
if request.get("top_p") < _SAMPLING_EPS:
request["top_p"] = _SAMPLING_EPS
if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser":
request["enable_thinking"] = True

if self.reasoning_parser:
real_req_id = request["request_id"].split("_")[0]
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
n = request.get("n", 1)
for idx in range(n):
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
request["enable_thinking"] = model_status == "think_start"
data_processor_logger.info(f"Processed request dict: {request}")
return request

@@ -253,7 +263,11 @@ def process_response(self, response_dict, **kwargs):
token_ids = token_ids[:-1]
full_text = self.tokenizer.decode(token_ids)
if self.reasoning_parser:
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
full_text,
response_dict,
self.model_status_dict[req_id],
)
response_dict.outputs.text = text
response_dict.outputs.reasoning_content = reasoning_content
else:
@@ -264,6 +278,8 @@ def process_response(self, response_dict, **kwargs):
if tool_call_info.tools_called:
response_dict.outputs.tool_calls = tool_call_info.tool_calls
response_dict.outputs.text = tool_call_info.content
if req_id in self.model_status_dict:
del self.model_status_dict[req_id]
data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")
if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "":
return None
@@ -294,7 +310,6 @@ def process_response_dict_normal(self, response_dict, **kwargs):
Returns:
Dict: response contain text fields
"""
enable_thinking = kwargs.get("enable_thinking")
token_ids = response_dict["outputs"]["token_ids"]
is_end = response_dict["finished"]
req_id = response_dict["request_id"]
@@ -304,16 +319,17 @@ def process_response_dict_normal(self, response_dict, **kwargs):
delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
if is_end:
full_text = previous_texts + delta_text
if self.reasoning_parser and (
enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
):
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
response_dict["outputs"]["text"] = full_text
if self.reasoning_parser:
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
full_text,
response_dict,
self.model_status_dict[req_id],
)
response_dict["outputs"]["text"] = text
response_dict["outputs"]["reasoning_content"] = reasoning_content
reasoning_tokens = self.tokenizer.tokenize(reasoning_content)
response_dict["outputs"]["reasoning_token_num"] = len(reasoning_tokens)
else:
response_dict["outputs"]["text"] = full_text
if self.tool_parser_obj:
tool_parser = self.tool_parser_obj(self.tokenizer)
tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
@@ -323,6 +339,8 @@ def process_response_dict_normal(self, response_dict, **kwargs):
response_dict["outputs"]["completion_tokens"] = full_text
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
del self.decode_status[req_id]
if req_id in self.model_status_dict:
del self.model_status_dict[req_id]
return response_dict

def process_response_dict_streaming(self, response_dict, **kwargs):
@@ -335,7 +353,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
Returns:
Dict: response contain text fields
"""
enable_thinking = kwargs.get("enable_thinking")
is_end = response_dict["finished"]
req_id = response_dict["request_id"]
token_ids = response_dict["outputs"]["token_ids"]
@@ -345,16 +362,15 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
token_ids = token_ids[:-1]
delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
response_dict["outputs"]["completion_tokens"] = delta_text
if self.reasoning_parser and (
enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
):
if self.reasoning_parser:
reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
previous_texts,
previous_texts + delta_text,
delta_text,
previous_token_ids,
previous_token_ids + token_ids,
token_ids,
self.model_status_dict[req_id],
)
response_dict["outputs"]["delta_message"] = reasoning_delta_message
reasoning_content = reasoning_delta_message.reasoning_content if reasoning_delta_message else None
@@ -381,6 +397,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
del self.decode_status[req_id]
if req_id in self.tool_parser_dict:
del self.tool_parser_dict[req_id]
if req_id in self.model_status_dict:
del self.model_status_dict[req_id]
return response_dict

def messages2ids(self, request_or_messages, **kwargs):
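The bookkeeping added in `process_request` and `process_request_dict` keys the model status by `f"{real_req_id}_{idx}"`, so every sampled choice of an `n > 1` request shares one entry, and `enable_thinking` is true only when the parser reports `"think_start"`. A minimal sketch of that fan-out (the `get_model_status` callable here is a stand-in for the reasoning parser's method):

```python
def register_model_status(model_status_dict, request, get_model_status):
    # "abc_0" -> "abc"; all n choices of the same request share one status
    real_req_id = request["request_id"].split("_")[0]
    model_status = get_model_status(request["prompt_token_ids"])
    for idx in range(request.get("n", 1)):
        model_status_dict[f"{real_req_id}_{idx}"] = model_status
    request["enable_thinking"] = model_status == "think_start"
    return request


statuses = {}
req = {"request_id": "abc_0", "prompt_token_ids": [1, 2, 3], "n": 2}
register_model_status(statuses, req, lambda ids: "think_start")
print(statuses)                # {'abc_0': 'think_start', 'abc_1': 'think_start'}
print(req["enable_thinking"])  # True
```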
26 changes: 8 additions & 18 deletions fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
@@ -58,6 +58,7 @@ def __init__(

self.tool_parser_dict = dict()
self.decode_status = dict()
self.model_status_dict = dict()
self._load_tokenizer()

# Generation config
@@ -272,6 +273,13 @@ def process_request_dict(self, request, max_model_len=None):
request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
data_processor_logger.info(f"Processed request {request}")

if self.reasoning_parser:
real_req_id = request["request_id"].split("_")[0]
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
n = request.get("n", 1)
for idx in range(n):
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
request["enable_thinking"] = model_status == "think_start"
if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
request["top_p"] = _SAMPLING_EPS

@@ -307,21 +315,3 @@ def pack_outputs(self, outs):
outs["position_ids"] = np.array(outs["position_ids"], dtype=np.int64)

return outs

def process_response_dict(self, response_dict, stream, **kwargs):
"""
Preprocess the response

Args:
response_dict (Dict): response for engine, contain ids fields

Returns:
Dict: response contain text fields
"""
enable_thinking = kwargs.pop("enable_thinking", True)
if enable_thinking is None:
enable_thinking = True
if stream:
return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs)
else:
return self.process_response_dict_normal(response_dict, enable_thinking=enable_thinking, **kwargs)
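The VL processor also drops its `process_response_dict` override, whose only job was to default `enable_thinking` to True before dispatching to the streaming or non-streaming path; it presumably falls back to the shared implementation now that the status is read from `model_status_dict`. Those per-request entries are expected to be released when a request finishes, as the ernie4_5_processor.py hunks above do. A tiny sketch of that cleanup pattern:

```python
def release_request_state(model_status_dict, req_id):
    # Drop the per-choice status entry once the request has finished so the
    # dict does not grow without bound across requests.
    if req_id in model_status_dict:
        del model_status_dict[req_id]


statuses = {"abc_0": "think_start"}
release_request_state(statuses, "abc_0")
print(statuses)  # {}
```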
7 changes: 7 additions & 0 deletions fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py
@@ -269,6 +269,13 @@ def process_request_dict(self, request, max_model_len=None):
# Set default max_tokens if not specified
if request.get("max_tokens") is None:
request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"])) # Ensure at least 1 token
if self.reasoning_parser:
real_req_id = request["request_id"].split("_")[0]
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
n = request.get("n", 1)
for idx in range(n):
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
request["enable_thinking"] = model_status == "think_start"
data_processor_logger.info(f"Processed request {request}")

return request
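The Qwen VL processor registers the same per-choice status. The contract these hunks rely on is that `reasoning_parser.get_model_status(prompt_token_ids)` inspects the rendered prompt and returns a string, with `"think_start"` being the only value the diffs test for. A hypothetical illustration of that contract (the token id and the alternative status name below are made up for the sketch, not taken from the FastDeploy parsers):

```python
THINK_START_TOKEN_ID = 100_001  # placeholder id for a prompt-opening "<think>" token


def get_model_status(prompt_token_ids):
    """Toy stand-in: report "think_start" when the prompt already opens a think block."""
    if prompt_token_ids and prompt_token_ids[-1] == THINK_START_TOKEN_ID:
        return "think_start"
    return "no_think"  # hypothetical name for any non-thinking status


print(get_model_status([11, 42, THINK_START_TOKEN_ID]))  # "think_start" -> enable_thinking=True
print(get_model_status([11, 42]))                        # "no_think"    -> enable_thinking=False
```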