From a829d0ef882b6f0f540bfba5e21bee5e199ed761 Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 17 Jul 2025 16:38:54 +0800 Subject: [PATCH 01/20] mm support structured output --- docs/features/structured_outputs.md | 62 ++++++++++++++ docs/zh/features/structured_outputs.md | 64 +++++++++++++++ fastdeploy/config.py | 2 + fastdeploy/engine/config.py | 20 ++--- fastdeploy/engine/engine.py | 20 +++++ fastdeploy/engine/sampling_params.py | 45 +++++++++++ fastdeploy/entrypoints/llm.py | 3 + fastdeploy/input/ernie_processor.py | 4 +- fastdeploy/input/ernie_vl_processor.py | 6 +- fastdeploy/input/text_processor.py | 24 +++++- .../guided_decoding/__init__.py | 4 +- .../guided_decoding/base_guided_decoding.py | 63 +++++++++++---- .../guided_decoding/xgrammar_backend.py | 40 +++++---- .../model_executor/layers/sample/sampler.py | 81 ++++++++++++++----- .../reasoning/ernie_vl_reasoning_parsers.py | 14 ++++ .../reasoning/qwen3_reasoning_parsers.py | 14 ++++ fastdeploy/worker/gpu_model_runner.py | 34 ++++---- fastdeploy/worker/worker_process.py | 5 ++ 18 files changed, 412 insertions(+), 93 deletions(-) diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 40e177c1ce..f7ee424cb6 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -330,3 +330,65 @@ ParsedChatCompletionMessage[Info](content='{"addr": "No.1 Century Avenue, Pudong Address: No.1 Century Avenue, Pudong New Area, Shanghai Height: 468 ``` + +### Offline Inference + +Offline inference allows restricting the model's output format by pre-specified constraints. In `FastDeploy`, constraints can be specified through the `GuidedDecodingParams` class in `SamplingParams`. `GuidedDecodingParams` supports the following constraint types, with usage similar to online inference: + +```python +json: Optional[Union[str, dict]] = None +regex: Optional[str] = None +choice: Optional[List[str]] = None +grammar: Optional[str] = None +json_object: Optional[bool] = None +structural_tag: Optional[str] = None +``` + +The following example demonstrates how to use offline inference to generate a structured json: + +```python +from fastdeploy import LLM, SamplingParams +from fastdeploy.engine.sampling_params import GuidedDecodingParams +from pydantic import BaseModel +from enum import Enum + +class BookType(str, Enum): + romance = "Romance" + historical = "Historical" + adventure = "Adventure" + mystery = "Mystery" + dystopian = "Dystopian" + +class BookDescription(BaseModel): + author: str + title: str + genre: BookType + +# Constrained decoding parameters +guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema()) + +# Sampling parameters +sampling_params = SamplingParams( + top_p=0.95, + max_tokens=6400, + guided_decoding=guided_decoding_params, +) + +# Load model +llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto") + +outputs = llm.generate( + prompts="Generate a JSON describing a literary work, including author, title and book type.", + sampling_params=sampling_params, +) + +# Output results +for output in outputs: + print(output.outputs.text) +``` + +Output: + +``` +{"author": "George Orwell", "title": "1984", "genre": "Dystopian"} +``` diff --git a/docs/zh/features/structured_outputs.md b/docs/zh/features/structured_outputs.md index ce33f1232d..cafda804c6 100644 --- a/docs/zh/features/structured_outputs.md +++ b/docs/zh/features/structured_outputs.md @@ -330,3 +330,67 @@ 
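A second, purely illustrative sketch (not part of the original docs) shows the same offline flow with the `choice` constraint from the list above; the model path `ERNIE-4.5-0.3B` is reused from the earlier example and the printed result is only an expectation, not a verified output:

```python
from fastdeploy import LLM, SamplingParams
from fastdeploy.engine.sampling_params import GuidedDecodingParams

# Restrict the answer to one of a fixed set of strings.
guided_decoding_params = GuidedDecodingParams(
    choice=["Romance", "Historical", "Adventure", "Mystery", "Dystopian"]
)

sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=64,
    guided_decoding=guided_decoding_params,
)

llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")

outputs = llm.generate(
    prompts="Which genre best describes the novel 1984? Answer with one word.",
    sampling_params=sampling_params,
)

for output in outputs:
    print(output.outputs.text)  # expected to be one of the five choices, e.g. "Dystopian"
```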
ParsedChatCompletionMessage[Info](content='{"addr": "上海市浦东新区世纪 地址: 上海市浦东新区世纪大道1号 高度: 468 ``` + +### 离线推理 + +离线推理允许通过预先指定约束条件,限制模型输出格式。在 `FastDeploy` 中,支持通过 `SamplingParams` 中的 `GuidedDecodingParams` 类指定相关约束条件。`GuidedDecodingParams` 支持以下几种约束条件,使用方式可以参考在线推理: + +```python +json: Optional[Union[str, dict]] = None +regex: Optional[str] = None +choice: Optional[List[str]] = None +grammar: Optional[str] = None +json_object: Optional[bool] = None +structural_tag: Optional[str] = None +``` + +以下示例展示了如何使用离线推理生成一个结构化的 json : + +```python + +from fastdeploy import LLM, SamplingParams +from fastdeploy.engine.sampling_params import GuidedDecodingParams +from pydantic import BaseModel +from enum import Enum + +class BookType(str, Enum): + romance = "Romance" + historical = "Historical" + adventure = "Adventure" + mystery = "Mystery" + dystopian = "Dystopian" + +class BookDescription(BaseModel): + author: str + title: str + genre: BookType + +# Constrained decoding parameters +guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema()) + +# Sampling parameters +sampling_params = SamplingParams( + top_p=0.95, + max_tokens=6400, + guided_decoding=guided_decoding_params, +) + +# Load model +llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto") + +outputs = llm.generate( + prompts="生成一个JSON,描述一本中国的著作,要包含作者、标题和书籍类型。", + sampling_params=sampling_params, +) + +# Output results +for output in outputs: + print(output.outputs.text) + +``` + +输出 + +``` +{"author": "曹雪芹", "title": "红楼梦", "genre": "Historical"} +``` diff --git a/fastdeploy/config.py b/fastdeploy/config.py index c8428d1f97..a68ae6d58e 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -116,6 +116,8 @@ def __init__( self.enable_redundant_experts = False self.redundant_experts_num = 0 self.quantization = None + self.reasoning_parser = None + for key, value in args.items(): if hasattr(self, key): setattr(self, key, value) diff --git a/fastdeploy/engine/config.py b/fastdeploy/engine/config.py index e9c693480b..0ae8269126 100644 --- a/fastdeploy/engine/config.py +++ b/fastdeploy/engine/config.py @@ -329,7 +329,8 @@ def postprocess(self): self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size) if self.guided_decoding_backend == "auto": - if self.enable_mm: + if current_platform.is_xpu() or self.speculative_config.method is not None: + llm_logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.") self.guided_decoding_backend = "off" else: self.guided_decoding_backend = "xgrammar" @@ -396,10 +397,10 @@ def check(self): ], f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}." 
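# Illustrative sketch only (not part of the diff): the "auto" resolution performed by
# the postprocess() hunk above, with `is_xpu` and `speculative_method` standing in for
# current_platform.is_xpu() and self.speculative_config.method.
def resolve_guided_decoding_backend(backend: str, is_xpu: bool, speculative_method) -> str:
    if backend != "auto":
        return backend
    # Guided decoding is switched off on XPU and when speculative decoding is active.
    if is_xpu or speculative_method is not None:
        return "off"
    return "xgrammar"


# e.g. resolve_guided_decoding_backend("auto", is_xpu=False, speculative_method=None) -> "xgrammar"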
if self.guided_decoding_backend != "off": - # TODO: mm support guided_decoding - assert self.enable_mm is False, "Multimodal model currently do not support guided_decoding" # TODO: speculative decoding support guided_decoding + assert self.speculative_config.method is None, \ + "speculative decoding currently do not support guided_decoding" # TODO: xpu support guided_decoding assert not current_platform.is_xpu(), "XPU currently do not support guided_decoding" @@ -425,13 +426,12 @@ def print(self, file=None): if k == "generation_config" and v is not None: for gck, gcv in v.to_dict().items(): llm_logger.info("{:<20}:{:<6}{}".format(gck, "", gcv)) - elif ( - k == "cache_config" - or k == "model_config" - or k == "scheduler_config" - or k == "parallel_config" - or k == "commit_config" - ): + elif (k == "cache_config" or + k == "model_config" or + k == "scheduler_config" or + k == "parallel_config" or + k == "commit_config" or + k == "speculative_config"): v.print() else: llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index e9ecbefa2f..38b54b62bc 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -434,6 +434,13 @@ def _insert_zmq_task_to_scheduler(self): llm_logger.debug(f"Receive request: {request}") err_msg = None + if ((request.guided_json is not None + or request.guided_regex is not None + or request.structural_tag is not None + or request.guided_grammar is not None) and self.guided_decoding_checker is None): + err_msg = "guided_backend is None, use --guided-decoding-backend to " \ + "specify the backend at server startup." + if self.guided_decoding_checker is not None: request, err_msg = self.guided_decoding_checker.schema_format(request) @@ -526,6 +533,14 @@ def add_requests(self, task, sampling_params=None, **kwargs): llm_logger.error(error_msg) raise EngineError(error_msg, error_code=400) + if ((request.guided_json is not None + or request.guided_regex is not None + or request.structural_tag is not None + or request.guided_grammar is not None) and self.guided_decoding_checker is None): + err_msg = "guided_backend is None, use --guided-decoding-backend to specify the backend at server startup." 
+                llm_logger.error(err_msg)
+                raise EngineError(err_msg, error_code=400)
+
            if self.guided_decoding_checker is not None:
                request, err_msg = self.guided_decoding_checker.schema_format(request)
                if err_msg is not None:
@@ -1084,8 +1099,9 @@ def _start_worker_service(self):
            f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
            f" --graph_optimization_config '{self.cfg.graph_optimization_config.to_json_string()}'"
            f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
            f" --load_strategy {self.cfg.load_config.load_strategy}"
+            f" --reasoning_parser {self.cfg.reasoning_parser}"
        )

        worker_append_flag = {
            "enable_expert_parallel": self.cfg.parallel_config.enable_expert_parallel,
diff --git a/fastdeploy/engine/sampling_params.py b/fastdeploy/engine/sampling_params.py
index 91babf7a86..e31e3ef467 100644
--- a/fastdeploy/engine/sampling_params.py
+++ b/fastdeploy/engine/sampling_params.py
@@ -97,6 +97,7 @@ class SamplingParams:
     min_tokens: int = 1
     logprobs: Optional[int] = None
     bad_words: Optional[List[str]] = None
+    guided_decoding: Optional[GuidedDecodingParams] = None

     @classmethod
     def from_dict(cls, req_dict: dict[str, Any]) -> SamplingParams:
@@ -128,6 +129,7 @@ def from_optional(
         min_tokens=1,
         logprobs=None,
         bad_words=None,
+        guided_decoding=None,
     ) -> SamplingParams:
         """Create instance from command line arguments"""
         return cls(
@@ -148,6 +150,7 @@ def from_optional(
             min_tokens=min_tokens,
             logprobs=logprobs,
             bad_words=bad_words,
+            guided_decoding=guided_decoding,
         )

     def __post_init__(self):
@@ -218,3 +221,45 @@ class BeamSearchParams:
     temperature: float = 0.0
     length_penalty: float = 1.0
     include_stop_str_in_output: bool = False
+
+
+@dataclass
+class GuidedDecodingParams:
+    """Guided decoding parameters for text generation."""
+    json: Optional[Union[str, dict]] = None
+    regex: Optional[str] = None
+    choice: Optional[List[str]] = None
+    grammar: Optional[str] = None
+    json_object: Optional[bool] = None
+    structural_tag: Optional[str] = None
+
+    def to_dict(self):
+        """convert to dict"""
+        key_dict = {
+            "guided_json": self.json,
+            "guided_regex": self.regex,
+            "guided_choice": self.choice,
+            "guided_grammar": self.grammar,
+            "structural_tag": self.structural_tag,
+            "guided_json_object": self.json_object,
+        }
+
+        guided_dict = {}
+        for key, value in key_dict.items():
+            if value is not None:
+                guided_dict[key] = value
+        return guided_dict
+
+    def __post_init__(self):
+        """Verify the arguments."""
+        guided_count = sum([
+            self.json is not None, self.regex is not None, self.choice
+            is not None, self.grammar is not None, self.json_object
+            is not None, self.structural_tag is not None
+        ])
+
+        if guided_count > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding "
+                "('json', 'json_object', 'regex', 'choice', 'grammar', 'structural_tag')."
+ ) diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index 1204a67f9d..0ac5db20f8 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -275,6 +275,9 @@ def _add_request( enable_thinking = None if chat_template_kwargs is not None: enable_thinking = chat_template_kwargs.get("enable_thinking", None) + if current_sampling_params.guided_decoding is not None: + guided_decoding_dict = current_sampling_params.guided_decoding.to_dict() + tasks.update(guided_decoding_dict) self.llm_engine.add_requests(tasks, current_sampling_params, enable_thinking=enable_thinking) return req_ids diff --git a/fastdeploy/input/ernie_processor.py b/fastdeploy/input/ernie_processor.py index 63feda9348..f66640a7b3 100644 --- a/fastdeploy/input/ernie_processor.py +++ b/fastdeploy/input/ernie_processor.py @@ -60,7 +60,6 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None): self.eos_token_ids = [self.tokenizer.eos_token_id] self.eos_token_id_len = len(self.eos_token_ids) self.pad_token_id = self.get_pad_id() - self.reasoning_parser = None if reasoning_parser_obj: self.reasoning_parser = reasoning_parser_obj(self.tokenizer) @@ -264,7 +263,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") is_end = response_dict["finished"] req_id = response_dict["request_id"] token_ids = response_dict["outputs"]["token_ids"] @@ -273,6 +271,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs): if token_ids[-1] == self.tokenizer.eos_token_id: token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) + + enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming( previous_texts, diff --git a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py index 1b8669e293..afb399910d 100644 --- a/fastdeploy/input/ernie_vl_processor.py +++ b/fastdeploy/input/ernie_vl_processor.py @@ -78,7 +78,7 @@ def _load_tokenizer(self): def process_request(self, request, max_model_len=None, **kwargs): """process the input data""" task = request.to_dict() - task["enable_thinking"] = kwargs.get("enable_thinking", True) + task['enable_thinking'] = self.get_enable_thinking(kwargs.get("enable_thinking")) self.process_request_dict(task, max_model_len) request = Request.from_dict(task) @@ -244,9 +244,7 @@ def process_response_dict(self, response_dict, stream, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.pop("enable_thinking", True) - if enable_thinking is None: - enable_thinking = True + enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) if stream: return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) else: diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index 664868a595..d775fc7db7 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -50,6 +50,26 @@ def __init__(self): ) ) + def get_enable_thinking(self, enable_thinking=None): + """ + get enable_thinking param + + 1. if enable_thinking is None: + 1.1 if reasoning_parser is not None, set enable_thinking to True. + 1.2 if reasoning_parser is None, set enable_thinking to False. + 2. 
if reasoning_parser is None but enable_thinking is True, set enable_thinking to False and print warning. + + """ + if enable_thinking is None: + enable_thinking = False if self.reasoning_parser is None else True + if enable_thinking and self.reasoning_parser is None: + enable_thinking = False + data_processor_logger.warning( + "enable_thinking is True, but reasoning_parser is None. " + "enable_thinking will be set to False." + ) + return enable_thinking + def _apply_default_parameters(self, request): """ Apply default value for parameters in request @@ -229,6 +249,7 @@ def process_request(self, request, max_model_len=None, **kwargs): request.set("stop_token_ids", stop_seqs) request.set("stop_seqs_len", stop_seqs_len) + request.set("enable_thinking", self.get_enable_thinking(kwargs.get("enable_thinking"))) if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0: if request.prompt is not None: request.prompt_token_ids = self.text2ids(request.prompt, max_model_len) @@ -236,7 +257,6 @@ def process_request(self, request, max_model_len=None, **kwargs): if self.tokenizer.chat_template is None: raise ValueError("This model does not support chat_template.") task = request.to_dict() - task["enable_thinking"] = kwargs.get("enable_thinking", True) request.prompt_token_ids = self.messages2ids(task) else: raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.") @@ -372,7 +392,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") is_end = response_dict["finished"] req_id = response_dict["request_id"] token_ids = response_dict["outputs"]["token_ids"] @@ -382,6 +401,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) + enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming( previous_texts, diff --git a/fastdeploy/model_executor/guided_decoding/__init__.py b/fastdeploy/model_executor/guided_decoding/__init__.py index d6ee611992..01e887502e 100644 --- a/fastdeploy/model_executor/guided_decoding/__init__.py +++ b/fastdeploy/model_executor/guided_decoding/__init__.py @@ -15,8 +15,10 @@ """ # from fastdeploy.config import FDConfig +from fastdeploy.model_executor.guided_decoding.base_guided_decoding import ( + BackendBase, BaseChecker, LogitsProcessorBase) -__all__ = ["get_guided_backend", "schema_checker"] +__all__ = ['get_guided_backend', 'schema_checker', 'LogitsProcessorBase', 'BackendBase', 'BaseChecker'] def get_guided_backend( diff --git a/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py b/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py index 7baf2fe971..adcfbba6c6 100644 --- a/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py +++ b/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py @@ -19,6 +19,7 @@ from fastdeploy.config import ErnieArchitectures, FDConfig from fastdeploy.engine.request import Request +from fastdeploy.reasoning import ReasoningParserManager from fastdeploy.utils import llm_logger @@ -34,8 +35,9 @@ class LogitsProcessorBase: None (all state should be managed by subclasses) """ - def __init__(self): - pass + def __init__(self, enable_reasoning): + self.reasoning_ended = False + 
self.enable_reasoning = enable_reasoning def fill_token_bitmask(self, token_bitmask, idx): """ @@ -136,8 +138,13 @@ def __init__(self, fd_config: FDConfig): self.fd_config = fd_config self.executor = ThreadPoolExecutor() self.max_cache_size = 2048 + self.reasoning_parser = None self.hf_tokenizer = self._get_tokenizer_hf() + if self.fd_config.model_config.reasoning_parser: + reasoning_parser_obj = ReasoningParserManager.get_reasoning_parser( + self.fd_config.model_config.reasoning_parser) + self.reasoning_parser = reasoning_parser_obj(self.hf_tokenizer) def _create_processor(self): """ @@ -148,70 +155,88 @@ def _create_processor(self): """ raise NotImplementedError - def _json_processor(self, schemata): + def _json_processor(self, schemata, enable_thinking=False): """ Process JSON schemata. Args: schemata (str): The schemata string. + enable_thinking (bool): Whether to enable thinking mode. Raises: NotImplementedError: This method should be implemented in subclasses. """ raise NotImplementedError - def _regex_processor(self, schemata): + def _regex_processor(self, schemata, enable_thinking=False): """ Process regular expression schemata. Args: schemata (str): The schemata string. + enable_thinking (bool): Whether to enable thinking mode. Raises: NotImplementedError: This method should be implemented in subclasses. """ raise NotImplementedError - def _grammar_processor(self, schemata): + def _grammar_processor(self, schemata, enable_thinking=False): """ Process grammar schemata. Args: schemata (str): The schemata string. + enable_thinking (bool): Whether to enable thinking mode. Raises: NotImplementedError: This method should be implemented in subclasses. """ raise NotImplementedError - def _structural_tag_processor(self, schemata): + def _structural_tag_processor(self, schemata, enable_thinking=False): """ Process structural tag schemata. Args: schemata (str): The schemata string. + enable_thinking (bool): Whether to enable thinking mode. Raises: NotImplementedError: This method should be implemented in subclasses. """ raise NotImplementedError - def _unsupported_processor_type(self, key_type, schemata): + def _unsupported_processor_type(self, key_type, schemata, enable_thinking=False): """ Process unsupported type. Args: key_type (str): The key type string. schemata (str): The schemata string. + enable_thinking (bool): Whether to enable thinking mode. """ raise Exception(f"Unsupported processor type {key_type}.") - def _init_logits_processor(self, schemata_key: tuple[str, str]) -> LogitsProcessorBase: + def get_reasoning_parser(self): + """ + Get reasoning parser object. + Returns: + ReasoningParser: Reasoning parser object or None + """ + return self.reasoning_parser + + def _init_logits_processor( + self, + schemata_key: tuple[str, str], + enable_thinking: bool = False, + ) -> LogitsProcessorBase: """ init logits processor by type and schemata. 
Args: schemata_key (tuple[str, str]): Tuple containing processor type and schema string + enable_thinking (bool): Whether to enable thinking step Returns: LogitsProcessorBase: Initialized logits processor instance @@ -221,18 +246,22 @@ def _init_logits_processor(self, schemata_key: tuple[str, str]) -> LogitsProcess """ key_type, schemata = schemata_key if key_type == "json": - return self._json_processor(schemata) + return self._json_processor(schemata, enable_thinking) elif key_type == "regex": - return self._regex_processor(schemata) + return self._regex_processor(schemata, enable_thinking) elif key_type == "grammar": - return self._grammar_processor(schemata) + return self._grammar_processor(schemata, enable_thinking) elif key_type == "structural_tag": - return self._structural_tag_processor(schemata) + return self._structural_tag_processor(schemata, enable_thinking) else: llm_logger.error(f"Unsupported processor type {key_type}.") return None - def get_logits_processor(self, schemata_key: tuple[str, str]) -> tuple[LogitsProcessorBase, bool]: + def get_logits_processor( + self, + schemata_key: tuple[str, str], + enable_thinking: bool = False, + ) -> tuple[LogitsProcessorBase, bool]: """ get logits processor by key from cache or create new one. @@ -246,8 +275,10 @@ def get_logits_processor(self, schemata_key: tuple[str, str]) -> tuple[LogitsPro """ value = self.cache.get(schemata_key, None) if value: - return value.copy(), True - value = self.executor.submit(self._init_logits_processor, schemata_key) + value_copy = value.copy() + value_copy.enable_reasoning = enable_thinking + return value_copy, True + value = self.executor.submit(self._init_logits_processor, schemata_key, enable_thinking) return value, False def _get_tokenizer_hf(self): @@ -266,9 +297,7 @@ def _get_tokenizer_hf(self): try: architectures = self.fd_config.model_config.architectures if not ErnieArchitectures.contains_ernie_arch(architectures): - from transformers import AutoTokenizer, PreTrainedTokenizerFast - tokenizer = AutoTokenizer.from_pretrained( self.fd_config.model_config.model, use_fast=False, diff --git a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py index f702a1085e..b03ff09291 100644 --- a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py +++ b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py @@ -23,11 +23,9 @@ from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request -from fastdeploy.model_executor.guided_decoding.base_guided_decoding import ( - BackendBase, - BaseChecker, - LogitsProcessorBase, -) +from fastdeploy.model_executor.guided_decoding import (BackendBase, + BaseChecker, + LogitsProcessorBase) from fastdeploy.utils import llm_logger try: @@ -56,7 +54,6 @@ class XGrammarProcessor(LogitsProcessorBase): max_rollback_tokens (int): Maximum number of tokens to rollback on mismatch vocab_size (int): Size of the vocabulary batch_size (int): Batch size for processing - splitwise_role (str): Role for splitwise processing compiled_grammar (CompiledGrammar): Compiled grammar rules terminate_without_stop_token (bool): Whether to terminate without stop token override_stop_tokens (Optional[List[int]]): Custom stop tokens @@ -70,13 +67,12 @@ def __init__( override_stop_tokens: Optional[List[int]] = None, vocab_size: Optional[int] = None, batch_size: Optional[int] = None, - splitwise_role: str = "mixed", + enable_thinking: bool = False, ): - super().__init__() + 
super().__init__(enable_reasoning=enable_thinking) self.max_rollback_tokens = 200 self.vocab_size = vocab_size self.batch_size = batch_size - self.splitwise_role = splitwise_role self.compiled_grammar = compiled_grammar self.terminate_without_stop_token = terminate_without_stop_token self.override_stop_tokens = override_stop_tokens @@ -187,7 +183,6 @@ def copy(self) -> "XGrammarProcessor": override_stop_tokens=self.override_stop_tokens, vocab_size=self.vocab_size, batch_size=self.batch_size, - splitwise_role=self.splitwise_role, ) @@ -202,7 +197,6 @@ class XGrammarBackend(BackendBase): vocab_size (int): Size of the vocabulary from config batch_size (int): Maximum batch size from config any_whitespace (bool): Whether to allow any whitespace in JSON - splitwise_role (str): Role for splitwise processing grammar_compiler (GrammarCompiler): Grammar compilation engine """ @@ -216,7 +210,6 @@ def __init__( self.batch_size = fd_config.parallel_config.max_num_seqs self.any_whitespace = not fd_config.parallel_config.disable_any_whitespace - self.splitwise_role = fd_config.parallel_config.splitwise_role try: tokenizer_info = TokenizerInfo.from_huggingface(self.hf_tokenizer, vocab_size=self.vocab_size) @@ -229,6 +222,7 @@ def _create_processor( compiled_grammar: CompiledGrammar, terminate_without_stop_token: bool = False, override_stop_tokens: Optional[List[int]] = None, + enable_thinking: bool = False, ) -> XGrammarProcessor: """ Create a logits processor instance for the given compiled grammar. @@ -237,6 +231,7 @@ def _create_processor( compiled_grammar (CompiledGrammar): Compiled grammar rules terminate_without_stop_token (bool): Whether to terminate without stop token override_stop_tokens (Optional[List[int]]): Custom stop tokens to override defaults + enable_thinking (bool): Whether to enable thinking mode Returns: XGrammarProcessor: Configured grammar processor instance @@ -247,15 +242,16 @@ def _create_processor( override_stop_tokens=override_stop_tokens, vocab_size=self.vocab_size, batch_size=self.batch_size, - splitwise_role=self.splitwise_role, + enable_thinking=enable_thinking, ) - def _json_processor(self, schemata: str) -> Optional[XGrammarProcessor]: + def _json_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: """ Compile JSON schema into a grammar processor. Args: schemata (str): JSON schema string to compile + enable_thinking (bool): Whether to enable thinking mode Returns: Optional[XGrammarProcessor]: Configured processor if successful, None on failure @@ -265,14 +261,15 @@ def _json_processor(self, schemata: str) -> Optional[XGrammarProcessor]: except Exception as e: llm_logger.error(f"Failed to compile json schema: {e}") return None - return self._create_processor(compiled_grammar) + return self._create_processor(compiled_grammar, enable_thinking=enable_thinking) - def _regex_processor(self, schemata: str) -> Optional[XGrammarProcessor]: + def _regex_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: """ Compile regex pattern into a grammar processor. 
Args: schemata (str): Regex pattern string to compile + enable_thinking (bool): Whether to enable thinking mode Returns: Optional[XGrammarProcessor]: Configured processor if successful, None on failure @@ -282,14 +279,15 @@ def _regex_processor(self, schemata: str) -> Optional[XGrammarProcessor]: except Exception as e: llm_logger.error(f"Failed to compile regex schema: {e}") return None - return self._create_processor(compiled_grammar) + return self._create_processor(compiled_grammar, enable_thinking=enable_thinking) - def _grammar_processor(self, schemata: str) -> Optional[XGrammarProcessor]: + def _grammar_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: """ Compile grammar (EBNF) into a grammar processor. Args: schemata (str): Grammar string in EBNF format + enable_thinking (bool): Whether to enable thinking mode Returns: Optional[XGrammarProcessor]: Configured processor if successful, None on failure @@ -299,9 +297,9 @@ def _grammar_processor(self, schemata: str) -> Optional[XGrammarProcessor]: except Exception as e: llm_logger.error(f"Failed to compile ebnf schema: {e}") return None - return self._create_processor(compiled_grammar) + return self._create_processor(compiled_grammar, enable_thinking=enable_thinking) - def _structural_tag_processor(self, schemata: str) -> Optional[XGrammarProcessor]: + def _structural_tag_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: """ Compile structural tags into a grammar processor. @@ -326,7 +324,7 @@ def _structural_tag_processor(self, schemata: str) -> Optional[XGrammarProcessor except Exception as e: llm_logger.error(f"Failed to compile structural tags schema: {e}") return None - return self._create_processor(compiled_grammar) + return self._create_processor(compiled_grammar, enable_thinking=enable_thinking) class XGrammarChecker(BaseChecker): diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py index f064cf9d1e..2763052660 100644 --- a/fastdeploy/model_executor/layers/sample/sampler.py +++ b/fastdeploy/model_executor/layers/sample/sampler.py @@ -23,9 +23,7 @@ from paddle import nn from fastdeploy.config import FDConfig -from fastdeploy.model_executor.guided_decoding.base_guided_decoding import ( - LogitsProcessorBase, -) +from fastdeploy.model_executor.guided_decoding import LogitsProcessorBase from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata from fastdeploy.model_executor.layers.sample.ops import ( apply_penalty_multi_scores, @@ -34,6 +32,7 @@ top_k_top_p_sampling, ) from fastdeploy.platforms import current_platform +from fastdeploy.reasoning import ReasoningParser from fastdeploy.worker.output import LogprobsTensors, SamplerOutput @@ -48,6 +47,10 @@ def __init__(self): self.logits_processor: Dict[int, Optional[Any]] = dict() self.executor = ThreadPoolExecutor() self.logits_lock = threading.Lock() + self.reasoning_parser = None + + def apply_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): + self.reasoning_parser = reasoning_parser def add_logits_processor( self, @@ -124,9 +127,16 @@ def apply_token_mask(self, logits: paddle.Tensor, skip_idx_list: List[int] = []) if available_processors is None: return logits - indices = list(self.logits_processor.keys()) - mask_idx = [i for i in indices if i not in skip_idx_list] - return available_processors.apply_token_mask(logits, self.token_bitmask, indices=mask_idx) + indices = [] + for idx, 
processor in self.logits_processor.items(): + if processor is None or idx in skip_idx_list: + continue + if not processor.enable_reasoning or processor.reasoning_ended: + indices.append(idx) + + return available_processors.apply_token_mask(logits, + self.token_bitmask, + indices=indices) def _accept_token(self, idx: int, token: int): """accept token""" @@ -136,6 +146,15 @@ def _accept_token(self, idx: int, token: int): if self.logits_processor[idx].is_terminated(): return + if ( + self.reasoning_parser is not None + and self.logits_processor[idx].enable_reasoning + and not self.logits_processor[idx].reasoning_ended + ): + reasoning_ended = self.reasoning_parser.is_reasoning_end([token]) + self.logits_processor[idx].reasoning_ended = reasoning_ended + return + self.logits_processor[idx].accept_token(token) def update_output_tokens(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): @@ -181,19 +200,25 @@ def __init__(self): self.processor = SamplerProcessor() - def apply_logits_processor( - self, - ids: int, - future: Optional[Any] = None, - prefill_tokens: List[int] = [], - ): - """apply logits processor to sampler""" + def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): + """ set reasoning parser """ + self.processor.apply_reasoning_parser(reasoning_parser) + + def apply_logits_processor(self, + ids: int, + future: Optional[Any] = None, + prefill_tokens: List[int] = []): + """ apply logits processor to sampler """ self.processor.add_logits_processor(ids, future, prefill_tokens) def pre_process(self, skip_idx_list: List[int] = []): """pre process before running""" self.processor.pre_process(skip_idx_list) + def post_process(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): + """ post process after running """ + self.processor.update_output_tokens(next_tokens, skip_idx_list) + def compute_logprobs(self, logits: paddle.Tensor) -> paddle.Tensor: """ """ return F.log_softmax(logits, axis=-1) @@ -276,8 +301,6 @@ def forward_cuda( None if num_logprobs is None else self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=next_tokens) ) - self.processor.update_output_tokens(next_tokens, skip_idx_list) - sampler_output = SamplerOutput( # The sampled tokens are expanded to 2D tensor with shape # [num_requests, 1], where each row represents one generated @@ -309,13 +332,19 @@ def pre_process(self, skip_idx_list: List[int] = []): """pre process before running""" pass - def apply_logits_processor( - self, - ids: int, - future: Optional[Any] = None, - prefill_tokens: List[int] = [], - ): - """apply logits processor to sampler""" + def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): + """ set reasoning parser """ + pass + + def post_process(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): + """ post process after running """ + pass + + def apply_logits_processor(self, + ids: int, + future: Optional[Any] = None, + prefill_tokens: List[int] = []): + """ apply logits processor to sampler """ pass def forward_cuda( @@ -409,6 +438,14 @@ def apply_logits_processor( """apply logits processor to sampler""" pass + def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): + """ set reasoning parser """ + pass + + def post_process(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): + """ post process after running """ + pass + def forward_cuda( self, logits: paddle.Tensor, diff --git a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py 
b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py index f5762b791f..6589892a4d 100644 --- a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py @@ -46,6 +46,20 @@ def __init__(self, tokenizer): if self.think_end_token_id is None: raise RuntimeError("Ernie VL reasoning parser could not locate think end " "tokens in the tokenizer!") + def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: + """ + Check if the reasoning content ends in the input_ids. + It is used in structured engines like `xgrammar` to check if the + reasoning content ends in the model output. + Parameters: + input_ids: list[int] + The input_ids of the model output. + Returns: + bool + True if the reasoning content ends in the input_ids. + """ + return self.think_end_token_id in input_ids + def extract_reasoning_content_streaming( self, previous_text: str, diff --git a/fastdeploy/reasoning/qwen3_reasoning_parsers.py b/fastdeploy/reasoning/qwen3_reasoning_parsers.py index 4fc565c6c1..fd00e675e4 100644 --- a/fastdeploy/reasoning/qwen3_reasoning_parsers.py +++ b/fastdeploy/reasoning/qwen3_reasoning_parsers.py @@ -48,6 +48,20 @@ def __init__(self, tokenizer): if self.think_end_token_id is None: raise RuntimeError("Qwen3 reasoning parser could not locate think end " "tokens in the tokenizer!") + def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: + """ + Check if the reasoning content ends in the input_ids. + It is used in structured engines like `xgrammar` to check if the + reasoning content ends in the model output. + Parameters: + input_ids: list[int] + The input_ids of the model output. + Returns: + bool + True if the reasoning content ends in the input_ids. + """ + return self.think_end_token_id in input_ids + def extract_reasoning_content_streaming( self, previous_text: str, diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 4ec2411ec4..ced71571da 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -29,10 +29,8 @@ profile_run_guard, sot_warmup_guard, ) -from fastdeploy.model_executor.guided_decoding import get_guided_backend -from fastdeploy.model_executor.guided_decoding.base_guided_decoding import ( - LogitsProcessorBase, -) +from fastdeploy.model_executor.guided_decoding import (LogitsProcessorBase, + get_guided_backend) from fastdeploy.model_executor.layers.attention import get_attention_backend from fastdeploy.model_executor.layers.attention.base_attention_backend import ( AttentionBackend, @@ -83,10 +81,6 @@ def __init__( self.speculative_decoding = self.speculative_method is not None self.enable_logprob = fd_config.model_config.enable_logprob - self.guided_backend = None - if self.fd_config.parallel_config.guided_decoding_backend != "off": - self.guided_backend = get_guided_backend(fd_config=self.fd_config) - # VL model config: if self.enable_mm: self._init_image_preprocess() @@ -115,6 +109,11 @@ def __init__( else: self.sampler = SpeculativeSampler(fd_config) + self.guided_backend = None + if self.fd_config.parallel_config.guided_decoding_backend != "off": + self.guided_backend = get_guided_backend(fd_config=self.fd_config) + self.sampler.set_reasoning_parser(self.guided_backend.get_reasoning_parser()) + # Lazy initialize kv cache after model loading # self.kv_caches: list[paddle.Tensor] = [] @@ -191,7 +190,10 @@ def _init_logits_processor(self, request): elif request.structural_tag is not None: schemata_key = ("structural_tag", 
request.structural_tag) - return self.guided_backend.get_logits_processor(schemata_key=schemata_key), schemata_key + return self.guided_backend.get_logits_processor( + schemata_key=schemata_key, + enable_thinking=request.get("enable_thinking"), + ), schemata_key def insert_tasks_v1(self, req_dicts: List[Request]): """ @@ -395,8 +397,7 @@ def insert_prefill_inputs(self, req_dicts: List[Request]): self.share_inputs["prompt_lens"][idx : idx + 1] = length if self.enable_mm: - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True + enable_thinking = request.get("enable_thinking") self.share_inputs["enable_thinking"][:] = enable_thinking self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) @@ -1160,10 +1161,14 @@ def _get_skip_idx(self, model_forward_batch: Optional[List[Request]] = None): Returns: A list of indices corresponding to the requests that need to be skipped. """ - skip_idx_list = [] - if not self.cache_config.enable_chunked_prefill or self.guided_backend is None: - return skip_idx_list + if ( + not self.parallel_config.enable_chunked_prefill + or self.guided_backend is None + or model_forward_batch is None + ): + return [] + skip_idx_list = [] for task in model_forward_batch: if task.get("prefill_chunk_info", None) is None or task.chunk_idx >= len(task.prefill_chunk_info): continue @@ -1247,6 +1252,7 @@ class at the server level, which is too granular for ModelRunner. if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) + self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) else: self.sampler( logits, diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index d23d57bf7e..0dbcc696d3 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -565,6 +565,11 @@ def parse_args(): action="store_true", help="Enable output of token-level log probabilities.", ) + parser.add_argument("--reasoning_parser", + type=str, + default=None, + help="Flag specifies the reasoning parser to use for " \ + "extracting reasoning content from the model output") args = parser.parse_args() return args From 6bd36760b3e7bd5b8bf2a6560d0b6af2dd1f49c9 Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 18 Jul 2025 16:42:36 +0800 Subject: [PATCH 02/20] update code --- fastdeploy/config.py | 2 +- fastdeploy/input/ernie_vl_processor.py | 3 ++- fastdeploy/input/text_processor.py | 4 +--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index a68ae6d58e..7f2e309eb7 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -119,7 +119,7 @@ def __init__( self.reasoning_parser = None for key, value in args.items(): - if hasattr(self, key): + if hasattr(self, key) and value != "None": setattr(self, key, value) assert self.model != "" diff --git a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py index afb399910d..845f2b613f 100644 --- a/fastdeploy/input/ernie_vl_processor.py +++ b/fastdeploy/input/ernie_vl_processor.py @@ -162,6 +162,7 @@ def _check_mm_limits(self, item): def process_request_dict(self, request, max_model_len=None): """process the input data""" + request['enable_thinking'] = self.get_enable_thinking(request.get("enable_thinking")) if not request.get("eos_token_ids"): 
request["eos_token_ids"] = self.eos_token_ids @@ -244,7 +245,7 @@ def process_response_dict(self, response_dict, stream, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) + enable_thinking = self.get_enable_thinking(kwargs.pop("enable_thinking", None)) if stream: return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) else: diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index d775fc7db7..40fb8e8784 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -430,9 +430,7 @@ def process_response_dict(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.pop("enable_thinking", True) - if enable_thinking is None: - enable_thinking = True + enable_thinking = self.get_enable_thinking(kwargs.pop("enable_thinking", None)) stream = kwargs.get("stream", True) if stream: return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) From 65458b33264a3db4b3439931b24c8fa6a36f222f Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 1 Aug 2025 18:04:28 +0800 Subject: [PATCH 03/20] update code --- fastdeploy/engine/request.py | 4 +-- fastdeploy/input/ernie_processor.py | 5 +-- fastdeploy/input/ernie_vl_processor.py | 3 +- fastdeploy/input/text_processor.py | 12 ++++--- .../model_executor/layers/sample/sampler.py | 36 ++++++++----------- fastdeploy/worker/gpu_model_runner.py | 20 ++++++----- 6 files changed, 40 insertions(+), 40 deletions(-) diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index db183bb27a..9bb5709be3 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -69,7 +69,7 @@ def __init__( guided_grammar: Optional[Any] = None, structural_tag: Optional[Any] = None, guided_json_object: Optional[bool] = None, - enable_thinking: Optional[bool] = True, + enable_thinking: Optional[bool] = None, trace_carrier: dict = dict(), ) -> None: self.request_id = request_id @@ -147,7 +147,7 @@ def from_dict(cls, d: dict): guided_grammar=d.get("guided_grammar", None), structural_tag=d.get("structural_tag", None), guided_json_object=d.get("guided_json_object", None), - enable_thinking=d.get("enable_thinking", True), + enable_thinking=d.get("enable_thinking", None), trace_carrier=d.get("trace_carrier", {}), ) diff --git a/fastdeploy/input/ernie_processor.py b/fastdeploy/input/ernie_processor.py index f66640a7b3..ab79c7e71b 100644 --- a/fastdeploy/input/ernie_processor.py +++ b/fastdeploy/input/ernie_processor.py @@ -60,6 +60,7 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None): self.eos_token_ids = [self.tokenizer.eos_token_id] self.eos_token_id_len = len(self.eos_token_ids) self.pad_token_id = self.get_pad_id() + self.reasoning_parser = None if reasoning_parser_obj: self.reasoning_parser = reasoning_parser_obj(self.tokenizer) @@ -203,7 +204,7 @@ def process_response(self, response_dict, **kwargs): response_dict.outputs.reasoning_content = reasoning_content else: response_dict.outputs.text = full_text - data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}") + data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}") if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "": return None return response_dict @@ -233,7 +234,6 @@ def process_response_dict_normal(self, response_dict, **kwargs): Returns: Dict: 
response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") token_ids = response_dict["outputs"]["token_ids"] is_end = response_dict["finished"] req_id = response_dict["request_id"] @@ -243,6 +243,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id) if is_end: full_text = previous_texts + delta_text + enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) response_dict["outputs"]["text"] = text diff --git a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py index 845f2b613f..e0ba224495 100644 --- a/fastdeploy/input/ernie_vl_processor.py +++ b/fastdeploy/input/ernie_vl_processor.py @@ -78,7 +78,6 @@ def _load_tokenizer(self): def process_request(self, request, max_model_len=None, **kwargs): """process the input data""" task = request.to_dict() - task['enable_thinking'] = self.get_enable_thinking(kwargs.get("enable_thinking")) self.process_request_dict(task, max_model_len) request = Request.from_dict(task) @@ -162,7 +161,7 @@ def _check_mm_limits(self, item): def process_request_dict(self, request, max_model_len=None): """process the input data""" - request['enable_thinking'] = self.get_enable_thinking(request.get("enable_thinking")) + request["enable_thinking"] = self.get_enable_thinking(request.get("enable_thinking")) if not request.get("eos_token_ids"): request["eos_token_ids"] = self.eos_token_ids diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index 40fb8e8784..58682276a1 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -65,8 +65,7 @@ def get_enable_thinking(self, enable_thinking=None): if enable_thinking and self.reasoning_parser is None: enable_thinking = False data_processor_logger.warning( - "enable_thinking is True, but reasoning_parser is None. " - "enable_thinking will be set to False." + "enable_thinking is True, but reasoning_parser is None. " "enable_thinking will be set to False." 
) return enable_thinking @@ -89,6 +88,10 @@ def set_value(req, key, value): set_value(request, "repetition_penalty", 1.0) set_value(request, "frequency_penalty", 0.0) set_value(request, "presence_penalty", 0.0) + + enable_thinking = self.get_enable_thinking(request.get("enable_thinking")) + set_value(request, "enable_thinking", enable_thinking) + return request @abstractmethod @@ -287,6 +290,7 @@ def process_request_dict(self, request, max_model_len=None, **kwargs): str: error message """ request = self._apply_default_parameters(request) + request["enable_thinking"] = self.get_enable_thinking(kwargs.get("enable_thinking")) if not request.get("eos_token_ids"): request["eos_token_ids"] = self.eos_token_ids @@ -348,7 +352,7 @@ def process_response(self, response_dict, **kwargs): else: # 模型不支持思考,并且没单独设置enable_thinking为false response_dict.outputs.text = full_text - data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}") + data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}") return response_dict @@ -362,7 +366,6 @@ def process_response_dict_normal(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") token_ids = response_dict["outputs"]["token_ids"] is_end = response_dict["finished"] req_id = response_dict["request_id"] @@ -372,6 +375,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id) if is_end: full_text = previous_texts + delta_text + enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) response_dict["outputs"]["text"] = text diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py index 2763052660..d86c0716a4 100644 --- a/fastdeploy/model_executor/layers/sample/sampler.py +++ b/fastdeploy/model_executor/layers/sample/sampler.py @@ -131,12 +131,10 @@ def apply_token_mask(self, logits: paddle.Tensor, skip_idx_list: List[int] = []) for idx, processor in self.logits_processor.items(): if processor is None or idx in skip_idx_list: continue - if not processor.enable_reasoning or processor.reasoning_ended: + if self.reasoning_parser is None or not processor.enable_reasoning or processor.reasoning_ended: indices.append(idx) - return available_processors.apply_token_mask(logits, - self.token_bitmask, - indices=indices) + return available_processors.apply_token_mask(logits, self.token_bitmask, indices=indices) def _accept_token(self, idx: int, token: int): """accept token""" @@ -201,14 +199,11 @@ def __init__(self): self.processor = SamplerProcessor() def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): - """ set reasoning parser """ + """set reasoning parser""" self.processor.apply_reasoning_parser(reasoning_parser) - def apply_logits_processor(self, - ids: int, - future: Optional[Any] = None, - prefill_tokens: List[int] = []): - """ apply logits processor to sampler """ + def apply_logits_processor(self, ids: int, future: Optional[Any] = None, prefill_tokens: List[int] = []): + """apply logits processor to sampler""" self.processor.add_logits_processor(ids, future, prefill_tokens) def pre_process(self, skip_idx_list: List[int] = []): @@ -216,7 +211,7 @@ def pre_process(self, skip_idx_list: List[int] = []): self.processor.pre_process(skip_idx_list) def 
post_process(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): - """ post process after running """ + """post process after running""" self.processor.update_output_tokens(next_tokens, skip_idx_list) def compute_logprobs(self, logits: paddle.Tensor) -> paddle.Tensor: @@ -270,12 +265,12 @@ def forward_cuda( skip_idx_list: List[int] = [], ) -> SamplerOutput: """ """ + logits = self.processor.apply_token_mask(logits, skip_idx_list) + num_logprobs = sampling_metadata.max_num_logprobs if num_logprobs is not None: raw_logprobs = self.compute_logprobs(logits) - logits = self.processor.apply_token_mask(logits, skip_idx_list) - logits = apply_penalty_multi_scores( sampling_metadata.pre_token_ids, sampling_metadata.prompt_ids, @@ -333,18 +328,15 @@ def pre_process(self, skip_idx_list: List[int] = []): pass def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): - """ set reasoning parser """ + """set reasoning parser""" pass def post_process(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): - """ post process after running """ + """post process after running""" pass - def apply_logits_processor(self, - ids: int, - future: Optional[Any] = None, - prefill_tokens: List[int] = []): - """ apply logits processor to sampler """ + def apply_logits_processor(self, ids: int, future: Optional[Any] = None, prefill_tokens: List[int] = []): + """apply logits processor to sampler""" pass def forward_cuda( @@ -439,11 +431,11 @@ def apply_logits_processor( pass def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): - """ set reasoning parser """ + """set reasoning parser""" pass def post_process(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): - """ post process after running """ + """post process after running""" pass def forward_cuda( diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index ced71571da..d851c34a9f 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -29,8 +29,10 @@ profile_run_guard, sot_warmup_guard, ) -from fastdeploy.model_executor.guided_decoding import (LogitsProcessorBase, - get_guided_backend) +from fastdeploy.model_executor.guided_decoding import ( + LogitsProcessorBase, + get_guided_backend, +) from fastdeploy.model_executor.layers.attention import get_attention_backend from fastdeploy.model_executor.layers.attention.base_attention_backend import ( AttentionBackend, @@ -190,10 +192,13 @@ def _init_logits_processor(self, request): elif request.structural_tag is not None: schemata_key = ("structural_tag", request.structural_tag) - return self.guided_backend.get_logits_processor( - schemata_key=schemata_key, - enable_thinking=request.get("enable_thinking"), - ), schemata_key + return ( + self.guided_backend.get_logits_processor( + schemata_key=schemata_key, + enable_thinking=request.get("enable_thinking"), + ), + schemata_key, + ) def insert_tasks_v1(self, req_dicts: List[Request]): """ @@ -1251,8 +1256,6 @@ class at the server level, which is too granular for ModelRunner. ) if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) - - self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) else: self.sampler( logits, @@ -1313,6 +1316,7 @@ class at the server level, which is too granular for ModelRunner. 
speculative_decoding=self.speculative_decoding, skip_save_output=skip_save_output, ) + self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) # 6. Speculative decode if self.speculative_decoding: From f1141fbfadfa183e73d4c7df13d51d9b4cfd1e92 Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 1 Aug 2025 18:18:30 +0800 Subject: [PATCH 04/20] update format --- .../guided_decoding/__init__.py | 7 +++++-- .../guided_decoding/base_guided_decoding.py | 20 ++++++++++--------- .../guided_decoding/xgrammar_backend.py | 16 ++++++++------- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/fastdeploy/model_executor/guided_decoding/__init__.py b/fastdeploy/model_executor/guided_decoding/__init__.py index 01e887502e..9336f4a04e 100644 --- a/fastdeploy/model_executor/guided_decoding/__init__.py +++ b/fastdeploy/model_executor/guided_decoding/__init__.py @@ -16,9 +16,12 @@ # from fastdeploy.config import FDConfig from fastdeploy.model_executor.guided_decoding.base_guided_decoding import ( - BackendBase, BaseChecker, LogitsProcessorBase) + BackendBase, + BaseChecker, + LogitsProcessorBase, +) -__all__ = ['get_guided_backend', 'schema_checker', 'LogitsProcessorBase', 'BackendBase', 'BaseChecker'] +__all__ = ["get_guided_backend", "schema_checker", "LogitsProcessorBase", "BackendBase", "BaseChecker"] def get_guided_backend( diff --git a/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py b/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py index adcfbba6c6..dd50f39b38 100644 --- a/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py +++ b/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py @@ -143,7 +143,8 @@ def __init__(self, fd_config: FDConfig): self.hf_tokenizer = self._get_tokenizer_hf() if self.fd_config.model_config.reasoning_parser: reasoning_parser_obj = ReasoningParserManager.get_reasoning_parser( - self.fd_config.model_config.reasoning_parser) + self.fd_config.model_config.reasoning_parser + ) self.reasoning_parser = reasoning_parser_obj(self.hf_tokenizer) def _create_processor(self): @@ -227,10 +228,10 @@ def get_reasoning_parser(self): return self.reasoning_parser def _init_logits_processor( - self, - schemata_key: tuple[str, str], - enable_thinking: bool = False, - ) -> LogitsProcessorBase: + self, + schemata_key: tuple[str, str], + enable_thinking: bool = False, + ) -> LogitsProcessorBase: """ init logits processor by type and schemata. @@ -258,10 +259,10 @@ def _init_logits_processor( return None def get_logits_processor( - self, - schemata_key: tuple[str, str], - enable_thinking: bool = False, - ) -> tuple[LogitsProcessorBase, bool]: + self, + schemata_key: tuple[str, str], + enable_thinking: bool = False, + ) -> tuple[LogitsProcessorBase, bool]: """ get logits processor by key from cache or create new one. 
@@ -298,6 +299,7 @@ def _get_tokenizer_hf(self): architectures = self.fd_config.model_config.architectures if not ErnieArchitectures.contains_ernie_arch(architectures): from transformers import AutoTokenizer, PreTrainedTokenizerFast + tokenizer = AutoTokenizer.from_pretrained( self.fd_config.model_config.model, use_fast=False, diff --git a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py index b03ff09291..2349e85bf0 100644 --- a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py +++ b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py @@ -23,9 +23,11 @@ from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request -from fastdeploy.model_executor.guided_decoding import (BackendBase, - BaseChecker, - LogitsProcessorBase) +from fastdeploy.model_executor.guided_decoding import ( + BackendBase, + BaseChecker, + LogitsProcessorBase, +) from fastdeploy.utils import llm_logger try: @@ -245,7 +247,7 @@ def _create_processor( enable_thinking=enable_thinking, ) - def _json_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: + def _json_processor(self, schemata: str, enable_thinking: bool = False) -> Optional[XGrammarProcessor]: """ Compile JSON schema into a grammar processor. @@ -263,7 +265,7 @@ def _json_processor(self, schemata: str, enable_thinking: bool=False) -> Optiona return None return self._create_processor(compiled_grammar, enable_thinking=enable_thinking) - def _regex_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: + def _regex_processor(self, schemata: str, enable_thinking: bool = False) -> Optional[XGrammarProcessor]: """ Compile regex pattern into a grammar processor. @@ -281,7 +283,7 @@ def _regex_processor(self, schemata: str, enable_thinking: bool=False) -> Option return None return self._create_processor(compiled_grammar, enable_thinking=enable_thinking) - def _grammar_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: + def _grammar_processor(self, schemata: str, enable_thinking: bool = False) -> Optional[XGrammarProcessor]: """ Compile grammar (EBNF) into a grammar processor. @@ -299,7 +301,7 @@ def _grammar_processor(self, schemata: str, enable_thinking: bool=False) -> Opti return None return self._create_processor(compiled_grammar, enable_thinking=enable_thinking) - def _structural_tag_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: + def _structural_tag_processor(self, schemata: str, enable_thinking: bool = False) -> Optional[XGrammarProcessor]: """ Compile structural tags into a grammar processor. From b8f8d717c9c8a5fd81e2a7d11873de3dc0fef005 Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 4 Aug 2025 10:14:58 +0800 Subject: [PATCH 05/20] update code --- fastdeploy/worker/gpu_model_runner.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 6a6b52a7e2..c4d623cd70 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1234,11 +1234,7 @@ def _get_skip_idx(self, model_forward_batch: Optional[List[Request]] = None): Returns: A list of indices corresponding to the requests that need to be skipped. 
""" - if ( - not self.parallel_config.enable_chunked_prefill - or self.guided_backend is None - or model_forward_batch is None - ): + if not self.cache_config.enable_chunked_prefill or self.guided_backend is None or model_forward_batch is None: return [] skip_idx_list = [] From ce01f296ffed6cb9cb4823604f6f45ba3d394080 Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 4 Aug 2025 19:05:58 +0800 Subject: [PATCH 06/20] update code --- fastdeploy/worker/gpu_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index c4d623cd70..33ed69ddd2 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1380,7 +1380,8 @@ class at the server level, which is too granular for ModelRunner. speculative_decoding=self.speculative_decoding, skip_save_output=skip_save_output, ) - self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) + if sampler_output is not None: + self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) # 6. Speculative decode if self.speculative_decoding: From c2d64b9f9ab4f7a8ca2b1159ace1de1a0c8467ea Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 4 Aug 2025 20:26:16 +0800 Subject: [PATCH 07/20] add enable_thinking default --- fastdeploy/input/ernie_vl_processor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py index 4df4c38ec8..1e3ebd92a4 100644 --- a/fastdeploy/input/ernie_vl_processor.py +++ b/fastdeploy/input/ernie_vl_processor.py @@ -104,6 +104,9 @@ def set_value(req, key, value): set_value(request, "repetition_penalty", 1.0) set_value(request, "frequency_penalty", 0.0) set_value(request, "presence_penalty", 0.0) + + enable_thinking = self.get_enable_thinking(request.get("enable_thinking", None)) + set_value(request, "enable_thinking", enable_thinking) return request def process_request(self, request, max_model_len=None, **kwargs): From da81a946248deb241bfbe85f966b85fb4c27fe94 Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 5 Aug 2025 17:10:53 +0800 Subject: [PATCH 08/20] update code --- fastdeploy/engine/engine.py | 55 +++++++++++++++------------ fastdeploy/worker/gpu_model_runner.py | 4 +- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 0475278382..4f4aaf6da0 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -436,19 +436,14 @@ def _insert_zmq_task_to_scheduler(self): llm_logger.debug(f"Receive request: {request}") err_msg = None - if ( - request.guided_json is not None - or request.guided_regex is not None - or request.structural_tag is not None - or request.guided_grammar is not None - ) and self.guided_decoding_checker is None: - err_msg = ( - "guided_backend is None, use --guided-decoding-backend to " - "specify the backend at server startup." - ) - - if self.guided_decoding_checker is not None: - request, err_msg = self.guided_decoding_checker.schema_format(request) + if self._has_guided_input(request): + if self.guided_decoding_checker is None: + err_msg = ( + "guided_backend is None, use --guided-decoding-backend to " + "specify the backend at server startup." 
+ ) + else: + request, err_msg = self.guided_decoding_checker.schema_format(request) if err_msg is not None: llm_logger.error(err_msg) @@ -488,6 +483,20 @@ def _insert_zmq_task_to_scheduler(self): f"traceback={traceback.format_exc()}" ) + def _has_guided_input(self, request): + """ + Check if the request has any guided input. + """ + return any( + x is not None + for x in ( + request.guided_json, + request.guided_regex, + request.structural_tag, + request.guided_grammar, + ) + ) + def add_requests(self, task, sampling_params=None, **kwargs): """ Add a new request to the queue. @@ -541,18 +550,14 @@ def add_requests(self, task, sampling_params=None, **kwargs): llm_logger.error(error_msg) raise EngineError(error_msg, error_code=400) - if ( - request.guided_json is not None - or request.guided_regex is not None - or request.structural_tag is not None - or request.guided_grammar is not None - ) and self.guided_decoding_checker is None: - err_msg = "guided_backend is None, use --guided-decoding-backend to specify the backend at server startup." - llm_logger.error(err_msg) - raise EngineError(err_msg, error_code=400) - - if self.guided_decoding_checker is not None: - request, err_msg = self.guided_decoding_checker.schema_format(request) + if self._has_guided_input(request): + err_msg = None + if self.guided_decoding_checker is None: + err_msg = ( + "guided_backend is None, use --guided-decoding-backend to specify the backend at server startup." + ) + else: + request, err_msg = self.guided_decoding_checker.schema_format(request) if err_msg is not None: llm_logger.error(err_msg) raise EngineError(err_msg, error_code=400) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 33ed69ddd2..912db4cdc9 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1380,7 +1380,7 @@ class at the server level, which is too granular for ModelRunner. speculative_decoding=self.speculative_decoding, skip_save_output=skip_save_output, ) - if sampler_output is not None: + if self.guided_backend is not None and sampler_output is not None: self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) # 6. Speculative decode @@ -1410,7 +1410,7 @@ def _add_cache(self, model_forward_batch) -> None: """ Add cache for guided decoding. """ - if self.guided_backend is None: + if self.guided_backend is None or model_forward_batch is None: return for request in model_forward_batch: From 255783922d9f1df4a876508e80c4329660290d3e Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 8 Aug 2025 16:09:36 +0800 Subject: [PATCH 09/20] add structured_outputs test case --- test/ci_use/EB_Lite/test_EB_Lite_serving.py | 333 +++++++++++++++++ .../EB_VL_Lite/test_EB_VL_Lite_serving.py | 341 ++++++++++++++++++ 2 files changed, 674 insertions(+) diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py index 85cddcba1c..b08821fd89 100644 --- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
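The engine changes above centralise the guided-input check in a `_has_guided_input` helper and reuse it both for tasks arriving over ZMQ and in `add_requests`. A minimal sketch of the same pattern follows; the `Request` fields mirror the ones the engine inspects, while the class itself and the `validate` helper are illustrative, not the engine's actual code.

```python
# Sketch only: detect guided-decoding constraints on a request and reject them
# when no guided-decoding checker/backend is configured.
from dataclasses import dataclass
from typing import Optional

@dataclass
class Request:
    guided_json: Optional[dict] = None
    guided_regex: Optional[str] = None
    guided_grammar: Optional[str] = None
    structural_tag: Optional[str] = None

def has_guided_input(request: Request) -> bool:
    # True if any guided-decoding constraint was supplied with the request.
    return any(
        x is not None
        for x in (request.guided_json, request.guided_regex, request.structural_tag, request.guided_grammar)
    )

def validate(request: Request, checker=None) -> Optional[str]:
    # Returns an error message (to be logged and turned into a 400 response) or None.
    if not has_guided_input(request):
        return None
    if checker is None:
        return "guided_backend is None, use --guided-decoding-backend to specify the backend at server startup."
    return None  # a real checker would normalise the schema here (schema_format)

print(validate(Request(guided_regex=r"\d+")))  # error message when no checker is configured
```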
+import json import os import signal import socket @@ -108,6 +109,8 @@ def setup_and_run_server(): "--use-cudagraph", "--graph-optimization-config", '{"cudagraph_capture_sizes": [1]}', + "--guided-decoding-backend", + "auto", ] # Start subprocess in new process group @@ -939,3 +942,333 @@ def test_streaming_completion_with_bad_words(openai_client, capsys): assert hasattr(chunk.choices[0], "text") output_1.append(chunk.choices[0].text) assert output_0 not in output_1 + + +def test_streaming_chat_base(openai_client, chat_param): + """ + Test streaming chat base functionality with the local service + """ + assert isinstance(chat_param, dict), f"{chat_param} should be a dict" + assert "messages" in chat_param, f"{chat_param} should contain messages" + + response = openai_client.chat.completions.create( + model="default", + stream=True, + **chat_param, + ) + + output = [] + for chunk in response: + if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"): + output.append(chunk.choices[0].delta.content) + assert len(output) > 2 + return "".join(output) + + +def test_non_streaming_chat_base(openai_client, chat_param): + """ + Test non streaming chat base functionality with the local service + """ + assert isinstance(chat_param, dict), f"{chat_param} should be a dict" + assert "messages" in chat_param, f"{chat_param} should contain messages" + + response = openai_client.chat.completions.create( + model="default", + stream=False, + **chat_param, + ) + + assert hasattr(response, "choices") + assert len(response.choices) > 0 + assert hasattr(response.choices[0], "message") + assert hasattr(response.choices[0].message, "content") + return response.choices[0].message.content + + +def test_structured_outputs_json_schema(openai_client): + """ + Test structured outputs json_schema functionality with the local service + """ + chat_param = { + "temperature": 1, + "max_tokens": 1024, + } + + # json_object + json_chat_param = { + "messages": [ + { + "role": "user", + "content": "Generate a JSON object containing: names of China's Four Great Inventions, their dynasties of origin, and brief descriptions (each under 50 characters)", + } + ], + "response_format": {"type": "json_object"}, + } + json_chat_param.update(chat_param) + + response = test_streaming_chat_base(openai_client, json_chat_param) + try: + json.loads(response) + is_valid = True + except ValueError: + is_valid = False + + assert is_valid, f"json_schema streaming response: {response} is not a valid json" + + response = test_non_streaming_chat_base(openai_client, json_chat_param) + try: + json.loads(response) + is_valid = True + except ValueError: + is_valid = False + + assert is_valid, f"json_schema non_streaming response: {response} is not a valid json" + + # json_schema + from enum import Enum + + from pydantic import BaseModel + + class BookType(str, Enum): + romance = "Romance" + historical = "Historical" + adventure = "Adventure" + mystery = "Mystery" + dystopian = "Dystopian" + + class BookDescription(BaseModel): + author: str + title: str + genre: BookType + + json_schema_param = { + "messages": [ + { + "role": "user", + "content": "Generate a JSON describing a literary work, including author, title and book type.", + } + ], + "response_format": { + "type": "json_schema", + "json_schema": {"name": "book-description", "schema": BookDescription.model_json_schema()}, + }, + } + json_schema_param.update(chat_param) + response = test_streaming_chat_base(openai_client, json_schema_param) + try: + json_schema_response = 
json.loads(response) + is_valid = True + except ValueError: + is_valid = False + + assert is_valid, f"json_schema streaming response: {response} is not a valid json" + assert ( + "author" in json_schema_response and "title" in json_schema_response and "genre" in json_schema_response + ), f"json_schema streaming response: {response} is not a valid book-description" + assert json_schema_response["genre"] in { + genre.value for genre in BookType + }, f"json_schema streaming response: {json_schema_response['genre']} is not a valid book-type" + + response = test_non_streaming_chat_base(openai_client, json_schema_param) + try: + json_schema_response = json.loads(response) + is_valid = True + except ValueError: + is_valid = False + + assert is_valid, f"json_schema non_streaming response: {response} is not a valid json" + assert ( + "author" in json_schema_response and "title" in json_schema_response and "genre" in json_schema_response + ), f"json_schema non_streaming response: {response} is not a valid book-description" + assert json_schema_response["genre"] in { + genre.value for genre in BookType + }, f"json_schema non_streaming response: {json_schema_response['genre']} is not a valid book-type" + + +def test_structured_outputs_structural_tag(openai_client): + """ + Test structured outputs structural_tag functionality with the local service + """ + content_str = """ + You have the following function available: + + { + "name": "get_current_date", + "description": "Get current date and time for given timezone", + "parameters": { + "type": "object", + "properties": { + "timezone": { + "type": "string", + "description": "Timezone to get current date/time, e.g.: Asia/Shanghai", + } + }, + "required": ["timezone"], + } + } + + If you choose to call only this function, reply in this format: + <{start_tag}={function_name}>{parameters}{end_tag} + where: + + start_tag => ` JSON dictionary with parameter names as keys + end_tag => `` + + Example: + {"param": "value"} + + Note: + - Function call must follow specified format + - Required parameters must be specified + - Only one function can be called at a time + - Place entire function call response on a single line + + You are an AI assistant. Answer the following question. + """ + + structural_tag_param = { + "temperature": 1, + "max_tokens": 1024, + "messages": [ + { + "role": "system", + "content": content_str, + }, + { + "role": "user", + "content": "You're traveling to Shanghai today", + }, + ], + "response_format": { + "type": "structural_tag", + "structures": [ + { + "begin": "", + "schema": { + "type": "object", + "properties": { + "timezone": { + "type": "string", + "description": "Timezone to get current date/time, e.g.: Asia/Shanghai", + } + }, + "required": ["timezone"], + }, + "end": "", + } + ], + "triggers": ["" text "" + + style_attribute ::= " style=" dq style_value dq + + style_value ::= (font_style ("; " font_weight)?) | (font_weight ("; " font_style)?) 
+ + font_style ::= "font-family: '" font_name "'" + + font_weight ::= "font-weight: " weight_value + + font_name ::= "Arial" | "Times New Roman" | "Courier New" + + weight_value ::= "normal" | "bold" + + text ::= [A-Za-z0-9 ]+ + + dq ::= ["] + """ + + grammar_param = { + "temperature": 1, + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": "Generate HTML code for this heading in bold Times New Roman font: ERNIE Bot", + } + ], + "extra_body": {"guided_grammar": html_h1_grammar}, + } + + import re + + pattern = r'^[A-Za-z0-9 ]+$' + response = test_streaming_chat_base(openai_client, grammar_param) + assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected" + response = test_non_streaming_chat_base(openai_client, grammar_param) + assert re.fullmatch(pattern, response), f"grammar non_streaming response: {response} is not as expected" diff --git a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index fb31a655f8..dc1e906283 100644 --- a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -118,6 +118,8 @@ def setup_and_run_server(): "wint4", "--reasoning-parser", "ernie-45-vl", + "--guided-decoding-backend", + "auto", ] # Start subprocess in new process group @@ -535,3 +537,342 @@ def test_chat_with_thinking(openai_client, capsys): total_tokens += len(delta_message.completion_token_ids) assert completion_tokens + reasoning_tokens == total_tokens assert reasoning_tokens <= reasoning_max_tokens + + +def test_streaming_chat_base(openai_client, chat_param): + """ + Test streaming chat base functionality with the local service + """ + assert isinstance(chat_param, dict), f"{chat_param} should be a dict" + assert "messages" in chat_param, f"{chat_param} should contain messages" + + response = openai_client.chat.completions.create( + model="default", + stream=True, + **chat_param, + ) + + output = [] + for chunk in response: + if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"): + output.append(chunk.choices[0].delta.content) + assert len(output) > 2 + return "".join(output) + + +def test_non_streaming_chat_base(openai_client, chat_param): + """ + Test non streaming chat base functionality with the local service + """ + assert isinstance(chat_param, dict), f"{chat_param} should be a dict" + assert "messages" in chat_param, f"{chat_param} should contain messages" + + response = openai_client.chat.completions.create( + model="default", + stream=False, + **chat_param, + ) + + assert hasattr(response, "choices") + assert len(response.choices) > 0 + assert hasattr(response.choices[0], "message") + assert hasattr(response.choices[0].message, "content") + return response.choices[0].message.content + + +def test_structured_outputs_json_schema(openai_client): + """ + Test structured outputs json_schema functionality with the local service + """ + chat_param = { + "temperature": 1, + "max_tokens": 1024, + } + + # json_object + json_chat_param = { + "messages": [ + {"role": "system", "content": "You are a helpful AI assistant."}, + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", + "detail": "high", + }, + }, + {"type": "text", "text": "请描述图片内容,使用json格式输出结果"}, + ], + }, + ], + "response_format": {"type": "json_object"}, + } + json_chat_param.update(chat_param) + + outputs = [] + 
outputs.append(test_streaming_chat_base(openai_client, json_chat_param)) + outputs.append(test_non_streaming_chat_base(openai_client, json_chat_param)) + + json_chat_param["extra_body"] = {"chat_template_kwargs": {"enable_thinking": False}} + outputs.append(test_streaming_chat_base(openai_client, json_chat_param)) + outputs.append(test_non_streaming_chat_base(openai_client, json_chat_param)) + + for response in outputs: + try: + json.loads(response) + is_valid = True + except ValueError: + is_valid = False + + assert is_valid, f"json_object response: {response} is not a valid json" + + # json_schema + from enum import Enum + + from pydantic import BaseModel + + class BookType(str, Enum): + romance = "Romance" + historical = "Historical" + adventure = "Adventure" + mystery = "Mystery" + dystopian = "Dystopian" + + class BookDescription(BaseModel): + author: str + title: str + genre: BookType + + json_schema_param = { + "messages": [ + { + "role": "user", + "content": "Generate a JSON describing a literary work, including author, title and book type.", + } + ], + "response_format": { + "type": "json_schema", + "json_schema": {"name": "book-description", "schema": BookDescription.model_json_schema()}, + }, + } + json_schema_param.update(chat_param) + response = test_streaming_chat_base(openai_client, json_schema_param) + try: + json_schema_response = json.loads(response) + is_valid = True + except ValueError: + is_valid = False + + assert is_valid, f"json_schema streaming response: {response} is not a valid json" + assert ( + "author" in json_schema_response and "title" in json_schema_response and "genre" in json_schema_response + ), f"json_schema streaming response: {response} is not a valid book-description" + assert json_schema_response["genre"] in { + genre.value for genre in BookType + }, f"json_schema streaming response: {json_schema_response['genre']} is not a valid book-type" + + response = test_non_streaming_chat_base(openai_client, json_schema_param) + try: + json_schema_response = json.loads(response) + is_valid = True + except ValueError: + is_valid = False + + assert is_valid, f"json_schema non_streaming response: {response} is not a valid json" + assert ( + "author" in json_schema_response and "title" in json_schema_response and "genre" in json_schema_response + ), f"json_schema non_streaming response: {response} is not a valid book-description" + assert json_schema_response["genre"] in { + genre.value for genre in BookType + }, f"json_schema non_streaming response: {json_schema_response['genre']} is not a valid book-type" + + +def test_structured_outputs_structural_tag(openai_client): + """ + Test structured outputs structural_tag functionality with the local service + """ + content_str = """ + You have the following function available: + + { + "name": "get_current_date", + "description": "Get current date and time for given timezone", + "parameters": { + "type": "object", + "properties": { + "timezone": { + "type": "string", + "description": "Timezone to get current date/time, e.g.: Asia/Shanghai", + } + }, + "required": ["timezone"], + } + } + + If you choose to call only this function, reply in this format: + <{start_tag}={function_name}>{parameters}{end_tag} + where: + + start_tag => ` JSON dictionary with parameter names as keys + end_tag => `` + + Example: + {"param": "value"} + + Note: + - Function call must follow specified format + - Required parameters must be specified + - Only one function can be called at a time + - Place entire function call response on a single 
line + + You are an AI assistant. Answer the following question. + """ + + structural_tag_param = { + "temperature": 1, + "max_tokens": 1024, + "messages": [ + { + "role": "system", + "content": content_str, + }, + { + "role": "user", + "content": "You're traveling to Shanghai today", + }, + ], + "response_format": { + "type": "structural_tag", + "structures": [ + { + "begin": "", + "schema": { + "type": "object", + "properties": { + "timezone": { + "type": "string", + "description": "Timezone to get current date/time, e.g.: Asia/Shanghai", + } + }, + "required": ["timezone"], + }, + "end": "", + } + ], + "triggers": ["" text "" + + style_attribute ::= " style=" dq style_value dq + + style_value ::= (font_style ("; " font_weight)?) | (font_weight ("; " font_style)?) + + font_style ::= "font-family: '" font_name "'" + + font_weight ::= "font-weight: " weight_value + + font_name ::= "Arial" | "Times New Roman" | "Courier New" + + weight_value ::= "normal" | "bold" + + text ::= [A-Za-z0-9 ]+ + + dq ::= ["] + """ + + grammar_param = { + "temperature": 1, + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": "Generate HTML code for this heading in bold Times New Roman font: ERNIE Bot", + } + ], + "extra_body": {"guided_grammar": html_h1_grammar}, + } + + import re + + pattern = r'^[A-Za-z0-9 ]+$' + response = test_streaming_chat_base(openai_client, grammar_param) + assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected" + response = test_non_streaming_chat_base(openai_client, grammar_param) + assert re.fullmatch(pattern, response), f"grammar non_streaming response: {response} is not as expected" From 3ff2a4def367edd5ae76203eb4a462f990f4e41b Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 8 Aug 2025 17:41:42 +0800 Subject: [PATCH 10/20] add ci install xgrammar --- scripts/run_pre_ce.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/run_pre_ce.sh b/scripts/run_pre_ce.sh index 726b91e857..4ffd041ef9 100644 --- a/scripts/run_pre_ce.sh +++ b/scripts/run_pre_ce.sh @@ -7,6 +7,7 @@ python -m pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/p python -m pip install -r requirements.txt python -m pip install jsonschema aistudio_sdk==0.3.5 +python -m pip install xgrammar==0.1.19 failed_files=() run_path="$DIR/../test/ci_use/" From 83df9a4b49d3e75daccaf48311c96500525f9013 Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 8 Aug 2025 19:21:54 +0800 Subject: [PATCH 11/20] add ci timeout time --- scripts/run_pre_ce.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_pre_ce.sh b/scripts/run_pre_ce.sh index 4ffd041ef9..b2f2564d46 100644 --- a/scripts/run_pre_ce.sh +++ b/scripts/run_pre_ce.sh @@ -25,7 +25,7 @@ for subdir in "$run_path"*/; do echo "------------------------------------------------------------" set +e - timeout 600 python -m pytest --disable-warnings -sv "$file" + timeout 1200 python -m pytest --disable-warnings -sv "$file" exit_code=$? 
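The json_schema tests added above check the decoded output by hand (json.loads plus key-membership asserts). Since the schema is generated from a pydantic model, the same check can also be expressed as a single validation call; the sketch below assumes pydantic v2 (model_validate_json), reuses the BookDescription/BookType definitions from the tests, and uses a made-up sample response string.

```python
# Sketch only: validate a structured-output response against the pydantic model
# that produced the JSON schema used in the request.
from enum import Enum
from pydantic import BaseModel, ValidationError

class BookType(str, Enum):
    romance = "Romance"
    historical = "Historical"
    adventure = "Adventure"
    mystery = "Mystery"
    dystopian = "Dystopian"

class BookDescription(BaseModel):
    author: str
    title: str
    genre: BookType

# Hypothetical model response, for illustration only.
response = '{"author": "Agatha Christie", "title": "And Then There Were None", "genre": "Mystery"}'

try:
    book = BookDescription.model_validate_json(response)  # parses and type-checks in one step
except ValidationError as e:
    raise AssertionError(f"json_schema response is not a valid BookDescription: {e}")
print(book.author, book.genre.value)
```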
set -e From 9a41035b9720ad1f8c26ea5fa72ef3390e44caca Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 11 Aug 2025 14:00:57 +0800 Subject: [PATCH 12/20] update test for structured_outputs --- scripts/run_pre_ce.sh | 2 +- test/ci_use/EB_Lite/test_EB_Lite_serving.py | 28 ++++++++-------- .../EB_VL_Lite/test_EB_VL_Lite_serving.py | 32 +++++++++---------- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/scripts/run_pre_ce.sh b/scripts/run_pre_ce.sh index b2f2564d46..4ffd041ef9 100644 --- a/scripts/run_pre_ce.sh +++ b/scripts/run_pre_ce.sh @@ -25,7 +25,7 @@ for subdir in "$run_path"*/; do echo "------------------------------------------------------------" set +e - timeout 1200 python -m pytest --disable-warnings -sv "$file" + timeout 600 python -m pytest --disable-warnings -sv "$file" exit_code=$? set -e diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py index b08821fd89..9c1689fcb3 100644 --- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -944,7 +944,7 @@ def test_streaming_completion_with_bad_words(openai_client, capsys): assert output_0 not in output_1 -def test_streaming_chat_base(openai_client, chat_param): +def streaming_chat_base(openai_client, chat_param): """ Test streaming chat base functionality with the local service """ @@ -965,7 +965,7 @@ def test_streaming_chat_base(openai_client, chat_param): return "".join(output) -def test_non_streaming_chat_base(openai_client, chat_param): +def non_streaming_chat_base(openai_client, chat_param): """ Test non streaming chat base functionality with the local service """ @@ -1006,7 +1006,7 @@ def test_structured_outputs_json_schema(openai_client): } json_chat_param.update(chat_param) - response = test_streaming_chat_base(openai_client, json_chat_param) + response = streaming_chat_base(openai_client, json_chat_param) try: json.loads(response) is_valid = True @@ -1015,7 +1015,7 @@ def test_structured_outputs_json_schema(openai_client): assert is_valid, f"json_schema streaming response: {response} is not a valid json" - response = test_non_streaming_chat_base(openai_client, json_chat_param) + response = non_streaming_chat_base(openai_client, json_chat_param) try: json.loads(response) is_valid = True @@ -1054,7 +1054,7 @@ class BookDescription(BaseModel): }, } json_schema_param.update(chat_param) - response = test_streaming_chat_base(openai_client, json_schema_param) + response = streaming_chat_base(openai_client, json_schema_param) try: json_schema_response = json.loads(response) is_valid = True @@ -1069,7 +1069,7 @@ class BookDescription(BaseModel): genre.value for genre in BookType }, f"json_schema streaming response: {json_schema_response['genre']} is not a valid book-type" - response = test_non_streaming_chat_base(openai_client, json_schema_param) + response = non_streaming_chat_base(openai_client, json_schema_param) try: json_schema_response = json.loads(response) is_valid = True @@ -1163,10 +1163,10 @@ def test_structured_outputs_structural_tag(openai_client): } expect_str = '{"timezone": "Asia/Shanghai"}' - response = test_streaming_chat_base(openai_client, structural_tag_param) + response = streaming_chat_base(openai_client, structural_tag_param) assert response == expect_str, f"structural_tag streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, structural_tag_param) + response = non_streaming_chat_base(openai_client, structural_tag_param) assert response == expect_str, 
f"structural_tag non_streaming response: {response} is not as expected" @@ -1183,14 +1183,14 @@ def test_structured_outputs_choice(openai_client): }, } - response = test_streaming_chat_base(openai_client, choice_param) + response = streaming_chat_base(openai_client, choice_param) assert response in [ "Ping An Finance Centre", "China Resources Headquarters", "KK100", "Diwang Mansion", ], f"choice streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, choice_param) + response = non_streaming_chat_base(openai_client, choice_param) assert response in [ "Ping An Finance Centre", "China Resources Headquarters", @@ -1217,11 +1217,11 @@ def test_structured_outputs_regex(openai_client): import re - response = test_streaming_chat_base(openai_client, regex_param) + response = streaming_chat_base(openai_client, regex_param) assert re.fullmatch( r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response ), f"regex streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, regex_param) + response = non_streaming_chat_base(openai_client, regex_param) assert re.fullmatch( r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response ), f"regex non_streaming response: {response} is not as expected" @@ -1268,7 +1268,7 @@ def test_structured_outputs_grammar(openai_client): import re pattern = r'^[A-Za-z0-9 ]+$' - response = test_streaming_chat_base(openai_client, grammar_param) + response = streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, grammar_param) + response = non_streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar non_streaming response: {response} is not as expected" diff --git a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index 8fd5cb217c..42dd91ee3e 100644 --- a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -539,7 +539,7 @@ def test_chat_with_thinking(openai_client, capsys): assert reasoning_tokens <= reasoning_max_tokens -def test_streaming_chat_base(openai_client, chat_param): +def streaming_chat_base(openai_client, chat_param): """ Test streaming chat base functionality with the local service """ @@ -560,7 +560,7 @@ def test_streaming_chat_base(openai_client, chat_param): return "".join(output) -def test_non_streaming_chat_base(openai_client, chat_param): +def non_streaming_chat_base(openai_client, chat_param): """ Test non streaming chat base functionality with the local service """ @@ -612,12 +612,12 @@ def test_structured_outputs_json_schema(openai_client): json_chat_param.update(chat_param) outputs = [] - outputs.append(test_streaming_chat_base(openai_client, json_chat_param)) - outputs.append(test_non_streaming_chat_base(openai_client, json_chat_param)) + outputs.append(streaming_chat_base(openai_client, json_chat_param)) + outputs.append(non_streaming_chat_base(openai_client, json_chat_param)) json_chat_param["extra_body"] = {"chat_template_kwargs": {"enable_thinking": False}} - outputs.append(test_streaming_chat_base(openai_client, json_chat_param)) - outputs.append(test_non_streaming_chat_base(openai_client, json_chat_param)) + outputs.append(streaming_chat_base(openai_client, json_chat_param)) + outputs.append(non_streaming_chat_base(openai_client, json_chat_param)) for response in 
outputs: try: @@ -658,7 +658,7 @@ class BookDescription(BaseModel): }, } json_schema_param.update(chat_param) - response = test_streaming_chat_base(openai_client, json_schema_param) + response = streaming_chat_base(openai_client, json_schema_param) try: json_schema_response = json.loads(response) is_valid = True @@ -673,7 +673,7 @@ class BookDescription(BaseModel): genre.value for genre in BookType }, f"json_schema streaming response: {json_schema_response['genre']} is not a valid book-type" - response = test_non_streaming_chat_base(openai_client, json_schema_param) + response = non_streaming_chat_base(openai_client, json_schema_param) try: json_schema_response = json.loads(response) is_valid = True @@ -767,10 +767,10 @@ def test_structured_outputs_structural_tag(openai_client): } expect_str = '{"timezone": "Asia/Shanghai"}' - response = test_streaming_chat_base(openai_client, structural_tag_param) + response = streaming_chat_base(openai_client, structural_tag_param) assert response == expect_str, f"structural_tag streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, structural_tag_param) + response = non_streaming_chat_base(openai_client, structural_tag_param) assert response == expect_str, f"structural_tag non_streaming response: {response} is not as expected" @@ -787,14 +787,14 @@ def test_structured_outputs_choice(openai_client): }, } - response = test_streaming_chat_base(openai_client, choice_param) + response = streaming_chat_base(openai_client, choice_param) assert response in [ "Ping An Finance Centre", "China Resources Headquarters", "KK100", "Diwang Mansion", ], f"choice streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, choice_param) + response = non_streaming_chat_base(openai_client, choice_param) assert response in [ "Ping An Finance Centre", "China Resources Headquarters", @@ -821,11 +821,11 @@ def test_structured_outputs_regex(openai_client): import re - response = test_streaming_chat_base(openai_client, regex_param) + response = streaming_chat_base(openai_client, regex_param) assert re.fullmatch( r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response ), f"regex streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, regex_param) + response = non_streaming_chat_base(openai_client, regex_param) assert re.fullmatch( r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response ), f"regex non_streaming response: {response} is not as expected" @@ -872,7 +872,7 @@ def test_structured_outputs_grammar(openai_client): import re pattern = r'^[A-Za-z0-9 ]+$' - response = test_streaming_chat_base(openai_client, grammar_param) + response = streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, grammar_param) + response = non_streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar non_streaming response: {response} is not as expected" From f0ea9993737c8e5b6ea17169f443d485ada1e7f8 Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 11 Aug 2025 14:20:22 +0800 Subject: [PATCH 13/20] update code --- fastdeploy/worker/gpu_model_runner.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 4fbbfb00f2..4e7f344ee6 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ 
b/fastdeploy/worker/gpu_model_runner.py @@ -272,7 +272,9 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = ) input_ids = request.prompt_token_ids + request.output_token_ids - logger.debug(f"Handle prefill request {request} at idx {idx} prefill_start_index {prefill_start_index} prefill_end_index {prefill_end_index} need_prefilled_token_num {len(input_ids)}") + logger.debug( + f"Handle prefill request {request} at idx {idx} prefill_start_index {prefill_start_index} prefill_end_index {prefill_end_index} need_prefilled_token_num {len(input_ids)}" + ) self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array( input_ids[prefill_start_index:prefill_end_index] ) From 1fe01e7a8110e07260b3dba17b689ff37bb69b58 Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 15 Aug 2025 10:22:13 +0800 Subject: [PATCH 14/20] add error traceback info --- fastdeploy/cache_manager/cache_messager.py | 3 +- .../cache_manager/cache_transfer_manager.py | 3 +- .../cache_manager/prefix_cache_manager.py | 13 +++++---- fastdeploy/engine/engine.py | 8 ++--- fastdeploy/engine/expert_service.py | 4 +-- .../engine/sched/resource_manager_v1.py | 5 ++-- fastdeploy/entrypoints/api_server.py | 3 +- fastdeploy/entrypoints/engine_client.py | 5 ++-- fastdeploy/entrypoints/llm.py | 2 +- fastdeploy/entrypoints/openai/api_server.py | 5 ++-- fastdeploy/entrypoints/openai/serving_chat.py | 26 ++++++++++++----- .../entrypoints/openai/serving_completion.py | 29 +++++++++++++++---- .../tool_parsers/ernie_x1_tool_parser.py | 19 ++++++++---- fastdeploy/input/ernie_vl_processor.py | 4 ++- .../inter_communicator/engine_cache_queue.py | 3 +- fastdeploy/inter_communicator/zmq_client.py | 9 +++--- .../guided_decoding/base_guided_decoding.py | 3 +- .../guided_decoding/xgrammar_backend.py | 9 +++--- fastdeploy/output/token_processor.py | 4 +-- fastdeploy/platforms/cuda.py | 5 +++- fastdeploy/platforms/dcu.py | 5 +++- fastdeploy/platforms/gcu.py | 5 +++- fastdeploy/platforms/maca.py | 4 ++- fastdeploy/platforms/xpu.py | 5 +++- fastdeploy/scheduler/global_scheduler.py | 2 +- fastdeploy/scheduler/splitwise_scheduler.py | 24 ++++++++------- fastdeploy/splitwise/splitwise_connector.py | 7 +++-- fastdeploy/worker/utils.py | 3 +- test/ce/accuracy_cases/gsm8k.py | 3 +- test/ce/deploy/deploy.py | 27 ++++++++++++----- 30 files changed, 164 insertions(+), 83 deletions(-) diff --git a/fastdeploy/cache_manager/cache_messager.py b/fastdeploy/cache_manager/cache_messager.py index 456ba1c342..409941f7d8 100644 --- a/fastdeploy/cache_manager/cache_messager.py +++ b/fastdeploy/cache_manager/cache_messager.py @@ -17,6 +17,7 @@ import math import threading import time +import traceback import numpy as np import paddle @@ -309,4 +310,4 @@ def _prefill_layerwise_send_cache_thread(self): self.last_layer_idx = prefilled_layer_idx except Exception as e: - logger.error(f"prefill layerwise send cache thread has exception: {e}") + logger.error(f"prefill layerwise send cache thread has exception: {e}, {str(traceback.format_exc())}") diff --git a/fastdeploy/cache_manager/cache_transfer_manager.py b/fastdeploy/cache_manager/cache_transfer_manager.py index 34ccf144ca..5078a513dd 100644 --- a/fastdeploy/cache_manager/cache_transfer_manager.py +++ b/fastdeploy/cache_manager/cache_transfer_manager.py @@ -19,6 +19,7 @@ import json import queue import time +import traceback import numpy as np import paddle @@ -342,7 +343,7 @@ def do_data_transfer(self): if self.rank == 0: self.cache_task_queue.barrier3.reset() except Exception as e: - 
logger.info(f"do_data_transfer: error: {e}") + logger.info(f"do_data_transfer: error: {e}, {str(traceback.format_exc())}") def _transfer_data( self, diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index f033a565c9..e57f0f43b8 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -20,6 +20,7 @@ import sys import threading import time +import traceback import uuid from collections import defaultdict from concurrent.futures import ThreadPoolExecutor @@ -469,7 +470,7 @@ def update_cache_blocks(self, task, block_size): self.leaf_req_map[leaf_node].add(req_id) self.cache_info[req_id] = (leaf_node, input_ids) except Exception as e: - logger.error(f"update_cache_blocks, error: {type(e)} {e}") + logger.error(f"update_cache_blocks, error: {type(e)} {e}, {str(traceback.format_exc())}") raise e def request_match_blocks(self, task, block_size, *args): @@ -555,7 +556,7 @@ def request_match_blocks(self, task, block_size, *args): ) return common_block_ids, matched_token_num, hit_info except Exception as e: - logger.error(f"request_block_ids: error: {type(e)} {e}") + logger.error(f"request_block_ids: error: {type(e)} {e}, {str(traceback.format_exc())}") raise e def request_block_ids(self, task, block_size, dec_token_num, *args): @@ -660,7 +661,7 @@ def request_block_ids(self, task, block_size, dec_token_num, *args): ) return common_block_ids, unique_block_ids, hit_info except Exception as e: - logger.error(f"request_block_ids: error: {type(e)} {e}") + logger.error(f"request_block_ids: error: {type(e)} {e}, {str(traceback.format_exc())}") raise e def release_block_ids_async(self, task): @@ -709,7 +710,7 @@ def release_block_ids(self, task): ) return except Exception as e: - logger.error(f"release_block_ids: error: {type(e)} {e}") + logger.error(f"release_block_ids: error: {type(e)} {e}, {str(traceback.format_exc())}") raise e def _handle_free_gpu_node_without_cpu(self, node): @@ -899,7 +900,7 @@ def free_block_ids_async(self, need_block_num): else: self.gpu_free_task_future = None except Exception as e: - logger.error(f"free_block_ids_async: error: {type(e)} {e}") + logger.error(f"free_block_ids_async: error: {type(e)} {e}, {str(traceback.format_exc())}") raise e def free_cpu_block_ids(self, need_block_num): @@ -1218,5 +1219,5 @@ def recv_data_transfer_result(self): + f"task_cpu_block_id {task_cpu_block_id} event_type {event_type} done" ) except Exception as e: - logger.warning(f"recv_data_transfer_result: error: {e}") + logger.warning(f"recv_data_transfer_result: error: {e}, {str(traceback.format_exc())}") raise e diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index db3bdefffe..c3149b55d2 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -600,7 +600,7 @@ def receiver_loop(): time.sleep(0.001) except Exception as e: - llm_logger.error(f"Error in main loop: {e}") + llm_logger.error(f"Error in main loop: {e}, {str(traceback.format_exc())}") time.sleep(0.1) threading.Thread(target=receiver_loop, daemon=True).start() @@ -987,7 +987,7 @@ def _exit_sub_services(self): try: os.killpg(p.pid, signal.SIGTERM) except Exception as e: - print(f"Error extracting file: {e}") + print(f"Error extracting file: {e}, {str(traceback.format_exc())}") self.worker_ready_signal.clear() self.exist_task_signal.clear() self.exist_swapped_task_signal.clear() @@ -1000,7 +1000,7 @@ def _exit_sub_services(self): try: os.killpg(self.worker_proc.pid, 
signal.SIGTERM) except Exception as e: - print(f"Error extracting sub services: {e}") + print(f"Error extracting sub services: {e}, {str(traceback.format_exc())}") self.engine_worker_queue.cleanup() if hasattr(self, "zmq_server") and self.zmq_server is not None: @@ -1175,7 +1175,7 @@ def generate(self, prompts, stream): try: req_id = self._format_and_add_data(prompts) except Exception as e: - llm_logger.error(f"Error happend while adding request, details={e}") + llm_logger.error(f"Error happend while adding request, details={e}, {str(traceback.format_exc())}") raise EngineError(str(e), error_code=400) # Get the result of the current request diff --git a/fastdeploy/engine/expert_service.py b/fastdeploy/engine/expert_service.py index 9cf5f97f7f..2ed5f8924a 100644 --- a/fastdeploy/engine/expert_service.py +++ b/fastdeploy/engine/expert_service.py @@ -269,7 +269,7 @@ def receiver_loop(): time.sleep(0.001) continue except Exception as e: - llm_logger.error(f"get decode tasks error: {e}") + llm_logger.error(f"get decode tasks error: {e}, {str(traceback.format_exc())}") threading.Thread(target=receiver_loop, daemon=True).start() @@ -378,4 +378,4 @@ def start_expert_service(cfg, local_data_parallel_id, ipc_signal_suffix): expert_service.start(ipc_signal_suffix, local_data_parallel_id) expert_service.split_connector.start_receiver() except Exception as e: - llm_logger.exception(f"Expert service failed to start: {e}") + llm_logger.exception(f"Expert service failed to start: {e}, {str(traceback.format_exc())}") diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 26eba4ae09..ec8703ee0d 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -16,6 +16,7 @@ import threading import time +import traceback from collections import deque from collections.abc import Iterable from concurrent.futures import ThreadPoolExecutor @@ -389,7 +390,7 @@ def get_prefix_cached_blocks(self, request: Request): request.cache_prepare_time = time.time() - cache_prepare_time return True except Exception as e: - llm_logger.error(f"prefix match blocks error: {e}, waiting reschedule...") + llm_logger.error(f"prefix match blocks error: {e}, {str(traceback.format_exc())} waiting reschedule...") return False def add_request(self, request: Request) -> None: @@ -441,4 +442,4 @@ def finish_requests(self, request_ids: Union[str, Iterable[str]]): self.stop_flags[request.idx] = True del self.requests[req_id] except Exception as e: - llm_logger.error(e) + llm_logger.error(f"finish_request err: {e}, {str(traceback.format_exc())}") diff --git a/fastdeploy/entrypoints/api_server.py b/fastdeploy/entrypoints/api_server.py index f27c008314..4f4d7f2250 100644 --- a/fastdeploy/entrypoints/api_server.py +++ b/fastdeploy/entrypoints/api_server.py @@ -15,6 +15,7 @@ """ import json +import traceback import uvicorn from fastapi import FastAPI @@ -114,7 +115,7 @@ def launch_api_server(args) -> None: log_level="info", ) # set log level to error to avoid log except Exception as e: - api_server_logger.error(f"launch sync http server error, {e}") + api_server_logger.error(f"launch sync http server error, {e}, {str(traceback.format_exc())}") def main(): diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index daed93b8f9..cf1ebdd297 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -15,6 +15,7 @@ """ import time +import traceback import uuid 
import numpy as np @@ -141,7 +142,7 @@ def add_requests(self, task): work_process_metrics.prompt_tokens_total.inc(input_ids_len) work_process_metrics.request_prompt_tokens.observe(input_ids_len) except Exception as e: - api_server_logger.error(e) + api_server_logger.error(f"add_requests error: {e}, {str(traceback.format_exc())}") raise EngineError(str(e), error_code=400) if input_ids_len + min_tokens >= self.max_model_len: @@ -194,7 +195,7 @@ def add_requests(self, task): else: self.zmq_client.send_pyobj(task) except Exception as e: - api_server_logger.error(e) + api_server_logger.error(f"zmq_client send task error: {e}, {str(traceback.format_exc())}") raise EngineError(str(e), error_code=400) def vaild_parameters(self, data): diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index 001cfad3e0..dd48e6d00e 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -341,7 +341,7 @@ def _build_sample_logprobs(self, logprobs_lists: LogprobsLists, topk_logprobs: i return result except Exception as e: - llm_logger.error(f"Error building sample logprobs from LogprobsLists: {e}") + llm_logger.error(f"Error building sample logprobs from LogprobsLists: {e}, {str(traceback.format_exc())}") def _run_engine(self, req_ids: list[str], use_tqdm: bool, topk_logprobs: Optional[int] = None): """ diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index 2a4c0e7aba..6a5355f102 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -18,6 +18,7 @@ import os import threading import time +import traceback from collections.abc import AsyncGenerator from contextlib import asynccontextmanager from multiprocessing import current_process @@ -155,7 +156,7 @@ async def lifespan(app: FastAPI): multiprocess.mark_process_dead(os.getpid()) api_server_logger.info(f"Closing metrics client pid: {pid}") except Exception as e: - api_server_logger.warning(e) + api_server_logger.warning(f"exit error: {e}, {str(traceback.format_exc())}") app = FastAPI(lifespan=lifespan) @@ -349,7 +350,7 @@ def launch_api_server() -> None: log_level="info", ) # set log level to error to avoid log except Exception as e: - api_server_logger.error(f"launch sync http server error, {e}") + api_server_logger.error(f"launch sync http server error, {e}, {str(traceback.format_exc())}") metrics_app = FastAPI() diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index b14f28e627..91751fd1c0 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -92,7 +92,9 @@ async def create_chat_completion(self, request: ChatCompletionRequest): if isinstance(prompt_token_ids, np.ndarray): prompt_token_ids = prompt_token_ids.tolist() except Exception as e: - return ErrorResponse(code=400, message=str(e)) + error_msg = f"request[{request_id}] send to infer error: {str(e)}, {str(traceback.format_exc())}" + api_server_logger.error(error_msg) + return ErrorResponse(code=400, message=error_msg) del current_req_dict try: @@ -101,8 +103,13 @@ async def create_chat_completion(self, request: ChatCompletionRequest): await self.engine_client.semaphore.acquire() else: await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time) - except Exception: - return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}") + except Exception as e: + error_msg = ( + 
f"request[{request_id}] waiting error: {str(e)}, {str(traceback.format_exc())}, " + f"max waiting time: {self.max_waiting_time}" + ) + api_server_logger.error(error_msg) + return ErrorResponse(code=408, message=error_msg) if request.stream: return self.chat_completion_stream_generator( @@ -114,9 +121,12 @@ async def create_chat_completion(self, request: ChatCompletionRequest): request, request_id, request.model, prompt_token_ids, text_after_process ) except Exception as e: - return ErrorResponse(code=400, message=str(e)) + error_msg = f"request[{request_id}] generator error: {str(e)}, {str(traceback.format_exc())}" + api_server_logger.error(error_msg) + return ErrorResponse(code=400, message=error_msg) def _create_streaming_error_response(self, message: str) -> str: + api_server_logger.error(message) error_response = ErrorResponse( code=400, message=message, @@ -334,7 +344,9 @@ async def chat_completion_stream_generator( yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" except Exception as e: - error_data = self._create_streaming_error_response(str(e)) + error_data = self._create_streaming_error_response( + f"equest[{request_id}] generate stream error: {str(e)}, {str(traceback.format_exc())}" + ) yield f"data: {error_data}\n\n" finally: dealer.close() @@ -553,6 +565,6 @@ def _build_logprobs_response( return LogProbs(content=[sampled_entry]) except Exception as e: - api_server_logger.error("Error in _build_logprobs_response: %s", e) - api_server_logger.error(traceback.format_exc()) + error_msg = f"Error in _build_logprobs_response: {e}, {str(traceback.format_exc())}" + api_server_logger.error(error_msg) return None diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index a6aadcf060..896fb6aa32 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -16,6 +16,7 @@ import asyncio import time +import traceback import uuid from typing import List, Optional @@ -92,7 +93,9 @@ async def create_completion(self, request: CompletionRequest): else: raise ValueError("Prompt must be a string, a list of strings or a list of integers.") except Exception as e: - return ErrorResponse(message=str(e), code=400) + error_msg = f"OpenAIServingCompletion create_completion: {e}, {str(traceback.format_exc())}" + api_server_logger.error(error_msg) + return ErrorResponse(message=error_msg, code=400) if request_prompt_ids is not None: request_prompts = request_prompt_ids @@ -113,6 +116,8 @@ async def create_completion(self, request: CompletionRequest): text_after_process_list.append(current_req_dict.get("text_after_process")) prompt_batched_token_ids.append(prompt_token_ids) except Exception as e: + error_msg = f"OpenAIServingCompletion format error: {e}, {str(traceback.format_exc())}" + api_server_logger.error(error_msg) return ErrorResponse(message=str(e), code=400) del current_req_dict @@ -122,8 +127,13 @@ async def create_completion(self, request: CompletionRequest): await self.engine_client.semaphore.acquire() else: await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time) - except Exception: - return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}") + except Exception as e: + error_msg = ( + f"OpenAIServingCompletion waiting error: {e}, {str(traceback.format_exc())}, " + f"max waiting time: {self.max_waiting_time}" + ) + api_server_logger.error(error_msg) + return ErrorResponse(code=408, 
message=error_msg) if request.stream: return self.completion_stream_generator( @@ -147,10 +157,16 @@ async def create_completion(self, request: CompletionRequest): text_after_process_list=text_after_process_list, ) except Exception as e: - return ErrorResponse(code=400, message=str(e)) + error_msg = ( + f"OpenAIServingCompletion completion_full_generator error: {e}, {str(traceback.format_exc())}" + ) + api_server_logger.error(error_msg) + return ErrorResponse(code=400, message=error_msg) except Exception as e: - return ErrorResponse(message=str(e), code=400) + error_msg = f"OpenAIServingCompletion create_completion error: {e}, {str(traceback.format_exc())}" + api_server_logger.error(error_msg) + return ErrorResponse(message=error_msg, code=400) async def completion_full_generator( self, @@ -422,6 +438,7 @@ async def completion_stream_generator( choices = [] except Exception as e: + api_server_logger.error(f"Error in completion_stream_generator: {e}, {str(traceback.format_exc())}") yield f"data: {ErrorResponse(message=str(e), code=400).model_dump_json(exclude_unset=True)}\n\n" finally: del request @@ -607,5 +624,5 @@ def _build_logprobs_response( ) except Exception as e: - api_server_logger.error("Error in _build_logprobs_response: %s", e) + api_server_logger.error(f"Error in _build_logprobs_response: {str(e)}, {str(traceback.format_exc())}") return None diff --git a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py index cec1f68401..6f0534cf1e 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py @@ -14,6 +14,7 @@ import json import re +import traceback import uuid from collections.abc import Sequence from typing import Union @@ -162,10 +163,12 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) } ) except Exception as e: - data_processor_logger.debug(f"Failed to parse tool call: {str(e)}") + data_processor_logger.error( + f"Failed to parse tool call: {str(e)}, {str(traceback.format_exc())}" + ) continue except Exception as e: - data_processor_logger.debug(f"Failed to parse tool call: {str(e)}") + data_processor_logger.error(f"Failed to parse tool call: {str(e)}, {str(traceback.format_exc())}") continue if not function_call_arr: @@ -211,7 +214,9 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) ) except Exception as e: - data_processor_logger.error(f"Error in extracting tool call from response: {str(e)}") + data_processor_logger.error( + f"Error in extracting tool call from response: {str(e)}, {str(traceback.format_exc())}" + ) return ExtractedToolCallInformation(tools_called=False, tool_calls=None, content=model_output) def extract_tool_calls_streaming( @@ -302,7 +307,9 @@ def extract_tool_calls_streaming( self.streamed_args_for_tool[self.current_tool_id] = args_json return delta except Exception as e: - data_processor_logger.debug(f"Partial arguments parsing: {str(e)}") + data_processor_logger.error( + f"Partial arguments parsing: {str(e)}, {str(traceback.format_exc())}" + ) if "" in self.buffer: end_pos = self.buffer.find("") @@ -316,5 +323,7 @@ def extract_tool_calls_streaming( return delta except Exception as e: - data_processor_logger.error(f"Error in streaming tool call extraction: {str(e)}") + data_processor_logger.error( + f"Error in streaming tool call extraction: {str(e)}, {str(traceback.format_exc())}" + ) return None diff --git 
a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py index e8239f7adb..11472fe7aa 100644 --- a/fastdeploy/input/ernie_vl_processor.py +++ b/fastdeploy/input/ernie_vl_processor.py @@ -14,6 +14,8 @@ # limitations under the License. """ +import traceback + import numpy as np from paddleformers.generation import GenerationConfig @@ -151,7 +153,7 @@ def _parse_processor_kwargs(self, kwargs): return kwargs except Exception as e: - data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}") + data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}, {str(traceback.format_exc())}") return {} def _parse_limits(self, limits): diff --git a/fastdeploy/inter_communicator/engine_cache_queue.py b/fastdeploy/inter_communicator/engine_cache_queue.py index 03fae97d7d..6f56550386 100644 --- a/fastdeploy/inter_communicator/engine_cache_queue.py +++ b/fastdeploy/inter_communicator/engine_cache_queue.py @@ -16,6 +16,7 @@ import threading import time +import traceback from multiprocessing.managers import ( AcquirerProxy, BaseManager, @@ -275,5 +276,5 @@ def empty(self): try: return len(self.transfer_task_queue) == 0 except Exception as e: - logger.error(f"empty function meets error: {e}") + logger.error(f"empty function meets error: {e}, {str(traceback.format_exc())}") raise e diff --git a/fastdeploy/inter_communicator/zmq_client.py b/fastdeploy/inter_communicator/zmq_client.py index 05e55929dd..2703efe3a4 100644 --- a/fastdeploy/inter_communicator/zmq_client.py +++ b/fastdeploy/inter_communicator/zmq_client.py @@ -17,6 +17,7 @@ import os import threading import time +import traceback import msgpack import zmq @@ -135,7 +136,7 @@ def send_multipart(self, req_id, data): llm_logger.debug(f"send_multipart result: {req_id} len {len(data)} elapse: {time.time()-start_send}") except Exception as e: - llm_logger.error(f"Send result to zmq client failed: {e}") + llm_logger.error(f"Send result to zmq client failed: {e}, {str(traceback.format_exc())}") if data[-1].finished: with self.mutex: @@ -155,7 +156,7 @@ def receive_json_once(self, block=False): return None, None except Exception as e: self.close() - llm_logger.warning(f"{e}") + llm_logger.warning(f"{e}, {str(traceback.format_exc())}") return str(e), None def receive_pyobj_once(self, block=False): @@ -171,7 +172,7 @@ def receive_pyobj_once(self, block=False): return None, None except Exception as e: self.close() - llm_logger.warning(f"{e}") + llm_logger.warning(f"{e}, {str(traceback.format_exc())}") return str(e), None def _clear_ipc(self, name): @@ -206,7 +207,7 @@ def close(self): self._clear_ipc(self.file_name) self._clear_ipc(self.router_path) except Exception as e: - llm_logger.warning(f"Failed to close ZMQ connection - {e}") + llm_logger.warning(f"Failed to close ZMQ connection - {e}, {str(traceback.format_exc())}") return def __exit__(self, exc_type, exc_val, exc_tb): diff --git a/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py b/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py index 7baf2fe971..b23d0c85d8 100644 --- a/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py +++ b/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py @@ -15,6 +15,7 @@ """ import os +import traceback from concurrent.futures import ThreadPoolExecutor from fastdeploy.config import ErnieArchitectures, FDConfig @@ -300,7 +301,7 @@ def _get_tokenizer_hf(self): return tokenizer except Exception as e: - raise Exception(f"Fail to initialize hf tokenizer: {e}") + raise 
Exception(f"Fail to initialize hf tokenizer: {e}, {str(traceback.format_exc())}") def add_cache(self, schemata_key: tuple[str, str], processor: LogitsProcessorBase) -> None: """ diff --git a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py index f702a1085e..0d448d4293 100644 --- a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py +++ b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py @@ -16,6 +16,7 @@ import json import re +import traceback from typing import Any, List, Optional import paddle @@ -263,7 +264,7 @@ def _json_processor(self, schemata: str) -> Optional[XGrammarProcessor]: try: compiled_grammar = self.grammar_compiler.compile_json_schema(schemata, any_whitespace=self.any_whitespace) except Exception as e: - llm_logger.error(f"Failed to compile json schema: {e}") + llm_logger.error(f"Failed to compile json schema: {e}, {str(traceback.format_exc())}") return None return self._create_processor(compiled_grammar) @@ -280,7 +281,7 @@ def _regex_processor(self, schemata: str) -> Optional[XGrammarProcessor]: try: compiled_grammar = self.grammar_compiler.compile_regex(schemata) except Exception as e: - llm_logger.error(f"Failed to compile regex schema: {e}") + llm_logger.error(f"Failed to compile regex schema: {e}, {str(traceback.format_exc())}") return None return self._create_processor(compiled_grammar) @@ -297,7 +298,7 @@ def _grammar_processor(self, schemata: str) -> Optional[XGrammarProcessor]: try: compiled_grammar = self.grammar_compiler.compile_grammar(schemata) except Exception as e: - llm_logger.error(f"Failed to compile ebnf schema: {e}") + llm_logger.error(f"Failed to compile ebnf schema: {e}, {str(traceback.format_exc())}") return None return self._create_processor(compiled_grammar) @@ -324,7 +325,7 @@ def _structural_tag_processor(self, schemata: str) -> Optional[XGrammarProcessor compiled_grammar = self.grammar_compiler.compile_structural_tag(tags, structural_tag["triggers"]) except Exception as e: - llm_logger.error(f"Failed to compile structural tags schema: {e}") + llm_logger.error(f"Failed to compile structural tags schema: {e}, {str(traceback.format_exc())}") return None return self._create_processor(compiled_grammar) diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index ebb64cebc7..36ab0c362b 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -201,7 +201,7 @@ def process_metrics(): self.prefill_time_signal.value[current_index] = 0 current_index += 1 except Exception as e: - llm_logger.error(f"Error processing prefill metrics: {e}") + llm_logger.error(f"Error processing prefill metrics: {e}, {str(traceback.format_exc())}") self.executor.submit(process_metrics) @@ -215,7 +215,7 @@ def postprocess(self, batch_result): try: self.cached_generated_tokens.put_results(batch_result) except Exception as e: - llm_logger.error(f"Error in TokenProcessor's postprocess: {e}") + llm_logger.error(f"Error in TokenProcessor's postprocess: {e}, {str(traceback.format_exc())}") def _recycle_resources(self, task_id, index, task, result=None, is_prefill=False): """ diff --git a/fastdeploy/platforms/cuda.py b/fastdeploy/platforms/cuda.py index 6676d3c0f5..38504134a1 100644 --- a/fastdeploy/platforms/cuda.py +++ b/fastdeploy/platforms/cuda.py @@ -14,6 +14,8 @@ # limitations under the License. 
""" +import traceback + import paddle from fastdeploy.utils import console_logger as logger @@ -40,7 +42,8 @@ def available(self): logger.warning( "You are using GPU version PaddlePaddle, but there is no GPU " "detected on your machine. Maybe CUDA devices is not set properly." - f"\n Original Error is {e}" + f"\n Original Error is {e}, " + f"{str(traceback.format_exc())}" ) return False diff --git a/fastdeploy/platforms/dcu.py b/fastdeploy/platforms/dcu.py index bfd848335c..c18c45aca4 100644 --- a/fastdeploy/platforms/dcu.py +++ b/fastdeploy/platforms/dcu.py @@ -14,6 +14,8 @@ """ dcu platform file """ +import traceback + import paddle from paddleformers.utils.log import logger @@ -39,7 +41,8 @@ def available(self): logger.warning( "You are using GPU version PaddlePaddle, but there is no GPU " "detected on your machine. Maybe CUDA devices is not set properly." - f"\n Original Error is {e}" + f"\n Original Error is {e}, " + f"{str(traceback.format_exc())}" ) return False diff --git a/fastdeploy/platforms/gcu.py b/fastdeploy/platforms/gcu.py index e812113e1e..76bb170b54 100644 --- a/fastdeploy/platforms/gcu.py +++ b/fastdeploy/platforms/gcu.py @@ -14,6 +14,8 @@ # limitations under the License. """ +import traceback + import paddle from fastdeploy.utils import console_logger as logger @@ -40,7 +42,8 @@ def available(self): logger.warning( "You are using GCUPlatform, but there is no GCU " "detected on your machine. Maybe GCU devices is not set properly." - f"\n Original Error is {e}" + f"\n Original Error is {e}, " + f"{str(traceback.format_exc())}" ) return False diff --git a/fastdeploy/platforms/maca.py b/fastdeploy/platforms/maca.py index f695a3d01a..250cebf6e1 100644 --- a/fastdeploy/platforms/maca.py +++ b/fastdeploy/platforms/maca.py @@ -17,6 +17,7 @@ """ maca platform file """ +import traceback import paddle from paddleformers.utils.log import logger @@ -43,7 +44,8 @@ def available(self): logger.warning( "You are using GPU version PaddlePaddle, but there is no GPU " "detected on your machine. Maybe CUDA devices is not set properly." - f"\n Original Error is {e}" + f"\n Original Error is {e}, " + f"{str(traceback.format_exc())}" ) return False diff --git a/fastdeploy/platforms/xpu.py b/fastdeploy/platforms/xpu.py index 2f31107423..8bc8236359 100644 --- a/fastdeploy/platforms/xpu.py +++ b/fastdeploy/platforms/xpu.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import traceback + import paddle from fastdeploy.utils import console_logger as logger @@ -38,7 +40,8 @@ def available(self): logger.warning( "You are using XPU version PaddlePaddle, but there is no XPU " "detected on your machine. Maybe CUDA devices is not set properly." 
- f"\n Original Error is {e}" + f"\n Original Error is {e}, " + f"{str(traceback.format_exc())}" ) return False diff --git a/fastdeploy/scheduler/global_scheduler.py b/fastdeploy/scheduler/global_scheduler.py index 8d9b67a6a8..f3962992cc 100644 --- a/fastdeploy/scheduler/global_scheduler.py +++ b/fastdeploy/scheduler/global_scheduler.py @@ -237,7 +237,7 @@ def _keep_alive(self): ) time.sleep(self.keep_alive_duration / 2) except Exception as e: - scheduler_logger.error(f"Scheduler keep alive failed: {e}") + scheduler_logger.error(f"Scheduler keep alive failed: {e}, {str(traceback.format_exc())}") time.sleep(min(3, self.keep_alive_duration / 4)) def _scheduler_name_from_request_queue(self, request_queue: str) -> str: diff --git a/fastdeploy/scheduler/splitwise_scheduler.py b/fastdeploy/scheduler/splitwise_scheduler.py index 61dbd22309..ab1799f440 100644 --- a/fastdeploy/scheduler/splitwise_scheduler.py +++ b/fastdeploy/scheduler/splitwise_scheduler.py @@ -20,6 +20,7 @@ import random import threading import time +import traceback from collections import deque from typing import List @@ -379,7 +380,7 @@ def run(self): if total == 0: time.sleep(0.01) except Exception as e: - logger.error(f"ResultsReader{self.idx} sync results error: {e!s}") + logger.error(f"ResultsReader{self.idx} sync results error: {e!s}, {str(traceback.format_exc())}") def sync_results(self, keys): """ @@ -402,7 +403,7 @@ def sync_results(self, keys): result = RequestOutput.from_dict(data) self.data.appendleft(result) except Exception as e: - logger.error(f"Parse Result Error:{e}, {result}") + logger.error(f"Parse Result Error:{e}, {str(traceback.format_exc())}, {result}") return total @@ -498,7 +499,7 @@ def loop_schedule(self): except IndexError: continue except Exception as e: - logger.error(f"APIScheduler Schedule req error: {e!s}") + logger.error(f"APIScheduler Schedule req error: {e!s}, {str(traceback.format_exc())}") def schedule(self, req, pnodes, dnodes, mnodes, group=""): """ @@ -573,8 +574,8 @@ def loop_clear_expired_nodes(self): # logger.info(f"clear expired nodes: {nodeid}") self.client.hdel(self.cluster_key, nodeid) time.sleep(self.clear_expired_nodes_period) - except Exception: - logger.error("APIScheduler clear expired nodes error: {str(e)}") + except Exception as e: + logger.error(f"APIScheduler clear expired nodes error: {str(e)}, {str(traceback.format_exc())}") def select_pd(self, req, nodes, role): """ @@ -664,7 +665,7 @@ def run(self): # e = time.time() # logger.info(f"Lpush {self.idx}: {key} used {e-s} {len(items)} items") except Exception as e: - logger.error(f"ResultWriter write error: {e!s}") + logger.error(f"ResultWriter write error: {e!s}, {str(traceback.format_exc())}") class InferScheduler: @@ -723,7 +724,7 @@ def routine_report(self): self.client.hset(self.cluster_key, self.nodeid, info) time.sleep(self.sync_period / 1000.0) except Exception as e: - logger.error(f"InferScheduler routine report error: {e!s}") + logger.error(f"InferScheduler routine report error: {e!s}, {str(traceback.format_exc())}") def loop_expire_reqs(self): """ @@ -733,8 +734,8 @@ def loop_expire_reqs(self): try: self.node.expire_reqs(self.release_load_expire_period) time.sleep(60) - except Exception: - logger.error("InferScheduler expire reqs error: {e}") + except Exception as e: + logger.error(f"InferScheduler expire reqs error: {e}, {str(traceback.format_exc())}") def loop_get_reqs(self): """ @@ -772,7 +773,7 @@ def select_writer(req): else: self.node.add_req(req.request_id, 1) except Exception as e: - 
logger.error(f"InferScheduler loop get reqs error: {e!s}") + logger.error(f"InferScheduler loop get reqs error: {e!s}, {str(traceback.format_exc())}") def get_requests( self, @@ -807,7 +808,8 @@ def get_requests( return reqs # logger.info(f"Get Requests from Scheduler: {req.request_id}") reqs.append(req) - except Exception: + except Exception as e: + logger.error(f"InferScheduler get requests error: {e}, {str(traceback.format_exc())}") return reqs return reqs diff --git a/fastdeploy/splitwise/splitwise_connector.py b/fastdeploy/splitwise/splitwise_connector.py index 6b4c8ce04d..8924c00f56 100644 --- a/fastdeploy/splitwise/splitwise_connector.py +++ b/fastdeploy/splitwise/splitwise_connector.py @@ -16,6 +16,7 @@ import json import time +import traceback from concurrent.futures import ThreadPoolExecutor from typing import Dict @@ -97,7 +98,7 @@ def start_receiver(self): time.sleep(0.001) except Exception as e: - logger.error(f"Receiver error: {e}") + logger.error(f"Receiver error: {e}, {str(traceback.format_exc())}") time.sleep(1) def _get_push_socket(self, addr): @@ -152,7 +153,7 @@ def _send_message(self, addr, msg_type: str, payload): except zmq.Again: logger.warning(f"Send queue full for {addr}") except Exception as e: - logger.error(f"Send to {addr} failed: {e}") + logger.error(f"Send to {addr} failed: {e}, {str(traceback.format_exc())}") self._close_connection(addr) except Exception as e: @@ -433,7 +434,7 @@ def _process_message(self, message: bytes): self.engine_worker_queue.put_cache_info(payload) except Exception as e: - logger.error(f"Message processing failed: {e}") + logger.error(f"Message processing failed: {e}, {str(traceback.format_exc())}") def _handle_prefill(self, tasks): """ diff --git a/fastdeploy/worker/utils.py b/fastdeploy/worker/utils.py index bf727c3bbf..7554c7c08a 100644 --- a/fastdeploy/worker/utils.py +++ b/fastdeploy/worker/utils.py @@ -15,6 +15,7 @@ """ import os +import traceback def check_safetensors_model(model_dir: str): @@ -45,5 +46,5 @@ def check_safetensors_model(model_dir: str): sum(flags) == safetensors_num ), f"Number of safetensor files should be {len(model_files)}, but now it's {sum(flags)}" except Exception as e: - raise Exception(f"Failed to check unified checkpoint, details: {e}.") + raise Exception(f"Failed to check unified checkpoint, details: {e}, {str(traceback.format_exc())}.") return is_safetensors diff --git a/test/ce/accuracy_cases/gsm8k.py b/test/ce/accuracy_cases/gsm8k.py index f156f58c7f..b02e4c9f1a 100644 --- a/test/ce/accuracy_cases/gsm8k.py +++ b/test/ce/accuracy_cases/gsm8k.py @@ -6,6 +6,7 @@ import os import re +import traceback from concurrent.futures import ThreadPoolExecutor, as_completed from urllib.parse import urlparse, urlunparse @@ -120,7 +121,7 @@ def query_model(prompt): ) return response.choices[0].message.content.strip() except Exception as e: - return f"[Error] {e}" + return f"[Error] {e}, {str(traceback.format_exc())}" # ========== 评估函数 ========== diff --git a/test/ce/deploy/deploy.py b/test/ce/deploy/deploy.py index aa305360b8..50e540a997 100644 --- a/test/ce/deploy/deploy.py +++ b/test/ce/deploy/deploy.py @@ -7,6 +7,7 @@ import subprocess import sys import time +import traceback import requests import yaml @@ -175,7 +176,7 @@ def stop_server(signum=None, frame=None): # 终止进程组(包括所有子进程) os.killpg(os.getpgid(pid_port["PID"]), signal.SIGTERM) except Exception as e: - print(f"Failed to stop server: {e}") + print(f"Failed to stop server: {e}, {str(traceback.format_exc())}") for port in [FD_API_PORT, FD_ENGINE_QUEUE_PORT, 
FD_METRICS_PORT]: try: @@ -184,7 +185,7 @@ def stop_server(signum=None, frame=None): os.kill(int(pid), signal.SIGKILL) print(f"Killed process on port {port}, pid={pid}") except Exception as e: - print(f"Failed to killed process on port: {e}") + print(f"Failed to killed process on port: {e}, {str(traceback.format_exc())}") # 若log目录存在,则重命名为log_timestamp if os.path.isdir("./log"): os.rename("./log", "./log_{}".format(time.strftime("%Y%m%d%H%M%S"))) @@ -229,8 +230,10 @@ def start_service(): # 构建命令 cmd = build_command(final_config) except Exception as e: + error_msg = f"Failed to start service: {e}, {str(traceback.format_exc())}" + print(error_msg) return Response( - json.dumps({"status": "error", "message": str(e)}, ensure_ascii=False), + json.dumps({"status": "error", "message": error_msg}, ensure_ascii=False), status=500, content_type="application/json", ) @@ -264,8 +267,10 @@ def start_service(): return Response(json.dumps(json_data, ensure_ascii=False), status=200, content_type="application/json") except Exception as e: + error_msg = f"Failed to start service: {e}, {str(traceback.format_exc())}" + print(error_msg) return Response( - json.dumps({"status": "error", "message": str(e)}, ensure_ascii=False), + json.dumps({"status": "error", "message": error_msg}, ensure_ascii=False), status=500, content_type="application/json", ) @@ -295,8 +300,10 @@ def switch_service(): # 构建命令 cmd = build_command(final_config) except Exception as e: + error_msg = f"Failed to switch service: {e}, {str(traceback.format_exc())}" + print(error_msg) return Response( - json.dumps({"status": "error", "message": str(e)}, ensure_ascii=False), + json.dumps({"status": "error", "message": error_msg}, ensure_ascii=False), status=500, content_type="application/json", ) @@ -330,8 +337,10 @@ def switch_service(): return Response(json.dumps(json_data, ensure_ascii=False), status=200, content_type="application/json") except Exception as e: + error_msg = f"Failed to switch service: {e}, {str(traceback.format_exc())}" + print(error_msg) return Response( - json.dumps({"status": "error", "message": str(e)}, ensure_ascii=False), + json.dumps({"status": "error", "message": error_msg}, ensure_ascii=False), status=500, content_type="application/json", ) @@ -406,8 +415,10 @@ def get_config(): ) except Exception as e: + error_msg = f"{e}, {str(traceback.format_exc())}" + print(error_msg) return Response( - json.dumps({"message": "api_server.log解析失败,请检查log", "error": str(e)}, ensure_ascii=False), + json.dumps({"message": "api_server.log解析失败,请检查log", "error": error_msg}, ensure_ascii=False), status=500, content_type="application/json", ) @@ -447,7 +458,7 @@ def tail_file(path, lines=50): with open(path, "r", encoding="utf-8", errors="ignore") as f: return "".join(f.readlines()[-lines:]) except Exception as e: - return f"[无法读取 {path}]: {e}\n" + return f"[无法读取 {path}]: {e}, {str(traceback.format_exc())}\n" result = f"服务启动超时,耗时:[{timeout}s]\n\n" result += "==== server.log tail 50 ====\n" From eea387726132a79065d988b7197b8970e836cd2b Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 19 Aug 2025 11:22:25 +0800 Subject: [PATCH 15/20] update error msg --- fastdeploy/engine/engine.py | 8 ++++++-- fastdeploy/entrypoints/openai/serving_chat.py | 2 +- test/ce/deploy/deploy.py | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 47e1bfd323..d4cc4f6fb6 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -984,7 +984,9 @@ def _exit_sub_services(self): 
try: os.killpg(p.pid, signal.SIGTERM) except Exception as e: - print(f"Error extracting file: {e}, {str(traceback.format_exc())}") + error_msg = f"Error killing cache manager process {p.pid}: {e}, {str(traceback.format_exc())}" + print(error_msg) + llm_logger.error(error_msg) self.worker_ready_signal.clear() self.exist_task_signal.clear() self.exist_swapped_task_signal.clear() @@ -997,7 +999,9 @@ def _exit_sub_services(self): try: os.killpg(self.worker_proc.pid, signal.SIGTERM) except Exception as e: - print(f"Error extracting sub services: {e}, {str(traceback.format_exc())}") + error_msg = f"Error extracting sub services: {e}, {str(traceback.format_exc())}" + print(error_msg) + llm_logger.error(error_msg) self.engine_worker_queue.cleanup() if hasattr(self, "zmq_server") and self.zmq_server is not None: diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index a0d28eedaa..28f4cb41bd 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -348,7 +348,7 @@ async def chat_completion_stream_generator( except Exception as e: error_data = self._create_streaming_error_response( - f"equest[{request_id}] generate stream error: {str(e)}, {str(traceback.format_exc())}" + f"request[{request_id}] generate stream error: {str(e)}, {str(traceback.format_exc())}" ) yield f"data: {error_data}\n\n" finally: diff --git a/test/ce/deploy/deploy.py b/test/ce/deploy/deploy.py index 50e540a997..3947d22288 100644 --- a/test/ce/deploy/deploy.py +++ b/test/ce/deploy/deploy.py @@ -185,7 +185,7 @@ def stop_server(signum=None, frame=None): os.kill(int(pid), signal.SIGKILL) print(f"Killed process on port {port}, pid={pid}") except Exception as e: - print(f"Failed to killed process on port: {e}, {str(traceback.format_exc())}") + print(f"Failed to kill process on port: {e}, {str(traceback.format_exc())}") # 若log目录存在,则重命名为log_timestamp if os.path.isdir("./log"): os.rename("./log", "./log_{}".format(time.strftime("%Y%m%d%H%M%S"))) From 4d8d46af76a158a6c50d81dd02b77d65ed26ddbf Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 19 Aug 2025 17:23:59 +0800 Subject: [PATCH 16/20] update structred output code --- fastdeploy/engine/engine.py | 2 ++ fastdeploy/engine/request.py | 12 +++++------- test/ci_use/EB_Lite/test_EB_Lite_serving.py | 15 +++++++++------ test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py | 15 +++++++++------ 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index a403017fae..1522fd7acd 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -492,8 +492,10 @@ def _has_guided_input(self, request): for x in ( request.guided_json, request.guided_regex, + request.guided_choice, request.structural_tag, request.guided_grammar, + request.guided_json_object, ) ) diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index a703ac010d..57abf71086 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -221,13 +221,11 @@ def set(self, key, value): setattr(self, key, value) def __repr__(self) -> str: - return ( - f"Request(request_id={self.request_id}, " - f"prompt={self.prompt!r}, " - f"prompt_token_ids={self.prompt_token_ids}, " - f"draft_token_ids={self.draft_token_ids}, " - f"sampling_params={self.sampling_params})" - ) + non_none_fields = [] + for attr, value in vars(self).items(): + if value is not None and not attr.startswith("_"): + non_none_fields.append(f"{attr}={value!r}") 
+ return f"Request({', '.join(non_none_fields)})" @dataclass(slots=True) diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py index 9c1689fcb3..bb00259ee1 100644 --- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -1162,12 +1162,15 @@ def test_structured_outputs_structural_tag(openai_client): }, } - expect_str = '{"timezone": "Asia/Shanghai"}' + expect_str1 = "get_current_date" + expect_str2 = "Asia/Shanghai" response = streaming_chat_base(openai_client, structural_tag_param) - assert response == expect_str, f"structural_tag streaming response: {response} is not as expected" + assert expect_str1 in response, f"structural_tag streaming response: {response} is not as expected" + assert expect_str2 in response, f"structural_tag streaming response: {response} is not as expected" response = non_streaming_chat_base(openai_client, structural_tag_param) - assert response == expect_str, f"structural_tag non_streaming response: {response} is not as expected" + assert expect_str1 in response, f"structural_tag non_streaming response: {response} is not as expected" + assert expect_str2 in response, f"structural_tag non_streaming response: {response} is not as expected" def test_structured_outputs_choice(openai_client): @@ -1219,11 +1222,11 @@ def test_structured_outputs_regex(openai_client): response = streaming_chat_base(openai_client, regex_param) assert re.fullmatch( - r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response + r"^https:\/\/www\.[a-zA-Z]+\.com\/?$\n", response ), f"regex streaming response: {response} is not as expected" response = non_streaming_chat_base(openai_client, regex_param) assert re.fullmatch( - r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response + r"^https:\/\/www\.[a-zA-Z]+\.com\/?$\n", response ), f"regex non_streaming response: {response} is not as expected" @@ -1267,7 +1270,7 @@ def test_structured_outputs_grammar(openai_client): import re - pattern = r'^[A-Za-z0-9 ]+$' + pattern = r'^[A-Za-z0-9 ]+$' response = streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected" response = non_streaming_chat_base(openai_client, grammar_param) diff --git a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index 42dd91ee3e..819a2fdeec 100644 --- a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -766,12 +766,15 @@ def test_structured_outputs_structural_tag(openai_client): }, } - expect_str = '{"timezone": "Asia/Shanghai"}' + expect_str1 = "get_current_date" + expect_str2 = "Asia/Shanghai" response = streaming_chat_base(openai_client, structural_tag_param) - assert response == expect_str, f"structural_tag streaming response: {response} is not as expected" + assert expect_str1 in response, f"structural_tag streaming response: {response} is not as expected" + assert expect_str2 in response, f"structural_tag streaming response: {response} is not as expected" response = non_streaming_chat_base(openai_client, structural_tag_param) - assert response == expect_str, f"structural_tag non_streaming response: {response} is not as expected" + assert expect_str1 in response, f"structural_tag non_streaming response: {response} is not as expected" + assert expect_str2 in response, f"structural_tag non_streaming response: {response} is not as expected" def test_structured_outputs_choice(openai_client): @@ 
-823,11 +826,11 @@ def test_structured_outputs_regex(openai_client): response = streaming_chat_base(openai_client, regex_param) assert re.fullmatch( - r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response + r"^https:\/\/www\.[a-zA-Z]+\.com\/?$\n", response ), f"regex streaming response: {response} is not as expected" response = non_streaming_chat_base(openai_client, regex_param) assert re.fullmatch( - r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response + r"^https:\/\/www\.[a-zA-Z]+\.com\/?$\n", response ), f"regex non_streaming response: {response} is not as expected" @@ -871,7 +874,7 @@ def test_structured_outputs_grammar(openai_client): import re - pattern = r'^[A-Za-z0-9 ]+$' + pattern = r'^[A-Za-z0-9 ]+$' response = streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected" response = non_streaming_chat_base(openai_client, grammar_param) From 4d0b1e45126f6e3e8c9c78bcba1f6871579e4919 Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 19 Aug 2025 17:25:13 +0800 Subject: [PATCH 17/20] update code --- fastdeploy/engine/engine.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index b6c52b80de..3494186fa4 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -985,9 +985,9 @@ def _exit_sub_services(self): try: os.killpg(p.pid, signal.SIGTERM) except Exception as e: - error_msg = f"Error killing cache manager process {p.pid}: {e}, {str(traceback.format_exc())}" - print(error_msg) - llm_logger.error(error_msg) + console_logger.error( + f"Error killing cache manager process {p.pid}: {e}, {str(traceback.format_exc())}" + ) self.worker_ready_signal.clear() self.exist_task_signal.clear() self.exist_swapped_task_signal.clear() @@ -1000,9 +1000,7 @@ def _exit_sub_services(self): try: os.killpg(self.worker_proc.pid, signal.SIGTERM) except Exception as e: - error_msg = f"Error extracting sub services: {e}, {str(traceback.format_exc())}" - print(error_msg) - llm_logger.error(error_msg) + console_logger.error(f"Error extracting sub services: {e}, {str(traceback.format_exc())}") self.engine_worker_queue.cleanup() if hasattr(self, "zmq_server") and self.zmq_server is not None: From 7ad87f6ee09be75a7aca857f3be70db41f6fbd0a Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 19 Aug 2025 20:33:38 +0800 Subject: [PATCH 18/20] update code --- fastdeploy/engine/request.py | 4 +-- fastdeploy/input/ernie_processor.py | 5 ++-- fastdeploy/input/ernie_vl_processor.py | 8 +++--- fastdeploy/input/text_processor.py | 36 +++++--------------------- fastdeploy/worker/gpu_model_runner.py | 8 ++++-- 5 files changed, 21 insertions(+), 40 deletions(-) diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index c274b9bed9..0131188e06 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -70,7 +70,7 @@ def __init__( guided_grammar: Optional[Any] = None, structural_tag: Optional[Any] = None, guided_json_object: Optional[bool] = None, - enable_thinking: Optional[bool] = None, + enable_thinking: Optional[bool] = True, trace_carrier: dict = dict(), chat_template: Optional[str] = None, ) -> None: @@ -153,7 +153,7 @@ def from_dict(cls, d: dict): guided_grammar=d.get("guided_grammar", None), structural_tag=d.get("structural_tag", None), guided_json_object=d.get("guided_json_object", None), - enable_thinking=d.get("enable_thinking", None), + enable_thinking=d.get("enable_thinking", True), 
trace_carrier=d.get("trace_carrier", {}), chat_template=d.get("chat_template", None), ) diff --git a/fastdeploy/input/ernie_processor.py b/fastdeploy/input/ernie_processor.py index 6d7d2896bf..07e0d6cbaf 100644 --- a/fastdeploy/input/ernie_processor.py +++ b/fastdeploy/input/ernie_processor.py @@ -257,6 +257,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ + enable_thinking = kwargs.get("enable_thinking") token_ids = response_dict["outputs"]["token_ids"] is_end = response_dict["finished"] req_id = response_dict["request_id"] @@ -266,7 +267,6 @@ def process_response_dict_normal(self, response_dict, **kwargs): delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id) if is_end: full_text = previous_texts + delta_text - enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) if self.reasoning_parser and ( enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser" ): @@ -296,6 +296,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ + enable_thinking = kwargs.get("enable_thinking") is_end = response_dict["finished"] req_id = response_dict["request_id"] token_ids = response_dict["outputs"]["token_ids"] @@ -304,9 +305,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): if token_ids[-1] == self.tokenizer.eos_token_id: token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) - response_dict["outputs"]["raw_prediction"] = delta_text - enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) if self.reasoning_parser and ( enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser" ): diff --git a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py index 9e824bdadc..5c64952a14 100644 --- a/fastdeploy/input/ernie_vl_processor.py +++ b/fastdeploy/input/ernie_vl_processor.py @@ -105,9 +105,6 @@ def set_value(req, key, value): set_value(request, "repetition_penalty", 1.0) set_value(request, "frequency_penalty", 0.0) set_value(request, "presence_penalty", 0.0) - - enable_thinking = self.get_enable_thinking(request.get("enable_thinking", None)) - set_value(request, "enable_thinking", enable_thinking) return request def process_request(self, request, max_model_len=None, **kwargs): @@ -198,6 +195,7 @@ def _check_mm_limits(self, item): def process_request_dict(self, request, max_model_len=None): """process the input data""" + request = self._apply_default_parameters(request) if not request.get("eos_token_ids"): request["eos_token_ids"] = self.eos_token_ids @@ -292,7 +290,9 @@ def process_response_dict(self, response_dict, stream, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = self.get_enable_thinking(kwargs.pop("enable_thinking", None)) + enable_thinking = kwargs.pop("enable_thinking", True) + if enable_thinking is None: + enable_thinking = True if stream: return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) else: diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index 1ebc4cf846..dc2d91cb87 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -50,25 +50,6 @@ def __init__(self): ) ) - def get_enable_thinking(self, enable_thinking=None): - """ - get enable_thinking param - - 1. 
if enable_thinking is None: - 1.1 if reasoning_parser is not None, set enable_thinking to True. - 1.2 if reasoning_parser is None, set enable_thinking to False. - 2. if reasoning_parser is None but enable_thinking is True, set enable_thinking to False and print warning. - - """ - if enable_thinking is None: - enable_thinking = False if self.reasoning_parser is None else True - if enable_thinking and self.reasoning_parser is None: - enable_thinking = False - data_processor_logger.warning( - "enable_thinking is True, but reasoning_parser is None. " "enable_thinking will be set to False." - ) - return enable_thinking - def _apply_default_parameters(self, request): """ Apply default value for parameters in request @@ -88,10 +69,6 @@ def set_value(req, key, value): set_value(request, "repetition_penalty", 1.0) set_value(request, "frequency_penalty", 0.0) set_value(request, "presence_penalty", 0.0) - - enable_thinking = self.get_enable_thinking(request.get("enable_thinking")) - set_value(request, "enable_thinking", enable_thinking) - return request @abstractmethod @@ -237,7 +214,6 @@ def process_request(self, request, max_model_len=None, **kwargs): request.set("stop_token_ids", stop_seqs) request.set("stop_seqs_len", stop_seqs_len) - request.set("enable_thinking", self.get_enable_thinking(kwargs.get("enable_thinking"))) if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0: if request.prompt is not None: request.prompt_token_ids = self.text2ids(request.prompt, max_model_len) @@ -253,6 +229,7 @@ def process_request(self, request, max_model_len=None, **kwargs): task[k] = v else: raise ValueError("Invalid input: chat_template_kwargs must be a dict") + task.setdefault("enable_thinking", True) request.prompt_token_ids = self.messages2ids(task) else: raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.") @@ -283,7 +260,6 @@ def process_request_dict(self, request, max_model_len=None, **kwargs): str: error message """ request = self._apply_default_parameters(request) - request["enable_thinking"] = self.get_enable_thinking(kwargs.get("enable_thinking")) if not request.get("eos_token_ids"): request["eos_token_ids"] = self.eos_token_ids @@ -311,6 +287,7 @@ def process_request_dict(self, request, max_model_len=None, **kwargs): request[k] = v else: raise ValueError("Invalid input: chat_template_kwargs must be a dict") + request.setdefault("enable_thinking", True) request["prompt_token_ids"] = self.messages2ids(request) else: raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}") @@ -374,6 +351,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ + enable_thinking = kwargs.get("enable_thinking") token_ids = response_dict["outputs"]["token_ids"] is_end = response_dict["finished"] req_id = response_dict["request_id"] @@ -383,7 +361,6 @@ def process_response_dict_normal(self, response_dict, **kwargs): delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id) if is_end: full_text = previous_texts + delta_text - enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) response_dict["outputs"]["raw_prediction"] = full_text if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) @@ -411,6 +388,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ + enable_thinking = 
kwargs.get("enable_thinking") is_end = response_dict["finished"] req_id = response_dict["request_id"] token_ids = response_dict["outputs"]["token_ids"] @@ -419,8 +397,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs): if token_ids[-1] == self.tokenizer.eos_token_id: token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) - - enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) response_dict["outputs"]["raw_prediction"] = delta_text if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming( @@ -466,7 +442,9 @@ def process_response_dict(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = self.get_enable_thinking(kwargs.pop("enable_thinking", None)) + enable_thinking = kwargs.pop("enable_thinking", True) + if enable_thinking is None: + enable_thinking = True stream = kwargs.get("stream", True) if stream: return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 9ecdc27b53..4ef329491e 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -202,10 +202,13 @@ def _init_logits_processor(self, request): elif request.structural_tag is not None: schemata_key = ("structural_tag", request.structural_tag) + enable_thinking = request.get("enable_thinking", True) + enable_thinking = enable_thinking if enable_thinking is not None else True + return ( self.guided_backend.get_logits_processor( schemata_key=schemata_key, - enable_thinking=request.get("enable_thinking"), + enable_thinking=enable_thinking, ), schemata_key, ) @@ -468,7 +471,8 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: self.share_inputs["prompt_lens"][idx : idx + 1] = length if self.enable_mm: - enable_thinking = request.get("enable_thinking") + enable_thinking = request.get("enable_thinking", True) + enable_thinking = enable_thinking if enable_thinking is not None else True self.share_inputs["enable_thinking"][:] = enable_thinking self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) From 5ad6432ed1fabca28e5bd25c54ce07f1270ed809 Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 28 Aug 2025 11:59:33 +0800 Subject: [PATCH 19/20] update config --- fastdeploy/config.py | 12 +- fastdeploy/engine/config.py | 438 ------------------------------------ 2 files changed, 6 insertions(+), 444 deletions(-) delete mode 100644 fastdeploy/engine/config.py diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 20e196b243..198868e6c6 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1186,7 +1186,8 @@ def postprocess(self): self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size) if self.guided_decoding_backend == "auto": - if self.model_config.enable_mm: + if current_platform.is_xpu() or self.speculative_config.method is not None: + logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.") self.guided_decoding_backend = "off" else: self.guided_decoding_backend = "xgrammar" @@ -1256,12 +1257,10 @@ def check(self): ], f"Only support xgrammar、auto guided decoding backend, but got 
{self.guided_decoding_backend}." if self.guided_decoding_backend != "off": - # TODO: mm support guided_decoding - assert ( - self.model_config.enable_mm is False - ), "Multimodal model currently do not support guided_decoding" - # TODO: speculative decoding support guided_decoding + assert ( + self.speculative_config.method is None + ), "speculative decoding currently do not support guided_decoding" # TODO: xpu support guided_decoding assert not current_platform.is_xpu(), "XPU currently do not support guided_decoding" @@ -1272,6 +1271,7 @@ def check(self): raise Exception( f"import XGrammar failed, please install XGrammar use `pip install xgrammar==0.1.19`. \n\t {e}" ) + if self.scheduler_config is not None: self.scheduler_config.check() diff --git a/fastdeploy/engine/config.py b/fastdeploy/engine/config.py deleted file mode 100644 index eb3595b69c..0000000000 --- a/fastdeploy/engine/config.py +++ /dev/null @@ -1,438 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import json -import os -from datetime import datetime -from typing import Any, Dict, List, Optional - -from fastdeploy.config import ( - CacheConfig, - CommitConfig, - LoadConfig, - ModelConfig, - ParallelConfig, -) -from fastdeploy.multimodal.registry import MultimodalRegistry -from fastdeploy.platforms import current_platform -from fastdeploy.scheduler import SchedulerConfig -from fastdeploy.utils import ceil_div, get_host_ip, is_port_available, llm_logger - - -class Config: - """ - Initial configuration class. - - Attributes: - model_config (ModelConfig): Model configuration object. - cache_config (CacheConfig): Cache configuration object. - model_name_or_path (str): Directory path to the model or the model name. - tokenizer (Optional[str]): Default is the model. - max_num_batched_tokens (Optional[int]): Maximum number of batched tokens. - tensor_parallel_size (int): Tensor parallel size. - nnode (int): Number of nodes. - max_model_len (int): Maximum model length. Default is 8192. - max_num_seqs (int): Maximum number of sequences. Default is 8. - mm_processor_kwargs (Optional[Dict[str, Any]]): Additional arguments for multi-modal processor. - speculative_config (Optional[Dict[str, Any]]): Speculative execution configuration. - use_warmup (bool): Flag to use warmup. - engine_worker_queue_port (int): Port for engine worker queue. - enable_mm (bool): Flag to enable multi-modal processing. - reasoning_parser(str): Flag specifies the reasoning parser to use for - extracting reasoning content from the model output - splitwise_role (str): Splitwise role. - innode_prefill_ports (Optional[List[int]]): Innode prefill ports. - Temporary configuration, will be removed in the future. - load_choices(str):The format of the model weights to load. 
.Default is default - """ - - def __init__( - self, - model_config: ModelConfig, - cache_config: CacheConfig, - scheduler_config: SchedulerConfig, - parallel_config: ParallelConfig, - load_config: LoadConfig, - commit_config: CommitConfig = CommitConfig(), - model_name_or_path: str = None, - tokenizer: str = None, - tensor_parallel_size: int = 8, - max_model_len: int = 8192, - max_num_seqs: int = 8, - max_num_batched_tokens: Optional[int] = None, - ips: str = None, - speculative_config: Optional[Dict[str, Any]] = None, - graph_optimization_config: Optional[Dict[str, Any]] = None, - use_warmup: bool = False, - engine_worker_queue_port: int = 8002, - limit_mm_per_prompt: Optional[Dict[str, Any]] = None, - mm_processor_kwargs: Optional[Dict[str, Any]] = None, - # enable_mm: bool = False, - splitwise_role: str = "mixed", - innode_prefill_ports: Optional[List[int]] = None, - max_num_partial_prefills: int = 1, - max_long_partial_prefills: int = 1, - long_prefill_token_threshold: int = 0, - reasoning_parser: str = None, - tool_parser: str = None, - guided_decoding_backend: Optional[str] = None, - disable_any_whitespace: bool = False, - enable_logprob: bool = False, - early_stop_config: Optional[Dict[str, Any]] = None, - load_choices: str = "default", - ): - """ - Initialize the Config class. - - Args: - model_config (ModelConfig): Model configuration object. - cache_config (CacheConfig): Cache configuration object. - parallel_config (ParallelConfig): Parallel configuration object. - scheduler_config (SchedulerConfig): Scheduler configuration object. - model_name_or_path (str): Model directory path or model name. - tokenizer (str): Default is the model. - tensor_parallel_size (int): Tensor parallel size. Default is 8. - max_model_len (int): Maximum model length. Default is 8192. - max_num_seqs (int): Maximum number of sequences. Default is 8. - max_num_batched_tokens (Optional[int]): Maximum number of batched tokens. Default is None. - mm_processor_kwargs (Optional[Dict[str, Any]]): Additional arguments for multi-modal processor. Default is None. - speculative_config (Optional[Dict[str, Any]]): Speculative execution configuration. Default is None. - graph_optimization_config (Optional[Dict[str, Any]]): Graph optimizaion backend execution configuration. Default is None. - use_warmup (bool): Flag to use warmup. Default is False. - engine_worker_queue_port (int): Engine worker queue port. Default is 8002. - enable_mm (bool): Flag to enable multi-modal processing. Default is False. - splitwise_role (str): Splitwise role. Default is "mixed". - innode_prefill_ports (Optional[List[int]]): Innode prefill ports. Default is None. - reasoning_parser (str): Flag specifies the reasoning parser to use for - extracting reasoning content from the model output. Default is None. - guided_decoding_backend(str): Guided decoding backend. Default is None. - disable_any_whitespace(bool): Disable any whitespace when using guided decoding. - Default is False. - enable_logprob(bool): Enable logprob. Default is False. - early_stop_config (Optional[Dict[str, Any]]): Early stop configuration. Default is None. - load_choices(str):The format of the model weights to load. 
.Default is default - """ - self.model_config = model_config - self.cache_config = cache_config - self.scheduler_config = scheduler_config - self.parallel_config = parallel_config - self.load_config = load_config - self.commit_config = commit_config - self.model_name_or_path = model_name_or_path - self.tokenizer = tokenizer - self.max_num_batched_tokens = max_num_batched_tokens - self.tensor_parallel_size = tensor_parallel_size - self.ips = ips - - if self.ips is None: - self.master_ip = "0.0.0.0" - elif isinstance(self.ips, list): - self.master_ip = self.ips[0] - else: - self.ips = self.ips.split(",") - self.master_ip = self.ips[0] - - if self.ips is None: - self.nnode = 1 - self.node_rank = 0 - else: - self.nnode = len(self.ips) - - for idx, ip in enumerate(self.ips): - if ip == self.master_ip: - self.node_rank = idx - - self.max_model_len = max_model_len - self.max_num_seqs = max_num_seqs - self.limit_mm_per_prompt = limit_mm_per_prompt - self.mm_processor_kwargs = mm_processor_kwargs - # self.enable_mm = enable_mm - self.speculative_config = speculative_config - self.use_warmup = use_warmup - self.splitwise_role = splitwise_role - self.innode_prefill_ports = innode_prefill_ports - self.max_num_partial_prefills = max_num_partial_prefills - self.max_long_partial_prefills = max_long_partial_prefills - self.long_prefill_token_threshold = long_prefill_token_threshold - self.reasoning_parser = reasoning_parser - self.tool_parser = tool_parser - self.graph_optimization_config = graph_optimization_config - self.early_stop_config = early_stop_config - self.guided_decoding_backend = guided_decoding_backend - self.disable_any_whitespace = disable_any_whitespace - self._str_to_list("innode_prefill_ports", int) - self.load_choices = load_choices - - assert self.splitwise_role in ["mixed", "prefill", "decode"] - - import fastdeploy.model_executor.models # noqa: F401 - - architectures = self.model_config.architectures[0] - if MultimodalRegistry.contains_model(architectures): - self.enable_mm = True - else: - self.enable_mm = False - - # TODO - self.max_prefill_batch = 3 - if current_platform.is_xpu(): - self.max_prefill_batch = 1 - if self.enable_mm: - self.max_prefill_batch = 1 # TODO:当前多模prefill阶段只支持并行度为1,待优化 - - # TODO(@wufeisheng): TP and EP need to be supported simultaneously. 
- assert (self.tensor_parallel_size == 1 and self.parallel_config.expert_parallel_size >= 1) or ( - self.tensor_parallel_size >= 1 and self.parallel_config.expert_parallel_size == 1 - ), "TP and EP cannot be enabled at the same time" - - num_ranks = self.tensor_parallel_size * self.parallel_config.expert_parallel_size - self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8 - if num_ranks > self.max_chips_per_node: - self.worker_num_per_node = self.max_chips_per_node - nnode = ceil_div(num_ranks, self.worker_num_per_node) - assert nnode == self.nnode, f"nnode: {nnode}, but got {self.nnode}" - else: - self.worker_num_per_node = num_ranks - - self.engine_worker_queue_port = engine_worker_queue_port - self.device_ids = ",".join([str(i) for i in range(self.worker_num_per_node)]) - self.device_ids = os.getenv("CUDA_VISIBLE_DEVICES", self.device_ids) - if current_platform.is_xpu(): - self.device_ids = os.getenv("XPU_VISIBLE_DEVICES", self.device_ids) - - self.enable_logprob = enable_logprob - - self.read_from_config() - self.postprocess() - self.check() - self.print() - - def postprocess(self): - """ - calculate some parameters - """ - assert ( - self.device_ids.split(",").__len__() == self.worker_num_per_node - ), f"invalid CUDA_VISIBLE_DEVICES, should be equal to {self.worker_num_per_node}" - - self.local_device_ids = self.device_ids.split(",")[: self.tensor_parallel_size] - - self.host_ip = get_host_ip() - - if self.ips is None or self.host_ip == self.master_ip: - self.is_master = True - else: - self.is_master = False - - if self.tensor_parallel_size <= self.worker_num_per_node: - self.is_master = True - - import paddle - - self.paddle_commit_id = paddle.version.commit - - if self.max_num_batched_tokens is None: - if self.cache_config.enable_chunked_prefill: - self.max_num_batched_tokens = 2048 - else: - if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")): - self.max_num_batched_tokens = self.max_model_len - else: - self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM - - if self.long_prefill_token_threshold == 0: - self.long_prefill_token_threshold = int(self.max_model_len * 0.04) - - self.cache_config.postprocess(self.max_num_batched_tokens, self.max_num_seqs) - self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size) - - if self.guided_decoding_backend == "auto": - if current_platform.is_xpu() or self.speculative_config.method is not None: - llm_logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.") - self.guided_decoding_backend = "off" - else: - self.guided_decoding_backend = "xgrammar" - - def check(self): - """ - check the legality of config - """ - assert self.max_num_seqs <= 256, ( - "The parameter `max_num_seqs` is not allowed to exceed 256, " f"but now it's {self.max_num_seqs}." - ) - assert is_port_available( - "0.0.0.0", self.engine_worker_queue_port - ), f"The parameter `engine_worker_queue_port`:{self.engine_worker_queue_port} is already in use." 
- assert self.nnode >= 1, f"nnode: {self.nnode} should no less than 1" - assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be larger than 16" - assert self.max_num_seqs >= 1, f"max_num_seqs: {self.max_num_seqs} should be larger than 1" - assert self.max_num_batched_tokens >= self.max_num_seqs, ( - f"max_num_batched_tokens: {self.max_num_batched_tokens} " - f"should be larger than or equal to max_num_seqs: {self.max_num_seqs}" - ) - assert self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs, ( - f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger" - f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}" - ) - assert ( - self.max_num_partial_prefills >= 1 - ), f"max_num_partial_prefills: {self.max_num_partial_prefills} should be larger than or equal to 1" - - assert ( - self.max_long_partial_prefills >= 1 - ), f"max_long_partial_prefills: {self.max_long_partial_prefills} should be larger than or equal to 1" - assert self.max_long_partial_prefills <= self.max_num_partial_prefills, ( - f"max_long_partial_prefills: {self.max_long_partial_prefills} should " - f"be less than or equal to max_num_partial_prefills: {self.max_num_partial_prefills}" - ) - - if not self.cache_config.enable_chunked_prefill: - if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")): - assert self.max_num_batched_tokens >= self.max_model_len, ( - f"max_num_batched_tokens: {self.max_num_batched_tokens} " - f"should be larger than or equal to max_model_len: {self.max_model_len}" - ) - else: - assert self.max_num_batched_tokens >= self.cache_config.block_size, ( - f"max_num_batched_tokens: {self.max_num_batched_tokens} " - f"should be larger than or equal to block_size: {self.cache_config.block_size}" - ) - - if self.max_num_partial_prefills > 1: - assert ( - self.cache_config.enable_chunked_prefill is True - ), "Chunked prefill must be enabled to set max_num_partial_prefills > 1" - assert self.long_prefill_token_threshold < self.max_model_len, ( - f"long_prefill_token_threshold: {self.long_prefill_token_threshold} should be less than" - f" max_model_len: {self.max_model_len}" - ) - - if self.guided_decoding_backend is not None: - assert self.guided_decoding_backend in [ - "xgrammar", - "XGrammar", - "auto", - "off", - ], f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}." - - if self.guided_decoding_backend != "off": - - # TODO: speculative decoding support guided_decoding - assert ( - self.speculative_config.method is None - ), "speculative decoding currently do not support guided_decoding" - - # TODO: xpu support guided_decoding - assert not current_platform.is_xpu(), "XPU currently do not support guided_decoding" - - try: - import xgrammar # noqa - except Exception as e: - raise Exception( - f"import XGrammar failed, please install XGrammar use `pip install xgrammar==0.1.19`. 
\n\t {e}" - ) - - self.scheduler_config.check() - - def print(self, file=None): - """ - print all config - - Args: - file (str): the path of file to save config - """ - llm_logger.info("=================== Configuration Information ===============") - for k, v in self.__dict__.items(): - if k == "generation_config" and v is not None: - for gck, gcv in v.to_dict().items(): - llm_logger.info("{:<20}:{:<6}{}".format(gck, "", gcv)) - elif ( - k == "cache_config" - or k == "model_config" - or k == "scheduler_config" - or k == "parallel_config" - or k == "commit_config" - or k == "speculative_config" - ): - v.print() - else: - llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) - llm_logger.info("=============================================================") - if file is not None: - f = open(file, "a") - now_time = datetime.now() - f.write(f"{now_time} configuration information as below,\n") - for k, v in self.__dict__.items(): - f.write("{:<20}:{:<6}{}\n".format(k, "", v)) - f.close() - - def init_cache_info(self): - """ - initialize cache info - """ - disaggregate_info = {} - if self.splitwise_role != "mixed": - disaggregate_info["role"] = self.splitwise_role - disaggregate_info["cache_info"] = dict() - current_protocol = self.cache_config.cache_transfer_protocol.split(",") - disaggregate_info["transfer_protocol"] = current_protocol - for protocol in current_protocol: - if protocol == "ipc": - disaggregate_info["cache_info"][protocol] = { - "ip": self.host_ip, - "port": self.engine_worker_queue_port, - "device_ids": self.local_device_ids, - } - elif protocol == "rdma": - disaggregate_info["cache_info"][protocol] = { - "ip": self.host_ip, - "port": self.cache_config.pd_comm_port[0], - "rdma_port": self.cache_config.rdma_comm_ports, - } - self.disaggregate_info = disaggregate_info - llm_logger.info(f"disaggregate_info: {self.disaggregate_info}") - - def read_from_config(self): - """ - reset model config from json file - """ - - def reset_value(cls, value_name, key): - if hasattr(cls, key): - value = getattr(cls, key) - setattr(cls, value_name, value) - llm_logger.info(f"Reset parameter {value_name} = {value} from configuration.") - - reset_value(self.cache_config, "block_size", "infer_model_block_size") - reset_value( - self.model_config, - "return_full_hidden_states", - "return_full_hidden_states", - ) - reset_value(self.cache_config, "cache_dtype", "infer_model_dtype") - - def _check_master(self): - return self.is_master - - def _str_to_list(self, attr_name, default_type): - if hasattr(self, attr_name): - val = getattr(self, attr_name) - if type(val) is str: - setattr(self, attr_name, [default_type(i) for i in val.split(",")]) - else: - setattr(self, attr_name, val) - - def __str__(self) -> str: - return json.dumps(self.__dict__, indent=4) From da8d37a29f076cbd432000609872b60397aa9d61 Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 28 Aug 2025 12:03:27 +0800 Subject: [PATCH 20/20] update torch version --- scripts/run_pre_ce.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_pre_ce.sh b/scripts/run_pre_ce.sh index 8ec9af730a..ab36dac961 100644 --- a/scripts/run_pre_ce.sh +++ b/scripts/run_pre_ce.sh @@ -7,7 +7,7 @@ python -m pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/p python -m pip install -r requirements.txt python -m pip install jsonschema aistudio_sdk==0.3.5 -python -m pip install xgrammar==0.1.19 +python -m pip install xgrammar==0.1.19 torch==2.6.0 failed_files=() run_path="$DIR/../tests/ci_use/"
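
---

The bulk of the hunks above apply a single convention: every `except` block logs `str(e)` together with `traceback.format_exc()` in one log line, instead of the bare exception message. Below is a minimal standalone sketch of that pattern for reference; the logger, file path, and helper names here are hypothetical illustrations, not FastDeploy code.

```python
# Illustrative sketch of the logging convention used in the patches above:
# record both the exception message and the full traceback in a single entry.
import logging
import traceback

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("example")


def load_config(path: str) -> dict:
    """Hypothetical helper, used only to trigger an exception for the demo."""
    with open(path) as f:
        return {"raw": f.read()}


def safe_load(path: str):
    try:
        return load_config(path)
    except Exception as e:
        # Same shape as the patched handlers: message plus traceback in one line.
        error_msg = f"load_config failed: {e}, {str(traceback.format_exc())}"
        logger.error(error_msg)
        return None


if __name__ == "__main__":
    safe_load("/nonexistent/config.yaml")
```

Keeping the message and the stack trace in the same entry means a single grep on the request id or error text surfaces the full context, at the cost of longer log lines; that trade-off is what the serving, scheduler, and platform hunks above standardize on.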