From a829d0ef882b6f0f540bfba5e21bee5e199ed761 Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 17 Jul 2025 16:38:54 +0800 Subject: [PATCH 01/20] mm support structured output --- docs/features/structured_outputs.md | 62 ++++++++++++++ docs/zh/features/structured_outputs.md | 64 +++++++++++++++ fastdeploy/config.py | 2 + fastdeploy/engine/config.py | 20 ++--- fastdeploy/engine/engine.py | 20 +++++ fastdeploy/engine/sampling_params.py | 45 +++++++++++ fastdeploy/entrypoints/llm.py | 3 + fastdeploy/input/ernie_processor.py | 4 +- fastdeploy/input/ernie_vl_processor.py | 6 +- fastdeploy/input/text_processor.py | 24 +++++- .../guided_decoding/__init__.py | 4 +- .../guided_decoding/base_guided_decoding.py | 63 +++++++++++---- .../guided_decoding/xgrammar_backend.py | 40 +++++---- .../model_executor/layers/sample/sampler.py | 81 ++++++++++++++----- .../reasoning/ernie_vl_reasoning_parsers.py | 14 ++++ .../reasoning/qwen3_reasoning_parsers.py | 14 ++++ fastdeploy/worker/gpu_model_runner.py | 34 ++++---- fastdeploy/worker/worker_process.py | 5 ++ 18 files changed, 412 insertions(+), 93 deletions(-) diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 40e177c1ce..f7ee424cb6 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -330,3 +330,65 @@ ParsedChatCompletionMessage[Info](content='{"addr": "No.1 Century Avenue, Pudong Address: No.1 Century Avenue, Pudong New Area, Shanghai Height: 468 ``` + +### Offline Inference + +Offline inference allows restricting the model's output format by pre-specified constraints. In `FastDeploy`, constraints can be specified through the `GuidedDecodingParams` class in `SamplingParams`. `GuidedDecodingParams` supports the following constraint types, with usage similar to online inference: + +```python +json: Optional[Union[str, dict]] = None +regex: Optional[str] = None +choice: Optional[List[str]] = None +grammar: Optional[str] = None +json_object: Optional[bool] = None +structural_tag: Optional[str] = None +``` + +The following example demonstrates how to use offline inference to generate a structured json: + +```python +from fastdeploy import LLM, SamplingParams +from fastdeploy.engine.sampling_params import GuidedDecodingParams +from pydantic import BaseModel +from enum import Enum + +class BookType(str, Enum): + romance = "Romance" + historical = "Historical" + adventure = "Adventure" + mystery = "Mystery" + dystopian = "Dystopian" + +class BookDescription(BaseModel): + author: str + title: str + genre: BookType + +# Constrained decoding parameters +guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema()) + +# Sampling parameters +sampling_params = SamplingParams( + top_p=0.95, + max_tokens=6400, + guided_decoding=guided_decoding_params, +) + +# Load model +llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto") + +outputs = llm.generate( + prompts="Generate a JSON describing a literary work, including author, title and book type.", + sampling_params=sampling_params, +) + +# Output results +for output in outputs: + print(output.outputs.text) +``` + +Output: + +``` +{"author": "George Orwell", "title": "1984", "genre": "Dystopian"} +``` diff --git a/docs/zh/features/structured_outputs.md b/docs/zh/features/structured_outputs.md index ce33f1232d..cafda804c6 100644 --- a/docs/zh/features/structured_outputs.md +++ b/docs/zh/features/structured_outputs.md @@ -330,3 +330,67 @@ 
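A second, purely illustrative sketch (not part of the original docs) shows the same offline flow with the `choice` constraint from the list above; the model path `ERNIE-4.5-0.3B` is reused from the earlier example and the printed result is only an expectation, not a verified output:

```python
from fastdeploy import LLM, SamplingParams
from fastdeploy.engine.sampling_params import GuidedDecodingParams

# Restrict the answer to one of a fixed set of strings.
guided_decoding_params = GuidedDecodingParams(
    choice=["Romance", "Historical", "Adventure", "Mystery", "Dystopian"]
)

sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=64,
    guided_decoding=guided_decoding_params,
)

llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")

outputs = llm.generate(
    prompts="Which genre best describes the novel 1984? Answer with one word.",
    sampling_params=sampling_params,
)

for output in outputs:
    print(output.outputs.text)  # expected to be one of the five choices, e.g. "Dystopian"
```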
ParsedChatCompletionMessage[Info](content='{"addr": "上海市浦东新区世纪 地址: 上海市浦东新区世纪大道1号 高度: 468 ``` + +### 离线推理 + +离线推理允许通过预先指定约束条件,限制模型输出格式。在 `FastDeploy` 中,支持通过 `SamplingParams` 中的 `GuidedDecodingParams` 类指定相关约束条件。`GuidedDecodingParams` 支持以下几种约束条件,使用方式可以参考在线推理: + +```python +json: Optional[Union[str, dict]] = None +regex: Optional[str] = None +choice: Optional[List[str]] = None +grammar: Optional[str] = None +json_object: Optional[bool] = None +structural_tag: Optional[str] = None +``` + +以下示例展示了如何使用离线推理生成一个结构化的 json : + +```python + +from fastdeploy import LLM, SamplingParams +from fastdeploy.engine.sampling_params import GuidedDecodingParams +from pydantic import BaseModel +from enum import Enum + +class BookType(str, Enum): + romance = "Romance" + historical = "Historical" + adventure = "Adventure" + mystery = "Mystery" + dystopian = "Dystopian" + +class BookDescription(BaseModel): + author: str + title: str + genre: BookType + +# Constrained decoding parameters +guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema()) + +# Sampling parameters +sampling_params = SamplingParams( + top_p=0.95, + max_tokens=6400, + guided_decoding=guided_decoding_params, +) + +# Load model +llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto") + +outputs = llm.generate( + prompts="生成一个JSON,描述一本中国的著作,要包含作者、标题和书籍类型。", + sampling_params=sampling_params, +) + +# Output results +for output in outputs: + print(output.outputs.text) + +``` + +输出 + +``` +{"author": "曹雪芹", "title": "红楼梦", "genre": "Historical"} +``` diff --git a/fastdeploy/config.py b/fastdeploy/config.py index c8428d1f97..a68ae6d58e 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -116,6 +116,8 @@ def __init__( self.enable_redundant_experts = False self.redundant_experts_num = 0 self.quantization = None + self.reasoning_parser = None + for key, value in args.items(): if hasattr(self, key): setattr(self, key, value) diff --git a/fastdeploy/engine/config.py b/fastdeploy/engine/config.py index e9c693480b..0ae8269126 100644 --- a/fastdeploy/engine/config.py +++ b/fastdeploy/engine/config.py @@ -329,7 +329,8 @@ def postprocess(self): self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size) if self.guided_decoding_backend == "auto": - if self.enable_mm: + if current_platform.is_xpu() or self.speculative_config.method is not None: + llm_logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.") self.guided_decoding_backend = "off" else: self.guided_decoding_backend = "xgrammar" @@ -396,10 +397,10 @@ def check(self): ], f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}." 
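# Illustrative sketch only (not part of the diff): the "auto" resolution performed by
# the postprocess() hunk above, with `is_xpu` and `speculative_method` standing in for
# current_platform.is_xpu() and self.speculative_config.method.
def resolve_guided_decoding_backend(backend: str, is_xpu: bool, speculative_method) -> str:
    if backend != "auto":
        return backend
    # Guided decoding is switched off on XPU and when speculative decoding is active.
    if is_xpu or speculative_method is not None:
        return "off"
    return "xgrammar"


# e.g. resolve_guided_decoding_backend("auto", is_xpu=False, speculative_method=None) -> "xgrammar"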
if self.guided_decoding_backend != "off": - # TODO: mm support guided_decoding - assert self.enable_mm is False, "Multimodal model currently do not support guided_decoding" # TODO: speculative decoding support guided_decoding + assert self.speculative_config.method is None, \ + "speculative decoding currently do not support guided_decoding" # TODO: xpu support guided_decoding assert not current_platform.is_xpu(), "XPU currently do not support guided_decoding" @@ -425,13 +426,12 @@ def print(self, file=None): if k == "generation_config" and v is not None: for gck, gcv in v.to_dict().items(): llm_logger.info("{:<20}:{:<6}{}".format(gck, "", gcv)) - elif ( - k == "cache_config" - or k == "model_config" - or k == "scheduler_config" - or k == "parallel_config" - or k == "commit_config" - ): + elif (k == "cache_config" or + k == "model_config" or + k == "scheduler_config" or + k == "parallel_config" or + k == "commit_config" or + k == "speculative_config"): v.print() else: llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index e9ecbefa2f..38b54b62bc 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -434,6 +434,13 @@ def _insert_zmq_task_to_scheduler(self): llm_logger.debug(f"Receive request: {request}") err_msg = None + if ((request.guided_json is not None + or request.guided_regex is not None + or request.structural_tag is not None + or request.guided_grammar is not None) and self.guided_decoding_checker is None): + err_msg = "guided_backend is None, use --guided-decoding-backend to " \ + "specify the backend at server startup." + if self.guided_decoding_checker is not None: request, err_msg = self.guided_decoding_checker.schema_format(request) @@ -526,6 +533,14 @@ def add_requests(self, task, sampling_params=None, **kwargs): llm_logger.error(error_msg) raise EngineError(error_msg, error_code=400) + if ((request.guided_json is not None + or request.guided_regex is not None + or request.structural_tag is not None + or request.guided_grammar is not None) and self.guided_decoding_checker is None): + err_msg = "guided_backend is None, use --guided-decoding-backend to specify the backend at server startup." 
+                llm_logger.error(err_msg)
+                raise EngineError(err_msg, error_code=400)
+
            if self.guided_decoding_checker is not None:
                request, err_msg = self.guided_decoding_checker.schema_format(request)
                if err_msg is not None:
@@ -1084,8 +1099,9 @@ def _start_worker_service(self):
            f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
            f" --graph_optimization_config '{self.cfg.graph_optimization_config.to_json_string()}'"
            f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
            f" --load_strategy {self.cfg.load_config.load_strategy}"
+            f" --reasoning_parser {self.cfg.reasoning_parser}"
        )

        worker_append_flag = {
            "enable_expert_parallel": self.cfg.parallel_config.enable_expert_parallel,
diff --git a/fastdeploy/engine/sampling_params.py b/fastdeploy/engine/sampling_params.py
index 91babf7a86..e31e3ef467 100644
--- a/fastdeploy/engine/sampling_params.py
+++ b/fastdeploy/engine/sampling_params.py
@@ -97,6 +97,7 @@ class SamplingParams:
     min_tokens: int = 1
     logprobs: Optional[int] = None
     bad_words: Optional[List[str]] = None
+    guided_decoding: Optional[GuidedDecodingParams] = None

     @classmethod
     def from_dict(cls, req_dict: dict[str, Any]) -> SamplingParams:
@@ -128,6 +129,7 @@ def from_optional(
         min_tokens=1,
         logprobs=None,
         bad_words=None,
+        guided_decoding=None,
     ) -> SamplingParams:
         """Create instance from command line arguments"""
         return cls(
@@ -148,6 +150,7 @@ def from_optional(
             min_tokens=min_tokens,
             logprobs=logprobs,
             bad_words=bad_words,
+            guided_decoding=guided_decoding,
         )

     def __post_init__(self):
@@ -218,3 +221,45 @@ class BeamSearchParams:
     temperature: float = 0.0
     length_penalty: float = 1.0
     include_stop_str_in_output: bool = False
+
+
+@dataclass
+class GuidedDecodingParams:
+    """Guided decoding parameters for text generation."""
+    json: Optional[Union[str, dict]] = None
+    regex: Optional[str] = None
+    choice: Optional[List[str]] = None
+    grammar: Optional[str] = None
+    json_object: Optional[bool] = None
+    structural_tag: Optional[str] = None
+
+    def to_dict(self):
+        """convert to dict"""
+        key_dict = {
+            "guided_json": self.json,
+            "guided_regex": self.regex,
+            "guided_choice": self.choice,
+            "guided_grammar": self.grammar,
+            "structural_tag": self.structural_tag,
+            "guided_json_object": self.json_object,
+        }
+
+        guided_dict = {}
+        for key, value in key_dict.items():
+            if value is not None:
+                guided_dict[key] = value
+        return guided_dict
+
+    def __post_init__(self):
+        """Verify the arguments."""
+        guided_count = sum([
+            self.json is not None, self.regex is not None, self.choice
+            is not None, self.grammar is not None, self.json_object
+            is not None, self.structural_tag is not None
+        ])
+
+        if guided_count > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding "
+                "('json', 'json_object', 'regex', 'choice', 'grammar', 'structural_tag')."
+ ) diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index 1204a67f9d..0ac5db20f8 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -275,6 +275,9 @@ def _add_request( enable_thinking = None if chat_template_kwargs is not None: enable_thinking = chat_template_kwargs.get("enable_thinking", None) + if current_sampling_params.guided_decoding is not None: + guided_decoding_dict = current_sampling_params.guided_decoding.to_dict() + tasks.update(guided_decoding_dict) self.llm_engine.add_requests(tasks, current_sampling_params, enable_thinking=enable_thinking) return req_ids diff --git a/fastdeploy/input/ernie_processor.py b/fastdeploy/input/ernie_processor.py index 63feda9348..f66640a7b3 100644 --- a/fastdeploy/input/ernie_processor.py +++ b/fastdeploy/input/ernie_processor.py @@ -60,7 +60,6 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None): self.eos_token_ids = [self.tokenizer.eos_token_id] self.eos_token_id_len = len(self.eos_token_ids) self.pad_token_id = self.get_pad_id() - self.reasoning_parser = None if reasoning_parser_obj: self.reasoning_parser = reasoning_parser_obj(self.tokenizer) @@ -264,7 +263,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") is_end = response_dict["finished"] req_id = response_dict["request_id"] token_ids = response_dict["outputs"]["token_ids"] @@ -273,6 +271,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs): if token_ids[-1] == self.tokenizer.eos_token_id: token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) + + enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming( previous_texts, diff --git a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py index 1b8669e293..afb399910d 100644 --- a/fastdeploy/input/ernie_vl_processor.py +++ b/fastdeploy/input/ernie_vl_processor.py @@ -78,7 +78,7 @@ def _load_tokenizer(self): def process_request(self, request, max_model_len=None, **kwargs): """process the input data""" task = request.to_dict() - task["enable_thinking"] = kwargs.get("enable_thinking", True) + task['enable_thinking'] = self.get_enable_thinking(kwargs.get("enable_thinking")) self.process_request_dict(task, max_model_len) request = Request.from_dict(task) @@ -244,9 +244,7 @@ def process_response_dict(self, response_dict, stream, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.pop("enable_thinking", True) - if enable_thinking is None: - enable_thinking = True + enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) if stream: return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) else: diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index 664868a595..d775fc7db7 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -50,6 +50,26 @@ def __init__(self): ) ) + def get_enable_thinking(self, enable_thinking=None): + """ + get enable_thinking param + + 1. if enable_thinking is None: + 1.1 if reasoning_parser is not None, set enable_thinking to True. + 1.2 if reasoning_parser is None, set enable_thinking to False. + 2. 
if reasoning_parser is None but enable_thinking is True, set enable_thinking to False and print warning. + + """ + if enable_thinking is None: + enable_thinking = False if self.reasoning_parser is None else True + if enable_thinking and self.reasoning_parser is None: + enable_thinking = False + data_processor_logger.warning( + "enable_thinking is True, but reasoning_parser is None. " + "enable_thinking will be set to False." + ) + return enable_thinking + def _apply_default_parameters(self, request): """ Apply default value for parameters in request @@ -229,6 +249,7 @@ def process_request(self, request, max_model_len=None, **kwargs): request.set("stop_token_ids", stop_seqs) request.set("stop_seqs_len", stop_seqs_len) + request.set("enable_thinking", self.get_enable_thinking(kwargs.get("enable_thinking"))) if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0: if request.prompt is not None: request.prompt_token_ids = self.text2ids(request.prompt, max_model_len) @@ -236,7 +257,6 @@ def process_request(self, request, max_model_len=None, **kwargs): if self.tokenizer.chat_template is None: raise ValueError("This model does not support chat_template.") task = request.to_dict() - task["enable_thinking"] = kwargs.get("enable_thinking", True) request.prompt_token_ids = self.messages2ids(task) else: raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.") @@ -372,7 +392,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") is_end = response_dict["finished"] req_id = response_dict["request_id"] token_ids = response_dict["outputs"]["token_ids"] @@ -382,6 +401,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) + enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming( previous_texts, diff --git a/fastdeploy/model_executor/guided_decoding/__init__.py b/fastdeploy/model_executor/guided_decoding/__init__.py index d6ee611992..01e887502e 100644 --- a/fastdeploy/model_executor/guided_decoding/__init__.py +++ b/fastdeploy/model_executor/guided_decoding/__init__.py @@ -15,8 +15,10 @@ """ # from fastdeploy.config import FDConfig +from fastdeploy.model_executor.guided_decoding.base_guided_decoding import ( + BackendBase, BaseChecker, LogitsProcessorBase) -__all__ = ["get_guided_backend", "schema_checker"] +__all__ = ['get_guided_backend', 'schema_checker', 'LogitsProcessorBase', 'BackendBase', 'BaseChecker'] def get_guided_backend( diff --git a/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py b/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py index 7baf2fe971..adcfbba6c6 100644 --- a/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py +++ b/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py @@ -19,6 +19,7 @@ from fastdeploy.config import ErnieArchitectures, FDConfig from fastdeploy.engine.request import Request +from fastdeploy.reasoning import ReasoningParserManager from fastdeploy.utils import llm_logger @@ -34,8 +35,9 @@ class LogitsProcessorBase: None (all state should be managed by subclasses) """ - def __init__(self): - pass + def __init__(self, enable_reasoning): + self.reasoning_ended = False + 
self.enable_reasoning = enable_reasoning def fill_token_bitmask(self, token_bitmask, idx): """ @@ -136,8 +138,13 @@ def __init__(self, fd_config: FDConfig): self.fd_config = fd_config self.executor = ThreadPoolExecutor() self.max_cache_size = 2048 + self.reasoning_parser = None self.hf_tokenizer = self._get_tokenizer_hf() + if self.fd_config.model_config.reasoning_parser: + reasoning_parser_obj = ReasoningParserManager.get_reasoning_parser( + self.fd_config.model_config.reasoning_parser) + self.reasoning_parser = reasoning_parser_obj(self.hf_tokenizer) def _create_processor(self): """ @@ -148,70 +155,88 @@ def _create_processor(self): """ raise NotImplementedError - def _json_processor(self, schemata): + def _json_processor(self, schemata, enable_thinking=False): """ Process JSON schemata. Args: schemata (str): The schemata string. + enable_thinking (bool): Whether to enable thinking mode. Raises: NotImplementedError: This method should be implemented in subclasses. """ raise NotImplementedError - def _regex_processor(self, schemata): + def _regex_processor(self, schemata, enable_thinking=False): """ Process regular expression schemata. Args: schemata (str): The schemata string. + enable_thinking (bool): Whether to enable thinking mode. Raises: NotImplementedError: This method should be implemented in subclasses. """ raise NotImplementedError - def _grammar_processor(self, schemata): + def _grammar_processor(self, schemata, enable_thinking=False): """ Process grammar schemata. Args: schemata (str): The schemata string. + enable_thinking (bool): Whether to enable thinking mode. Raises: NotImplementedError: This method should be implemented in subclasses. """ raise NotImplementedError - def _structural_tag_processor(self, schemata): + def _structural_tag_processor(self, schemata, enable_thinking=False): """ Process structural tag schemata. Args: schemata (str): The schemata string. + enable_thinking (bool): Whether to enable thinking mode. Raises: NotImplementedError: This method should be implemented in subclasses. """ raise NotImplementedError - def _unsupported_processor_type(self, key_type, schemata): + def _unsupported_processor_type(self, key_type, schemata, enable_thinking=False): """ Process unsupported type. Args: key_type (str): The key type string. schemata (str): The schemata string. + enable_thinking (bool): Whether to enable thinking mode. """ raise Exception(f"Unsupported processor type {key_type}.") - def _init_logits_processor(self, schemata_key: tuple[str, str]) -> LogitsProcessorBase: + def get_reasoning_parser(self): + """ + Get reasoning parser object. + Returns: + ReasoningParser: Reasoning parser object or None + """ + return self.reasoning_parser + + def _init_logits_processor( + self, + schemata_key: tuple[str, str], + enable_thinking: bool = False, + ) -> LogitsProcessorBase: """ init logits processor by type and schemata. 
Args: schemata_key (tuple[str, str]): Tuple containing processor type and schema string + enable_thinking (bool): Whether to enable thinking step Returns: LogitsProcessorBase: Initialized logits processor instance @@ -221,18 +246,22 @@ def _init_logits_processor(self, schemata_key: tuple[str, str]) -> LogitsProcess """ key_type, schemata = schemata_key if key_type == "json": - return self._json_processor(schemata) + return self._json_processor(schemata, enable_thinking) elif key_type == "regex": - return self._regex_processor(schemata) + return self._regex_processor(schemata, enable_thinking) elif key_type == "grammar": - return self._grammar_processor(schemata) + return self._grammar_processor(schemata, enable_thinking) elif key_type == "structural_tag": - return self._structural_tag_processor(schemata) + return self._structural_tag_processor(schemata, enable_thinking) else: llm_logger.error(f"Unsupported processor type {key_type}.") return None - def get_logits_processor(self, schemata_key: tuple[str, str]) -> tuple[LogitsProcessorBase, bool]: + def get_logits_processor( + self, + schemata_key: tuple[str, str], + enable_thinking: bool = False, + ) -> tuple[LogitsProcessorBase, bool]: """ get logits processor by key from cache or create new one. @@ -246,8 +275,10 @@ def get_logits_processor(self, schemata_key: tuple[str, str]) -> tuple[LogitsPro """ value = self.cache.get(schemata_key, None) if value: - return value.copy(), True - value = self.executor.submit(self._init_logits_processor, schemata_key) + value_copy = value.copy() + value_copy.enable_reasoning = enable_thinking + return value_copy, True + value = self.executor.submit(self._init_logits_processor, schemata_key, enable_thinking) return value, False def _get_tokenizer_hf(self): @@ -266,9 +297,7 @@ def _get_tokenizer_hf(self): try: architectures = self.fd_config.model_config.architectures if not ErnieArchitectures.contains_ernie_arch(architectures): - from transformers import AutoTokenizer, PreTrainedTokenizerFast - tokenizer = AutoTokenizer.from_pretrained( self.fd_config.model_config.model, use_fast=False, diff --git a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py index f702a1085e..b03ff09291 100644 --- a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py +++ b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py @@ -23,11 +23,9 @@ from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request -from fastdeploy.model_executor.guided_decoding.base_guided_decoding import ( - BackendBase, - BaseChecker, - LogitsProcessorBase, -) +from fastdeploy.model_executor.guided_decoding import (BackendBase, + BaseChecker, + LogitsProcessorBase) from fastdeploy.utils import llm_logger try: @@ -56,7 +54,6 @@ class XGrammarProcessor(LogitsProcessorBase): max_rollback_tokens (int): Maximum number of tokens to rollback on mismatch vocab_size (int): Size of the vocabulary batch_size (int): Batch size for processing - splitwise_role (str): Role for splitwise processing compiled_grammar (CompiledGrammar): Compiled grammar rules terminate_without_stop_token (bool): Whether to terminate without stop token override_stop_tokens (Optional[List[int]]): Custom stop tokens @@ -70,13 +67,12 @@ def __init__( override_stop_tokens: Optional[List[int]] = None, vocab_size: Optional[int] = None, batch_size: Optional[int] = None, - splitwise_role: str = "mixed", + enable_thinking: bool = False, ): - super().__init__() + 
super().__init__(enable_reasoning=enable_thinking) self.max_rollback_tokens = 200 self.vocab_size = vocab_size self.batch_size = batch_size - self.splitwise_role = splitwise_role self.compiled_grammar = compiled_grammar self.terminate_without_stop_token = terminate_without_stop_token self.override_stop_tokens = override_stop_tokens @@ -187,7 +183,6 @@ def copy(self) -> "XGrammarProcessor": override_stop_tokens=self.override_stop_tokens, vocab_size=self.vocab_size, batch_size=self.batch_size, - splitwise_role=self.splitwise_role, ) @@ -202,7 +197,6 @@ class XGrammarBackend(BackendBase): vocab_size (int): Size of the vocabulary from config batch_size (int): Maximum batch size from config any_whitespace (bool): Whether to allow any whitespace in JSON - splitwise_role (str): Role for splitwise processing grammar_compiler (GrammarCompiler): Grammar compilation engine """ @@ -216,7 +210,6 @@ def __init__( self.batch_size = fd_config.parallel_config.max_num_seqs self.any_whitespace = not fd_config.parallel_config.disable_any_whitespace - self.splitwise_role = fd_config.parallel_config.splitwise_role try: tokenizer_info = TokenizerInfo.from_huggingface(self.hf_tokenizer, vocab_size=self.vocab_size) @@ -229,6 +222,7 @@ def _create_processor( compiled_grammar: CompiledGrammar, terminate_without_stop_token: bool = False, override_stop_tokens: Optional[List[int]] = None, + enable_thinking: bool = False, ) -> XGrammarProcessor: """ Create a logits processor instance for the given compiled grammar. @@ -237,6 +231,7 @@ def _create_processor( compiled_grammar (CompiledGrammar): Compiled grammar rules terminate_without_stop_token (bool): Whether to terminate without stop token override_stop_tokens (Optional[List[int]]): Custom stop tokens to override defaults + enable_thinking (bool): Whether to enable thinking mode Returns: XGrammarProcessor: Configured grammar processor instance @@ -247,15 +242,16 @@ def _create_processor( override_stop_tokens=override_stop_tokens, vocab_size=self.vocab_size, batch_size=self.batch_size, - splitwise_role=self.splitwise_role, + enable_thinking=enable_thinking, ) - def _json_processor(self, schemata: str) -> Optional[XGrammarProcessor]: + def _json_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: """ Compile JSON schema into a grammar processor. Args: schemata (str): JSON schema string to compile + enable_thinking (bool): Whether to enable thinking mode Returns: Optional[XGrammarProcessor]: Configured processor if successful, None on failure @@ -265,14 +261,15 @@ def _json_processor(self, schemata: str) -> Optional[XGrammarProcessor]: except Exception as e: llm_logger.error(f"Failed to compile json schema: {e}") return None - return self._create_processor(compiled_grammar) + return self._create_processor(compiled_grammar, enable_thinking=enable_thinking) - def _regex_processor(self, schemata: str) -> Optional[XGrammarProcessor]: + def _regex_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: """ Compile regex pattern into a grammar processor. 
Args: schemata (str): Regex pattern string to compile + enable_thinking (bool): Whether to enable thinking mode Returns: Optional[XGrammarProcessor]: Configured processor if successful, None on failure @@ -282,14 +279,15 @@ def _regex_processor(self, schemata: str) -> Optional[XGrammarProcessor]: except Exception as e: llm_logger.error(f"Failed to compile regex schema: {e}") return None - return self._create_processor(compiled_grammar) + return self._create_processor(compiled_grammar, enable_thinking=enable_thinking) - def _grammar_processor(self, schemata: str) -> Optional[XGrammarProcessor]: + def _grammar_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: """ Compile grammar (EBNF) into a grammar processor. Args: schemata (str): Grammar string in EBNF format + enable_thinking (bool): Whether to enable thinking mode Returns: Optional[XGrammarProcessor]: Configured processor if successful, None on failure @@ -299,9 +297,9 @@ def _grammar_processor(self, schemata: str) -> Optional[XGrammarProcessor]: except Exception as e: llm_logger.error(f"Failed to compile ebnf schema: {e}") return None - return self._create_processor(compiled_grammar) + return self._create_processor(compiled_grammar, enable_thinking=enable_thinking) - def _structural_tag_processor(self, schemata: str) -> Optional[XGrammarProcessor]: + def _structural_tag_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: """ Compile structural tags into a grammar processor. @@ -326,7 +324,7 @@ def _structural_tag_processor(self, schemata: str) -> Optional[XGrammarProcessor except Exception as e: llm_logger.error(f"Failed to compile structural tags schema: {e}") return None - return self._create_processor(compiled_grammar) + return self._create_processor(compiled_grammar, enable_thinking=enable_thinking) class XGrammarChecker(BaseChecker): diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py index f064cf9d1e..2763052660 100644 --- a/fastdeploy/model_executor/layers/sample/sampler.py +++ b/fastdeploy/model_executor/layers/sample/sampler.py @@ -23,9 +23,7 @@ from paddle import nn from fastdeploy.config import FDConfig -from fastdeploy.model_executor.guided_decoding.base_guided_decoding import ( - LogitsProcessorBase, -) +from fastdeploy.model_executor.guided_decoding import LogitsProcessorBase from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata from fastdeploy.model_executor.layers.sample.ops import ( apply_penalty_multi_scores, @@ -34,6 +32,7 @@ top_k_top_p_sampling, ) from fastdeploy.platforms import current_platform +from fastdeploy.reasoning import ReasoningParser from fastdeploy.worker.output import LogprobsTensors, SamplerOutput @@ -48,6 +47,10 @@ def __init__(self): self.logits_processor: Dict[int, Optional[Any]] = dict() self.executor = ThreadPoolExecutor() self.logits_lock = threading.Lock() + self.reasoning_parser = None + + def apply_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): + self.reasoning_parser = reasoning_parser def add_logits_processor( self, @@ -124,9 +127,16 @@ def apply_token_mask(self, logits: paddle.Tensor, skip_idx_list: List[int] = []) if available_processors is None: return logits - indices = list(self.logits_processor.keys()) - mask_idx = [i for i in indices if i not in skip_idx_list] - return available_processors.apply_token_mask(logits, self.token_bitmask, indices=mask_idx) + indices = [] + for idx, 
processor in self.logits_processor.items(): + if processor is None or idx in skip_idx_list: + continue + if not processor.enable_reasoning or processor.reasoning_ended: + indices.append(idx) + + return available_processors.apply_token_mask(logits, + self.token_bitmask, + indices=indices) def _accept_token(self, idx: int, token: int): """accept token""" @@ -136,6 +146,15 @@ def _accept_token(self, idx: int, token: int): if self.logits_processor[idx].is_terminated(): return + if ( + self.reasoning_parser is not None + and self.logits_processor[idx].enable_reasoning + and not self.logits_processor[idx].reasoning_ended + ): + reasoning_ended = self.reasoning_parser.is_reasoning_end([token]) + self.logits_processor[idx].reasoning_ended = reasoning_ended + return + self.logits_processor[idx].accept_token(token) def update_output_tokens(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): @@ -181,19 +200,25 @@ def __init__(self): self.processor = SamplerProcessor() - def apply_logits_processor( - self, - ids: int, - future: Optional[Any] = None, - prefill_tokens: List[int] = [], - ): - """apply logits processor to sampler""" + def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): + """ set reasoning parser """ + self.processor.apply_reasoning_parser(reasoning_parser) + + def apply_logits_processor(self, + ids: int, + future: Optional[Any] = None, + prefill_tokens: List[int] = []): + """ apply logits processor to sampler """ self.processor.add_logits_processor(ids, future, prefill_tokens) def pre_process(self, skip_idx_list: List[int] = []): """pre process before running""" self.processor.pre_process(skip_idx_list) + def post_process(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): + """ post process after running """ + self.processor.update_output_tokens(next_tokens, skip_idx_list) + def compute_logprobs(self, logits: paddle.Tensor) -> paddle.Tensor: """ """ return F.log_softmax(logits, axis=-1) @@ -276,8 +301,6 @@ def forward_cuda( None if num_logprobs is None else self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=next_tokens) ) - self.processor.update_output_tokens(next_tokens, skip_idx_list) - sampler_output = SamplerOutput( # The sampled tokens are expanded to 2D tensor with shape # [num_requests, 1], where each row represents one generated @@ -309,13 +332,19 @@ def pre_process(self, skip_idx_list: List[int] = []): """pre process before running""" pass - def apply_logits_processor( - self, - ids: int, - future: Optional[Any] = None, - prefill_tokens: List[int] = [], - ): - """apply logits processor to sampler""" + def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): + """ set reasoning parser """ + pass + + def post_process(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): + """ post process after running """ + pass + + def apply_logits_processor(self, + ids: int, + future: Optional[Any] = None, + prefill_tokens: List[int] = []): + """ apply logits processor to sampler """ pass def forward_cuda( @@ -409,6 +438,14 @@ def apply_logits_processor( """apply logits processor to sampler""" pass + def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): + """ set reasoning parser """ + pass + + def post_process(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): + """ post process after running """ + pass + def forward_cuda( self, logits: paddle.Tensor, diff --git a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py 
b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py index f5762b791f..6589892a4d 100644 --- a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py @@ -46,6 +46,20 @@ def __init__(self, tokenizer): if self.think_end_token_id is None: raise RuntimeError("Ernie VL reasoning parser could not locate think end " "tokens in the tokenizer!") + def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: + """ + Check if the reasoning content ends in the input_ids. + It is used in structured engines like `xgrammar` to check if the + reasoning content ends in the model output. + Parameters: + input_ids: list[int] + The input_ids of the model output. + Returns: + bool + True if the reasoning content ends in the input_ids. + """ + return self.think_end_token_id in input_ids + def extract_reasoning_content_streaming( self, previous_text: str, diff --git a/fastdeploy/reasoning/qwen3_reasoning_parsers.py b/fastdeploy/reasoning/qwen3_reasoning_parsers.py index 4fc565c6c1..fd00e675e4 100644 --- a/fastdeploy/reasoning/qwen3_reasoning_parsers.py +++ b/fastdeploy/reasoning/qwen3_reasoning_parsers.py @@ -48,6 +48,20 @@ def __init__(self, tokenizer): if self.think_end_token_id is None: raise RuntimeError("Qwen3 reasoning parser could not locate think end " "tokens in the tokenizer!") + def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: + """ + Check if the reasoning content ends in the input_ids. + It is used in structured engines like `xgrammar` to check if the + reasoning content ends in the model output. + Parameters: + input_ids: list[int] + The input_ids of the model output. + Returns: + bool + True if the reasoning content ends in the input_ids. + """ + return self.think_end_token_id in input_ids + def extract_reasoning_content_streaming( self, previous_text: str, diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 4ec2411ec4..ced71571da 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -29,10 +29,8 @@ profile_run_guard, sot_warmup_guard, ) -from fastdeploy.model_executor.guided_decoding import get_guided_backend -from fastdeploy.model_executor.guided_decoding.base_guided_decoding import ( - LogitsProcessorBase, -) +from fastdeploy.model_executor.guided_decoding import (LogitsProcessorBase, + get_guided_backend) from fastdeploy.model_executor.layers.attention import get_attention_backend from fastdeploy.model_executor.layers.attention.base_attention_backend import ( AttentionBackend, @@ -83,10 +81,6 @@ def __init__( self.speculative_decoding = self.speculative_method is not None self.enable_logprob = fd_config.model_config.enable_logprob - self.guided_backend = None - if self.fd_config.parallel_config.guided_decoding_backend != "off": - self.guided_backend = get_guided_backend(fd_config=self.fd_config) - # VL model config: if self.enable_mm: self._init_image_preprocess() @@ -115,6 +109,11 @@ def __init__( else: self.sampler = SpeculativeSampler(fd_config) + self.guided_backend = None + if self.fd_config.parallel_config.guided_decoding_backend != "off": + self.guided_backend = get_guided_backend(fd_config=self.fd_config) + self.sampler.set_reasoning_parser(self.guided_backend.get_reasoning_parser()) + # Lazy initialize kv cache after model loading # self.kv_caches: list[paddle.Tensor] = [] @@ -191,7 +190,10 @@ def _init_logits_processor(self, request): elif request.structural_tag is not None: schemata_key = ("structural_tag", 
request.structural_tag) - return self.guided_backend.get_logits_processor(schemata_key=schemata_key), schemata_key + return self.guided_backend.get_logits_processor( + schemata_key=schemata_key, + enable_thinking=request.get("enable_thinking"), + ), schemata_key def insert_tasks_v1(self, req_dicts: List[Request]): """ @@ -395,8 +397,7 @@ def insert_prefill_inputs(self, req_dicts: List[Request]): self.share_inputs["prompt_lens"][idx : idx + 1] = length if self.enable_mm: - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True + enable_thinking = request.get("enable_thinking") self.share_inputs["enable_thinking"][:] = enable_thinking self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) @@ -1160,10 +1161,14 @@ def _get_skip_idx(self, model_forward_batch: Optional[List[Request]] = None): Returns: A list of indices corresponding to the requests that need to be skipped. """ - skip_idx_list = [] - if not self.cache_config.enable_chunked_prefill or self.guided_backend is None: - return skip_idx_list + if ( + not self.parallel_config.enable_chunked_prefill + or self.guided_backend is None + or model_forward_batch is None + ): + return [] + skip_idx_list = [] for task in model_forward_batch: if task.get("prefill_chunk_info", None) is None or task.chunk_idx >= len(task.prefill_chunk_info): continue @@ -1247,6 +1252,7 @@ class at the server level, which is too granular for ModelRunner. if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) + self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) else: self.sampler( logits, diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index d23d57bf7e..0dbcc696d3 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -565,6 +565,11 @@ def parse_args(): action="store_true", help="Enable output of token-level log probabilities.", ) + parser.add_argument("--reasoning_parser", + type=str, + default=None, + help="Flag specifies the reasoning parser to use for " \ + "extracting reasoning content from the model output") args = parser.parse_args() return args From 6bd36760b3e7bd5b8bf2a6560d0b6af2dd1f49c9 Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 18 Jul 2025 16:42:36 +0800 Subject: [PATCH 02/20] update code --- fastdeploy/config.py | 2 +- fastdeploy/input/ernie_vl_processor.py | 3 ++- fastdeploy/input/text_processor.py | 4 +--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index a68ae6d58e..7f2e309eb7 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -119,7 +119,7 @@ def __init__( self.reasoning_parser = None for key, value in args.items(): - if hasattr(self, key): + if hasattr(self, key) and value != "None": setattr(self, key, value) assert self.model != "" diff --git a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py index afb399910d..845f2b613f 100644 --- a/fastdeploy/input/ernie_vl_processor.py +++ b/fastdeploy/input/ernie_vl_processor.py @@ -162,6 +162,7 @@ def _check_mm_limits(self, item): def process_request_dict(self, request, max_model_len=None): """process the input data""" + request['enable_thinking'] = self.get_enable_thinking(request.get("enable_thinking")) if not request.get("eos_token_ids"): 
request["eos_token_ids"] = self.eos_token_ids @@ -244,7 +245,7 @@ def process_response_dict(self, response_dict, stream, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) + enable_thinking = self.get_enable_thinking(kwargs.pop("enable_thinking", None)) if stream: return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) else: diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index d775fc7db7..40fb8e8784 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -430,9 +430,7 @@ def process_response_dict(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.pop("enable_thinking", True) - if enable_thinking is None: - enable_thinking = True + enable_thinking = self.get_enable_thinking(kwargs.pop("enable_thinking", None)) stream = kwargs.get("stream", True) if stream: return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) From 65458b33264a3db4b3439931b24c8fa6a36f222f Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 1 Aug 2025 18:04:28 +0800 Subject: [PATCH 03/20] update code --- fastdeploy/engine/request.py | 4 +-- fastdeploy/input/ernie_processor.py | 5 +-- fastdeploy/input/ernie_vl_processor.py | 3 +- fastdeploy/input/text_processor.py | 12 ++++--- .../model_executor/layers/sample/sampler.py | 36 ++++++++----------- fastdeploy/worker/gpu_model_runner.py | 20 ++++++----- 6 files changed, 40 insertions(+), 40 deletions(-) diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index db183bb27a..9bb5709be3 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -69,7 +69,7 @@ def __init__( guided_grammar: Optional[Any] = None, structural_tag: Optional[Any] = None, guided_json_object: Optional[bool] = None, - enable_thinking: Optional[bool] = True, + enable_thinking: Optional[bool] = None, trace_carrier: dict = dict(), ) -> None: self.request_id = request_id @@ -147,7 +147,7 @@ def from_dict(cls, d: dict): guided_grammar=d.get("guided_grammar", None), structural_tag=d.get("structural_tag", None), guided_json_object=d.get("guided_json_object", None), - enable_thinking=d.get("enable_thinking", True), + enable_thinking=d.get("enable_thinking", None), trace_carrier=d.get("trace_carrier", {}), ) diff --git a/fastdeploy/input/ernie_processor.py b/fastdeploy/input/ernie_processor.py index f66640a7b3..ab79c7e71b 100644 --- a/fastdeploy/input/ernie_processor.py +++ b/fastdeploy/input/ernie_processor.py @@ -60,6 +60,7 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None): self.eos_token_ids = [self.tokenizer.eos_token_id] self.eos_token_id_len = len(self.eos_token_ids) self.pad_token_id = self.get_pad_id() + self.reasoning_parser = None if reasoning_parser_obj: self.reasoning_parser = reasoning_parser_obj(self.tokenizer) @@ -203,7 +204,7 @@ def process_response(self, response_dict, **kwargs): response_dict.outputs.reasoning_content = reasoning_content else: response_dict.outputs.text = full_text - data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}") + data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}") if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "": return None return response_dict @@ -233,7 +234,6 @@ def process_response_dict_normal(self, response_dict, **kwargs): Returns: Dict: 
response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") token_ids = response_dict["outputs"]["token_ids"] is_end = response_dict["finished"] req_id = response_dict["request_id"] @@ -243,6 +243,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id) if is_end: full_text = previous_texts + delta_text + enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) response_dict["outputs"]["text"] = text diff --git a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py index 845f2b613f..e0ba224495 100644 --- a/fastdeploy/input/ernie_vl_processor.py +++ b/fastdeploy/input/ernie_vl_processor.py @@ -78,7 +78,6 @@ def _load_tokenizer(self): def process_request(self, request, max_model_len=None, **kwargs): """process the input data""" task = request.to_dict() - task['enable_thinking'] = self.get_enable_thinking(kwargs.get("enable_thinking")) self.process_request_dict(task, max_model_len) request = Request.from_dict(task) @@ -162,7 +161,7 @@ def _check_mm_limits(self, item): def process_request_dict(self, request, max_model_len=None): """process the input data""" - request['enable_thinking'] = self.get_enable_thinking(request.get("enable_thinking")) + request["enable_thinking"] = self.get_enable_thinking(request.get("enable_thinking")) if not request.get("eos_token_ids"): request["eos_token_ids"] = self.eos_token_ids diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index 40fb8e8784..58682276a1 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -65,8 +65,7 @@ def get_enable_thinking(self, enable_thinking=None): if enable_thinking and self.reasoning_parser is None: enable_thinking = False data_processor_logger.warning( - "enable_thinking is True, but reasoning_parser is None. " - "enable_thinking will be set to False." + "enable_thinking is True, but reasoning_parser is None. " "enable_thinking will be set to False." 
) return enable_thinking @@ -89,6 +88,10 @@ def set_value(req, key, value): set_value(request, "repetition_penalty", 1.0) set_value(request, "frequency_penalty", 0.0) set_value(request, "presence_penalty", 0.0) + + enable_thinking = self.get_enable_thinking(request.get("enable_thinking")) + set_value(request, "enable_thinking", enable_thinking) + return request @abstractmethod @@ -287,6 +290,7 @@ def process_request_dict(self, request, max_model_len=None, **kwargs): str: error message """ request = self._apply_default_parameters(request) + request["enable_thinking"] = self.get_enable_thinking(kwargs.get("enable_thinking")) if not request.get("eos_token_ids"): request["eos_token_ids"] = self.eos_token_ids @@ -348,7 +352,7 @@ def process_response(self, response_dict, **kwargs): else: # 模型不支持思考,并且没单独设置enable_thinking为false response_dict.outputs.text = full_text - data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}") + data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}") return response_dict @@ -362,7 +366,6 @@ def process_response_dict_normal(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = kwargs.get("enable_thinking") token_ids = response_dict["outputs"]["token_ids"] is_end = response_dict["finished"] req_id = response_dict["request_id"] @@ -372,6 +375,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id) if is_end: full_text = previous_texts + delta_text + enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) response_dict["outputs"]["text"] = text diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py index 2763052660..d86c0716a4 100644 --- a/fastdeploy/model_executor/layers/sample/sampler.py +++ b/fastdeploy/model_executor/layers/sample/sampler.py @@ -131,12 +131,10 @@ def apply_token_mask(self, logits: paddle.Tensor, skip_idx_list: List[int] = []) for idx, processor in self.logits_processor.items(): if processor is None or idx in skip_idx_list: continue - if not processor.enable_reasoning or processor.reasoning_ended: + if self.reasoning_parser is None or not processor.enable_reasoning or processor.reasoning_ended: indices.append(idx) - return available_processors.apply_token_mask(logits, - self.token_bitmask, - indices=indices) + return available_processors.apply_token_mask(logits, self.token_bitmask, indices=indices) def _accept_token(self, idx: int, token: int): """accept token""" @@ -201,14 +199,11 @@ def __init__(self): self.processor = SamplerProcessor() def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): - """ set reasoning parser """ + """set reasoning parser""" self.processor.apply_reasoning_parser(reasoning_parser) - def apply_logits_processor(self, - ids: int, - future: Optional[Any] = None, - prefill_tokens: List[int] = []): - """ apply logits processor to sampler """ + def apply_logits_processor(self, ids: int, future: Optional[Any] = None, prefill_tokens: List[int] = []): + """apply logits processor to sampler""" self.processor.add_logits_processor(ids, future, prefill_tokens) def pre_process(self, skip_idx_list: List[int] = []): @@ -216,7 +211,7 @@ def pre_process(self, skip_idx_list: List[int] = []): self.processor.pre_process(skip_idx_list) def 
post_process(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): - """ post process after running """ + """post process after running""" self.processor.update_output_tokens(next_tokens, skip_idx_list) def compute_logprobs(self, logits: paddle.Tensor) -> paddle.Tensor: @@ -270,12 +265,12 @@ def forward_cuda( skip_idx_list: List[int] = [], ) -> SamplerOutput: """ """ + logits = self.processor.apply_token_mask(logits, skip_idx_list) + num_logprobs = sampling_metadata.max_num_logprobs if num_logprobs is not None: raw_logprobs = self.compute_logprobs(logits) - logits = self.processor.apply_token_mask(logits, skip_idx_list) - logits = apply_penalty_multi_scores( sampling_metadata.pre_token_ids, sampling_metadata.prompt_ids, @@ -333,18 +328,15 @@ def pre_process(self, skip_idx_list: List[int] = []): pass def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): - """ set reasoning parser """ + """set reasoning parser""" pass def post_process(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): - """ post process after running """ + """post process after running""" pass - def apply_logits_processor(self, - ids: int, - future: Optional[Any] = None, - prefill_tokens: List[int] = []): - """ apply logits processor to sampler """ + def apply_logits_processor(self, ids: int, future: Optional[Any] = None, prefill_tokens: List[int] = []): + """apply logits processor to sampler""" pass def forward_cuda( @@ -439,11 +431,11 @@ def apply_logits_processor( pass def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): - """ set reasoning parser """ + """set reasoning parser""" pass def post_process(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): - """ post process after running """ + """post process after running""" pass def forward_cuda( diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index ced71571da..d851c34a9f 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -29,8 +29,10 @@ profile_run_guard, sot_warmup_guard, ) -from fastdeploy.model_executor.guided_decoding import (LogitsProcessorBase, - get_guided_backend) +from fastdeploy.model_executor.guided_decoding import ( + LogitsProcessorBase, + get_guided_backend, +) from fastdeploy.model_executor.layers.attention import get_attention_backend from fastdeploy.model_executor.layers.attention.base_attention_backend import ( AttentionBackend, @@ -190,10 +192,13 @@ def _init_logits_processor(self, request): elif request.structural_tag is not None: schemata_key = ("structural_tag", request.structural_tag) - return self.guided_backend.get_logits_processor( - schemata_key=schemata_key, - enable_thinking=request.get("enable_thinking"), - ), schemata_key + return ( + self.guided_backend.get_logits_processor( + schemata_key=schemata_key, + enable_thinking=request.get("enable_thinking"), + ), + schemata_key, + ) def insert_tasks_v1(self, req_dicts: List[Request]): """ @@ -1251,8 +1256,6 @@ class at the server level, which is too granular for ModelRunner. ) if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) - - self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) else: self.sampler( logits, @@ -1313,6 +1316,7 @@ class at the server level, which is too granular for ModelRunner. 
speculative_decoding=self.speculative_decoding, skip_save_output=skip_save_output, ) + self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) # 6. Speculative decode if self.speculative_decoding: From f1141fbfadfa183e73d4c7df13d51d9b4cfd1e92 Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 1 Aug 2025 18:18:30 +0800 Subject: [PATCH 04/20] update format --- .../guided_decoding/__init__.py | 7 +++++-- .../guided_decoding/base_guided_decoding.py | 20 ++++++++++--------- .../guided_decoding/xgrammar_backend.py | 16 ++++++++------- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/fastdeploy/model_executor/guided_decoding/__init__.py b/fastdeploy/model_executor/guided_decoding/__init__.py index 01e887502e..9336f4a04e 100644 --- a/fastdeploy/model_executor/guided_decoding/__init__.py +++ b/fastdeploy/model_executor/guided_decoding/__init__.py @@ -16,9 +16,12 @@ # from fastdeploy.config import FDConfig from fastdeploy.model_executor.guided_decoding.base_guided_decoding import ( - BackendBase, BaseChecker, LogitsProcessorBase) + BackendBase, + BaseChecker, + LogitsProcessorBase, +) -__all__ = ['get_guided_backend', 'schema_checker', 'LogitsProcessorBase', 'BackendBase', 'BaseChecker'] +__all__ = ["get_guided_backend", "schema_checker", "LogitsProcessorBase", "BackendBase", "BaseChecker"] def get_guided_backend( diff --git a/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py b/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py index adcfbba6c6..dd50f39b38 100644 --- a/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py +++ b/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py @@ -143,7 +143,8 @@ def __init__(self, fd_config: FDConfig): self.hf_tokenizer = self._get_tokenizer_hf() if self.fd_config.model_config.reasoning_parser: reasoning_parser_obj = ReasoningParserManager.get_reasoning_parser( - self.fd_config.model_config.reasoning_parser) + self.fd_config.model_config.reasoning_parser + ) self.reasoning_parser = reasoning_parser_obj(self.hf_tokenizer) def _create_processor(self): @@ -227,10 +228,10 @@ def get_reasoning_parser(self): return self.reasoning_parser def _init_logits_processor( - self, - schemata_key: tuple[str, str], - enable_thinking: bool = False, - ) -> LogitsProcessorBase: + self, + schemata_key: tuple[str, str], + enable_thinking: bool = False, + ) -> LogitsProcessorBase: """ init logits processor by type and schemata. @@ -258,10 +259,10 @@ def _init_logits_processor( return None def get_logits_processor( - self, - schemata_key: tuple[str, str], - enable_thinking: bool = False, - ) -> tuple[LogitsProcessorBase, bool]: + self, + schemata_key: tuple[str, str], + enable_thinking: bool = False, + ) -> tuple[LogitsProcessorBase, bool]: """ get logits processor by key from cache or create new one. 
@@ -298,6 +299,7 @@ def _get_tokenizer_hf(self): architectures = self.fd_config.model_config.architectures if not ErnieArchitectures.contains_ernie_arch(architectures): from transformers import AutoTokenizer, PreTrainedTokenizerFast + tokenizer = AutoTokenizer.from_pretrained( self.fd_config.model_config.model, use_fast=False, diff --git a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py index b03ff09291..2349e85bf0 100644 --- a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py +++ b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py @@ -23,9 +23,11 @@ from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request -from fastdeploy.model_executor.guided_decoding import (BackendBase, - BaseChecker, - LogitsProcessorBase) +from fastdeploy.model_executor.guided_decoding import ( + BackendBase, + BaseChecker, + LogitsProcessorBase, +) from fastdeploy.utils import llm_logger try: @@ -245,7 +247,7 @@ def _create_processor( enable_thinking=enable_thinking, ) - def _json_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: + def _json_processor(self, schemata: str, enable_thinking: bool = False) -> Optional[XGrammarProcessor]: """ Compile JSON schema into a grammar processor. @@ -263,7 +265,7 @@ def _json_processor(self, schemata: str, enable_thinking: bool=False) -> Optiona return None return self._create_processor(compiled_grammar, enable_thinking=enable_thinking) - def _regex_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: + def _regex_processor(self, schemata: str, enable_thinking: bool = False) -> Optional[XGrammarProcessor]: """ Compile regex pattern into a grammar processor. @@ -281,7 +283,7 @@ def _regex_processor(self, schemata: str, enable_thinking: bool=False) -> Option return None return self._create_processor(compiled_grammar, enable_thinking=enable_thinking) - def _grammar_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: + def _grammar_processor(self, schemata: str, enable_thinking: bool = False) -> Optional[XGrammarProcessor]: """ Compile grammar (EBNF) into a grammar processor. @@ -299,7 +301,7 @@ def _grammar_processor(self, schemata: str, enable_thinking: bool=False) -> Opti return None return self._create_processor(compiled_grammar, enable_thinking=enable_thinking) - def _structural_tag_processor(self, schemata: str, enable_thinking: bool=False) -> Optional[XGrammarProcessor]: + def _structural_tag_processor(self, schemata: str, enable_thinking: bool = False) -> Optional[XGrammarProcessor]: """ Compile structural tags into a grammar processor. From b8f8d717c9c8a5fd81e2a7d11873de3dc0fef005 Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 4 Aug 2025 10:14:58 +0800 Subject: [PATCH 05/20] update code --- fastdeploy/worker/gpu_model_runner.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 6a6b52a7e2..c4d623cd70 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1234,11 +1234,7 @@ def _get_skip_idx(self, model_forward_batch: Optional[List[Request]] = None): Returns: A list of indices corresponding to the requests that need to be skipped. 
""" - if ( - not self.parallel_config.enable_chunked_prefill - or self.guided_backend is None - or model_forward_batch is None - ): + if not self.cache_config.enable_chunked_prefill or self.guided_backend is None or model_forward_batch is None: return [] skip_idx_list = [] From ce01f296ffed6cb9cb4823604f6f45ba3d394080 Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 4 Aug 2025 19:05:58 +0800 Subject: [PATCH 06/20] update code --- fastdeploy/worker/gpu_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index c4d623cd70..33ed69ddd2 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1380,7 +1380,8 @@ class at the server level, which is too granular for ModelRunner. speculative_decoding=self.speculative_decoding, skip_save_output=skip_save_output, ) - self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) + if sampler_output is not None: + self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) # 6. Speculative decode if self.speculative_decoding: From c2d64b9f9ab4f7a8ca2b1159ace1de1a0c8467ea Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 4 Aug 2025 20:26:16 +0800 Subject: [PATCH 07/20] add enable_thinking default --- fastdeploy/input/ernie_vl_processor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py index 4df4c38ec8..1e3ebd92a4 100644 --- a/fastdeploy/input/ernie_vl_processor.py +++ b/fastdeploy/input/ernie_vl_processor.py @@ -104,6 +104,9 @@ def set_value(req, key, value): set_value(request, "repetition_penalty", 1.0) set_value(request, "frequency_penalty", 0.0) set_value(request, "presence_penalty", 0.0) + + enable_thinking = self.get_enable_thinking(request.get("enable_thinking", None)) + set_value(request, "enable_thinking", enable_thinking) return request def process_request(self, request, max_model_len=None, **kwargs): From da81a946248deb241bfbe85f966b85fb4c27fe94 Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 5 Aug 2025 17:10:53 +0800 Subject: [PATCH 08/20] update code --- fastdeploy/engine/engine.py | 55 +++++++++++++++------------ fastdeploy/worker/gpu_model_runner.py | 4 +- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 0475278382..4f4aaf6da0 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -436,19 +436,14 @@ def _insert_zmq_task_to_scheduler(self): llm_logger.debug(f"Receive request: {request}") err_msg = None - if ( - request.guided_json is not None - or request.guided_regex is not None - or request.structural_tag is not None - or request.guided_grammar is not None - ) and self.guided_decoding_checker is None: - err_msg = ( - "guided_backend is None, use --guided-decoding-backend to " - "specify the backend at server startup." - ) - - if self.guided_decoding_checker is not None: - request, err_msg = self.guided_decoding_checker.schema_format(request) + if self._has_guided_input(request): + if self.guided_decoding_checker is None: + err_msg = ( + "guided_backend is None, use --guided-decoding-backend to " + "specify the backend at server startup." 
+ ) + else: + request, err_msg = self.guided_decoding_checker.schema_format(request) if err_msg is not None: llm_logger.error(err_msg) @@ -488,6 +483,20 @@ def _insert_zmq_task_to_scheduler(self): f"traceback={traceback.format_exc()}" ) + def _has_guided_input(self, request): + """ + Check if the request has any guided input. + """ + return any( + x is not None + for x in ( + request.guided_json, + request.guided_regex, + request.structural_tag, + request.guided_grammar, + ) + ) + def add_requests(self, task, sampling_params=None, **kwargs): """ Add a new request to the queue. @@ -541,18 +550,14 @@ def add_requests(self, task, sampling_params=None, **kwargs): llm_logger.error(error_msg) raise EngineError(error_msg, error_code=400) - if ( - request.guided_json is not None - or request.guided_regex is not None - or request.structural_tag is not None - or request.guided_grammar is not None - ) and self.guided_decoding_checker is None: - err_msg = "guided_backend is None, use --guided-decoding-backend to specify the backend at server startup." - llm_logger.error(err_msg) - raise EngineError(err_msg, error_code=400) - - if self.guided_decoding_checker is not None: - request, err_msg = self.guided_decoding_checker.schema_format(request) + if self._has_guided_input(request): + err_msg = None + if self.guided_decoding_checker is None: + err_msg = ( + "guided_backend is None, use --guided-decoding-backend to specify the backend at server startup." + ) + else: + request, err_msg = self.guided_decoding_checker.schema_format(request) if err_msg is not None: llm_logger.error(err_msg) raise EngineError(err_msg, error_code=400) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 33ed69ddd2..912db4cdc9 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1380,7 +1380,7 @@ class at the server level, which is too granular for ModelRunner. speculative_decoding=self.speculative_decoding, skip_save_output=skip_save_output, ) - if sampler_output is not None: + if self.guided_backend is not None and sampler_output is not None: self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) # 6. Speculative decode @@ -1410,7 +1410,7 @@ def _add_cache(self, model_forward_batch) -> None: """ Add cache for guided decoding. """ - if self.guided_backend is None: + if self.guided_backend is None or model_forward_batch is None: return for request in model_forward_batch: From 255783922d9f1df4a876508e80c4329660290d3e Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 8 Aug 2025 16:09:36 +0800 Subject: [PATCH 09/20] add structured_outputs test case --- test/ci_use/EB_Lite/test_EB_Lite_serving.py | 333 +++++++++++++++++ .../EB_VL_Lite/test_EB_VL_Lite_serving.py | 341 ++++++++++++++++++ 2 files changed, 674 insertions(+) diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py index 85cddcba1c..b08821fd89 100644 --- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
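The engine changes above centralise the guided-input check in a `_has_guided_input` helper and reuse it both for tasks arriving over ZMQ and in `add_requests`. A minimal sketch of the same pattern follows; the `Request` fields mirror the ones the engine inspects, while the class itself and the `validate` helper are illustrative, not the engine's actual code.

```python
# Sketch only: detect guided-decoding constraints on a request and reject them
# when no guided-decoding checker/backend is configured.
from dataclasses import dataclass
from typing import Optional

@dataclass
class Request:
    guided_json: Optional[dict] = None
    guided_regex: Optional[str] = None
    guided_grammar: Optional[str] = None
    structural_tag: Optional[str] = None

def has_guided_input(request: Request) -> bool:
    # True if any guided-decoding constraint was supplied with the request.
    return any(
        x is not None
        for x in (request.guided_json, request.guided_regex, request.structural_tag, request.guided_grammar)
    )

def validate(request: Request, checker=None) -> Optional[str]:
    # Returns an error message (to be logged and turned into a 400 response) or None.
    if not has_guided_input(request):
        return None
    if checker is None:
        return "guided_backend is None, use --guided-decoding-backend to specify the backend at server startup."
    return None  # a real checker would normalise the schema here (schema_format)

print(validate(Request(guided_regex=r"\d+")))  # error message when no checker is configured
```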
+import json import os import signal import socket @@ -108,6 +109,8 @@ def setup_and_run_server(): "--use-cudagraph", "--graph-optimization-config", '{"cudagraph_capture_sizes": [1]}', + "--guided-decoding-backend", + "auto", ] # Start subprocess in new process group @@ -939,3 +942,333 @@ def test_streaming_completion_with_bad_words(openai_client, capsys): assert hasattr(chunk.choices[0], "text") output_1.append(chunk.choices[0].text) assert output_0 not in output_1 + + +def test_streaming_chat_base(openai_client, chat_param): + """ + Test streaming chat base functionality with the local service + """ + assert isinstance(chat_param, dict), f"{chat_param} should be a dict" + assert "messages" in chat_param, f"{chat_param} should contain messages" + + response = openai_client.chat.completions.create( + model="default", + stream=True, + **chat_param, + ) + + output = [] + for chunk in response: + if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"): + output.append(chunk.choices[0].delta.content) + assert len(output) > 2 + return "".join(output) + + +def test_non_streaming_chat_base(openai_client, chat_param): + """ + Test non streaming chat base functionality with the local service + """ + assert isinstance(chat_param, dict), f"{chat_param} should be a dict" + assert "messages" in chat_param, f"{chat_param} should contain messages" + + response = openai_client.chat.completions.create( + model="default", + stream=False, + **chat_param, + ) + + assert hasattr(response, "choices") + assert len(response.choices) > 0 + assert hasattr(response.choices[0], "message") + assert hasattr(response.choices[0].message, "content") + return response.choices[0].message.content + + +def test_structured_outputs_json_schema(openai_client): + """ + Test structured outputs json_schema functionality with the local service + """ + chat_param = { + "temperature": 1, + "max_tokens": 1024, + } + + # json_object + json_chat_param = { + "messages": [ + { + "role": "user", + "content": "Generate a JSON object containing: names of China's Four Great Inventions, their dynasties of origin, and brief descriptions (each under 50 characters)", + } + ], + "response_format": {"type": "json_object"}, + } + json_chat_param.update(chat_param) + + response = test_streaming_chat_base(openai_client, json_chat_param) + try: + json.loads(response) + is_valid = True + except ValueError: + is_valid = False + + assert is_valid, f"json_schema streaming response: {response} is not a valid json" + + response = test_non_streaming_chat_base(openai_client, json_chat_param) + try: + json.loads(response) + is_valid = True + except ValueError: + is_valid = False + + assert is_valid, f"json_schema non_streaming response: {response} is not a valid json" + + # json_schema + from enum import Enum + + from pydantic import BaseModel + + class BookType(str, Enum): + romance = "Romance" + historical = "Historical" + adventure = "Adventure" + mystery = "Mystery" + dystopian = "Dystopian" + + class BookDescription(BaseModel): + author: str + title: str + genre: BookType + + json_schema_param = { + "messages": [ + { + "role": "user", + "content": "Generate a JSON describing a literary work, including author, title and book type.", + } + ], + "response_format": { + "type": "json_schema", + "json_schema": {"name": "book-description", "schema": BookDescription.model_json_schema()}, + }, + } + json_schema_param.update(chat_param) + response = test_streaming_chat_base(openai_client, json_schema_param) + try: + json_schema_response = 
json.loads(response) + is_valid = True + except ValueError: + is_valid = False + + assert is_valid, f"json_schema streaming response: {response} is not a valid json" + assert ( + "author" in json_schema_response and "title" in json_schema_response and "genre" in json_schema_response + ), f"json_schema streaming response: {response} is not a valid book-description" + assert json_schema_response["genre"] in { + genre.value for genre in BookType + }, f"json_schema streaming response: {json_schema_response['genre']} is not a valid book-type" + + response = test_non_streaming_chat_base(openai_client, json_schema_param) + try: + json_schema_response = json.loads(response) + is_valid = True + except ValueError: + is_valid = False + + assert is_valid, f"json_schema non_streaming response: {response} is not a valid json" + assert ( + "author" in json_schema_response and "title" in json_schema_response and "genre" in json_schema_response + ), f"json_schema non_streaming response: {response} is not a valid book-description" + assert json_schema_response["genre"] in { + genre.value for genre in BookType + }, f"json_schema non_streaming response: {json_schema_response['genre']} is not a valid book-type" + + +def test_structured_outputs_structural_tag(openai_client): + """ + Test structured outputs structural_tag functionality with the local service + """ + content_str = """ + You have the following function available: + + { + "name": "get_current_date", + "description": "Get current date and time for given timezone", + "parameters": { + "type": "object", + "properties": { + "timezone": { + "type": "string", + "description": "Timezone to get current date/time, e.g.: Asia/Shanghai", + } + }, + "required": ["timezone"], + } + } + + If you choose to call only this function, reply in this format: + <{start_tag}={function_name}>{parameters}{end_tag} + where: + + start_tag => ` JSON dictionary with parameter names as keys + end_tag => `` + + Example: + {"param": "value"} + + Note: + - Function call must follow specified format + - Required parameters must be specified + - Only one function can be called at a time + - Place entire function call response on a single line + + You are an AI assistant. Answer the following question. + """ + + structural_tag_param = { + "temperature": 1, + "max_tokens": 1024, + "messages": [ + { + "role": "system", + "content": content_str, + }, + { + "role": "user", + "content": "You're traveling to Shanghai today", + }, + ], + "response_format": { + "type": "structural_tag", + "structures": [ + { + "begin": "", + "schema": { + "type": "object", + "properties": { + "timezone": { + "type": "string", + "description": "Timezone to get current date/time, e.g.: Asia/Shanghai", + } + }, + "required": ["timezone"], + }, + "end": "", + } + ], + "triggers": ["" text "" + + style_attribute ::= " style=" dq style_value dq + + style_value ::= (font_style ("; " font_weight)?) | (font_weight ("; " font_style)?) 
+ + font_style ::= "font-family: '" font_name "'" + + font_weight ::= "font-weight: " weight_value + + font_name ::= "Arial" | "Times New Roman" | "Courier New" + + weight_value ::= "normal" | "bold" + + text ::= [A-Za-z0-9 ]+ + + dq ::= ["] + """ + + grammar_param = { + "temperature": 1, + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": "Generate HTML code for this heading in bold Times New Roman font: ERNIE Bot", + } + ], + "extra_body": {"guided_grammar": html_h1_grammar}, + } + + import re + + pattern = r'^[A-Za-z0-9 ]+$' + response = test_streaming_chat_base(openai_client, grammar_param) + assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected" + response = test_non_streaming_chat_base(openai_client, grammar_param) + assert re.fullmatch(pattern, response), f"grammar non_streaming response: {response} is not as expected" diff --git a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index fb31a655f8..dc1e906283 100644 --- a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -118,6 +118,8 @@ def setup_and_run_server(): "wint4", "--reasoning-parser", "ernie-45-vl", + "--guided-decoding-backend", + "auto", ] # Start subprocess in new process group @@ -535,3 +537,342 @@ def test_chat_with_thinking(openai_client, capsys): total_tokens += len(delta_message.completion_token_ids) assert completion_tokens + reasoning_tokens == total_tokens assert reasoning_tokens <= reasoning_max_tokens + + +def test_streaming_chat_base(openai_client, chat_param): + """ + Test streaming chat base functionality with the local service + """ + assert isinstance(chat_param, dict), f"{chat_param} should be a dict" + assert "messages" in chat_param, f"{chat_param} should contain messages" + + response = openai_client.chat.completions.create( + model="default", + stream=True, + **chat_param, + ) + + output = [] + for chunk in response: + if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"): + output.append(chunk.choices[0].delta.content) + assert len(output) > 2 + return "".join(output) + + +def test_non_streaming_chat_base(openai_client, chat_param): + """ + Test non streaming chat base functionality with the local service + """ + assert isinstance(chat_param, dict), f"{chat_param} should be a dict" + assert "messages" in chat_param, f"{chat_param} should contain messages" + + response = openai_client.chat.completions.create( + model="default", + stream=False, + **chat_param, + ) + + assert hasattr(response, "choices") + assert len(response.choices) > 0 + assert hasattr(response.choices[0], "message") + assert hasattr(response.choices[0].message, "content") + return response.choices[0].message.content + + +def test_structured_outputs_json_schema(openai_client): + """ + Test structured outputs json_schema functionality with the local service + """ + chat_param = { + "temperature": 1, + "max_tokens": 1024, + } + + # json_object + json_chat_param = { + "messages": [ + {"role": "system", "content": "You are a helpful AI assistant."}, + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", + "detail": "high", + }, + }, + {"type": "text", "text": "请描述图片内容,使用json格式输出结果"}, + ], + }, + ], + "response_format": {"type": "json_object"}, + } + json_chat_param.update(chat_param) + + outputs = [] + 
outputs.append(test_streaming_chat_base(openai_client, json_chat_param)) + outputs.append(test_non_streaming_chat_base(openai_client, json_chat_param)) + + json_chat_param["extra_body"] = {"chat_template_kwargs": {"enable_thinking": False}} + outputs.append(test_streaming_chat_base(openai_client, json_chat_param)) + outputs.append(test_non_streaming_chat_base(openai_client, json_chat_param)) + + for response in outputs: + try: + json.loads(response) + is_valid = True + except ValueError: + is_valid = False + + assert is_valid, f"json_object response: {response} is not a valid json" + + # json_schema + from enum import Enum + + from pydantic import BaseModel + + class BookType(str, Enum): + romance = "Romance" + historical = "Historical" + adventure = "Adventure" + mystery = "Mystery" + dystopian = "Dystopian" + + class BookDescription(BaseModel): + author: str + title: str + genre: BookType + + json_schema_param = { + "messages": [ + { + "role": "user", + "content": "Generate a JSON describing a literary work, including author, title and book type.", + } + ], + "response_format": { + "type": "json_schema", + "json_schema": {"name": "book-description", "schema": BookDescription.model_json_schema()}, + }, + } + json_schema_param.update(chat_param) + response = test_streaming_chat_base(openai_client, json_schema_param) + try: + json_schema_response = json.loads(response) + is_valid = True + except ValueError: + is_valid = False + + assert is_valid, f"json_schema streaming response: {response} is not a valid json" + assert ( + "author" in json_schema_response and "title" in json_schema_response and "genre" in json_schema_response + ), f"json_schema streaming response: {response} is not a valid book-description" + assert json_schema_response["genre"] in { + genre.value for genre in BookType + }, f"json_schema streaming response: {json_schema_response['genre']} is not a valid book-type" + + response = test_non_streaming_chat_base(openai_client, json_schema_param) + try: + json_schema_response = json.loads(response) + is_valid = True + except ValueError: + is_valid = False + + assert is_valid, f"json_schema non_streaming response: {response} is not a valid json" + assert ( + "author" in json_schema_response and "title" in json_schema_response and "genre" in json_schema_response + ), f"json_schema non_streaming response: {response} is not a valid book-description" + assert json_schema_response["genre"] in { + genre.value for genre in BookType + }, f"json_schema non_streaming response: {json_schema_response['genre']} is not a valid book-type" + + +def test_structured_outputs_structural_tag(openai_client): + """ + Test structured outputs structural_tag functionality with the local service + """ + content_str = """ + You have the following function available: + + { + "name": "get_current_date", + "description": "Get current date and time for given timezone", + "parameters": { + "type": "object", + "properties": { + "timezone": { + "type": "string", + "description": "Timezone to get current date/time, e.g.: Asia/Shanghai", + } + }, + "required": ["timezone"], + } + } + + If you choose to call only this function, reply in this format: + <{start_tag}={function_name}>{parameters}{end_tag} + where: + + start_tag => ` JSON dictionary with parameter names as keys + end_tag => `` + + Example: + {"param": "value"} + + Note: + - Function call must follow specified format + - Required parameters must be specified + - Only one function can be called at a time + - Place entire function call response on a single 
line + + You are an AI assistant. Answer the following question. + """ + + structural_tag_param = { + "temperature": 1, + "max_tokens": 1024, + "messages": [ + { + "role": "system", + "content": content_str, + }, + { + "role": "user", + "content": "You're traveling to Shanghai today", + }, + ], + "response_format": { + "type": "structural_tag", + "structures": [ + { + "begin": "", + "schema": { + "type": "object", + "properties": { + "timezone": { + "type": "string", + "description": "Timezone to get current date/time, e.g.: Asia/Shanghai", + } + }, + "required": ["timezone"], + }, + "end": "", + } + ], + "triggers": ["" text "" + + style_attribute ::= " style=" dq style_value dq + + style_value ::= (font_style ("; " font_weight)?) | (font_weight ("; " font_style)?) + + font_style ::= "font-family: '" font_name "'" + + font_weight ::= "font-weight: " weight_value + + font_name ::= "Arial" | "Times New Roman" | "Courier New" + + weight_value ::= "normal" | "bold" + + text ::= [A-Za-z0-9 ]+ + + dq ::= ["] + """ + + grammar_param = { + "temperature": 1, + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": "Generate HTML code for this heading in bold Times New Roman font: ERNIE Bot", + } + ], + "extra_body": {"guided_grammar": html_h1_grammar}, + } + + import re + + pattern = r'^[A-Za-z0-9 ]+$' + response = test_streaming_chat_base(openai_client, grammar_param) + assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected" + response = test_non_streaming_chat_base(openai_client, grammar_param) + assert re.fullmatch(pattern, response), f"grammar non_streaming response: {response} is not as expected" From 3ff2a4def367edd5ae76203eb4a462f990f4e41b Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 8 Aug 2025 17:41:42 +0800 Subject: [PATCH 10/20] add ci install xgrammar --- scripts/run_pre_ce.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/run_pre_ce.sh b/scripts/run_pre_ce.sh index 726b91e857..4ffd041ef9 100644 --- a/scripts/run_pre_ce.sh +++ b/scripts/run_pre_ce.sh @@ -7,6 +7,7 @@ python -m pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/p python -m pip install -r requirements.txt python -m pip install jsonschema aistudio_sdk==0.3.5 +python -m pip install xgrammar==0.1.19 failed_files=() run_path="$DIR/../test/ci_use/" From 83df9a4b49d3e75daccaf48311c96500525f9013 Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 8 Aug 2025 19:21:54 +0800 Subject: [PATCH 11/20] add ci timeout time --- scripts/run_pre_ce.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_pre_ce.sh b/scripts/run_pre_ce.sh index 4ffd041ef9..b2f2564d46 100644 --- a/scripts/run_pre_ce.sh +++ b/scripts/run_pre_ce.sh @@ -25,7 +25,7 @@ for subdir in "$run_path"*/; do echo "------------------------------------------------------------" set +e - timeout 600 python -m pytest --disable-warnings -sv "$file" + timeout 1200 python -m pytest --disable-warnings -sv "$file" exit_code=$? 
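The json_schema tests added above check the decoded output by hand (json.loads plus key-membership asserts). Since the schema is generated from a pydantic model, the same check can also be expressed as a single validation call; the sketch below assumes pydantic v2 (model_validate_json), reuses the BookDescription/BookType definitions from the tests, and uses a made-up sample response string.

```python
# Sketch only: validate a structured-output response against the pydantic model
# that produced the JSON schema used in the request.
from enum import Enum
from pydantic import BaseModel, ValidationError

class BookType(str, Enum):
    romance = "Romance"
    historical = "Historical"
    adventure = "Adventure"
    mystery = "Mystery"
    dystopian = "Dystopian"

class BookDescription(BaseModel):
    author: str
    title: str
    genre: BookType

# Hypothetical model response, for illustration only.
response = '{"author": "Agatha Christie", "title": "And Then There Were None", "genre": "Mystery"}'

try:
    book = BookDescription.model_validate_json(response)  # parses and type-checks in one step
except ValidationError as e:
    raise AssertionError(f"json_schema response is not a valid BookDescription: {e}")
print(book.author, book.genre.value)
```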
set -e From 9a41035b9720ad1f8c26ea5fa72ef3390e44caca Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 11 Aug 2025 14:00:57 +0800 Subject: [PATCH 12/20] update test for structured_outputs --- scripts/run_pre_ce.sh | 2 +- test/ci_use/EB_Lite/test_EB_Lite_serving.py | 28 ++++++++-------- .../EB_VL_Lite/test_EB_VL_Lite_serving.py | 32 +++++++++---------- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/scripts/run_pre_ce.sh b/scripts/run_pre_ce.sh index b2f2564d46..4ffd041ef9 100644 --- a/scripts/run_pre_ce.sh +++ b/scripts/run_pre_ce.sh @@ -25,7 +25,7 @@ for subdir in "$run_path"*/; do echo "------------------------------------------------------------" set +e - timeout 1200 python -m pytest --disable-warnings -sv "$file" + timeout 600 python -m pytest --disable-warnings -sv "$file" exit_code=$? set -e diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py index b08821fd89..9c1689fcb3 100644 --- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -944,7 +944,7 @@ def test_streaming_completion_with_bad_words(openai_client, capsys): assert output_0 not in output_1 -def test_streaming_chat_base(openai_client, chat_param): +def streaming_chat_base(openai_client, chat_param): """ Test streaming chat base functionality with the local service """ @@ -965,7 +965,7 @@ def test_streaming_chat_base(openai_client, chat_param): return "".join(output) -def test_non_streaming_chat_base(openai_client, chat_param): +def non_streaming_chat_base(openai_client, chat_param): """ Test non streaming chat base functionality with the local service """ @@ -1006,7 +1006,7 @@ def test_structured_outputs_json_schema(openai_client): } json_chat_param.update(chat_param) - response = test_streaming_chat_base(openai_client, json_chat_param) + response = streaming_chat_base(openai_client, json_chat_param) try: json.loads(response) is_valid = True @@ -1015,7 +1015,7 @@ def test_structured_outputs_json_schema(openai_client): assert is_valid, f"json_schema streaming response: {response} is not a valid json" - response = test_non_streaming_chat_base(openai_client, json_chat_param) + response = non_streaming_chat_base(openai_client, json_chat_param) try: json.loads(response) is_valid = True @@ -1054,7 +1054,7 @@ class BookDescription(BaseModel): }, } json_schema_param.update(chat_param) - response = test_streaming_chat_base(openai_client, json_schema_param) + response = streaming_chat_base(openai_client, json_schema_param) try: json_schema_response = json.loads(response) is_valid = True @@ -1069,7 +1069,7 @@ class BookDescription(BaseModel): genre.value for genre in BookType }, f"json_schema streaming response: {json_schema_response['genre']} is not a valid book-type" - response = test_non_streaming_chat_base(openai_client, json_schema_param) + response = non_streaming_chat_base(openai_client, json_schema_param) try: json_schema_response = json.loads(response) is_valid = True @@ -1163,10 +1163,10 @@ def test_structured_outputs_structural_tag(openai_client): } expect_str = '{"timezone": "Asia/Shanghai"}' - response = test_streaming_chat_base(openai_client, structural_tag_param) + response = streaming_chat_base(openai_client, structural_tag_param) assert response == expect_str, f"structural_tag streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, structural_tag_param) + response = non_streaming_chat_base(openai_client, structural_tag_param) assert response == expect_str, 
f"structural_tag non_streaming response: {response} is not as expected" @@ -1183,14 +1183,14 @@ def test_structured_outputs_choice(openai_client): }, } - response = test_streaming_chat_base(openai_client, choice_param) + response = streaming_chat_base(openai_client, choice_param) assert response in [ "Ping An Finance Centre", "China Resources Headquarters", "KK100", "Diwang Mansion", ], f"choice streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, choice_param) + response = non_streaming_chat_base(openai_client, choice_param) assert response in [ "Ping An Finance Centre", "China Resources Headquarters", @@ -1217,11 +1217,11 @@ def test_structured_outputs_regex(openai_client): import re - response = test_streaming_chat_base(openai_client, regex_param) + response = streaming_chat_base(openai_client, regex_param) assert re.fullmatch( r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response ), f"regex streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, regex_param) + response = non_streaming_chat_base(openai_client, regex_param) assert re.fullmatch( r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response ), f"regex non_streaming response: {response} is not as expected" @@ -1268,7 +1268,7 @@ def test_structured_outputs_grammar(openai_client): import re pattern = r'^[A-Za-z0-9 ]+$' - response = test_streaming_chat_base(openai_client, grammar_param) + response = streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, grammar_param) + response = non_streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar non_streaming response: {response} is not as expected" diff --git a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index 8fd5cb217c..42dd91ee3e 100644 --- a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -539,7 +539,7 @@ def test_chat_with_thinking(openai_client, capsys): assert reasoning_tokens <= reasoning_max_tokens -def test_streaming_chat_base(openai_client, chat_param): +def streaming_chat_base(openai_client, chat_param): """ Test streaming chat base functionality with the local service """ @@ -560,7 +560,7 @@ def test_streaming_chat_base(openai_client, chat_param): return "".join(output) -def test_non_streaming_chat_base(openai_client, chat_param): +def non_streaming_chat_base(openai_client, chat_param): """ Test non streaming chat base functionality with the local service """ @@ -612,12 +612,12 @@ def test_structured_outputs_json_schema(openai_client): json_chat_param.update(chat_param) outputs = [] - outputs.append(test_streaming_chat_base(openai_client, json_chat_param)) - outputs.append(test_non_streaming_chat_base(openai_client, json_chat_param)) + outputs.append(streaming_chat_base(openai_client, json_chat_param)) + outputs.append(non_streaming_chat_base(openai_client, json_chat_param)) json_chat_param["extra_body"] = {"chat_template_kwargs": {"enable_thinking": False}} - outputs.append(test_streaming_chat_base(openai_client, json_chat_param)) - outputs.append(test_non_streaming_chat_base(openai_client, json_chat_param)) + outputs.append(streaming_chat_base(openai_client, json_chat_param)) + outputs.append(non_streaming_chat_base(openai_client, json_chat_param)) for response in 
outputs: try: @@ -658,7 +658,7 @@ class BookDescription(BaseModel): }, } json_schema_param.update(chat_param) - response = test_streaming_chat_base(openai_client, json_schema_param) + response = streaming_chat_base(openai_client, json_schema_param) try: json_schema_response = json.loads(response) is_valid = True @@ -673,7 +673,7 @@ class BookDescription(BaseModel): genre.value for genre in BookType }, f"json_schema streaming response: {json_schema_response['genre']} is not a valid book-type" - response = test_non_streaming_chat_base(openai_client, json_schema_param) + response = non_streaming_chat_base(openai_client, json_schema_param) try: json_schema_response = json.loads(response) is_valid = True @@ -767,10 +767,10 @@ def test_structured_outputs_structural_tag(openai_client): } expect_str = '{"timezone": "Asia/Shanghai"}' - response = test_streaming_chat_base(openai_client, structural_tag_param) + response = streaming_chat_base(openai_client, structural_tag_param) assert response == expect_str, f"structural_tag streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, structural_tag_param) + response = non_streaming_chat_base(openai_client, structural_tag_param) assert response == expect_str, f"structural_tag non_streaming response: {response} is not as expected" @@ -787,14 +787,14 @@ def test_structured_outputs_choice(openai_client): }, } - response = test_streaming_chat_base(openai_client, choice_param) + response = streaming_chat_base(openai_client, choice_param) assert response in [ "Ping An Finance Centre", "China Resources Headquarters", "KK100", "Diwang Mansion", ], f"choice streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, choice_param) + response = non_streaming_chat_base(openai_client, choice_param) assert response in [ "Ping An Finance Centre", "China Resources Headquarters", @@ -821,11 +821,11 @@ def test_structured_outputs_regex(openai_client): import re - response = test_streaming_chat_base(openai_client, regex_param) + response = streaming_chat_base(openai_client, regex_param) assert re.fullmatch( r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response ), f"regex streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, regex_param) + response = non_streaming_chat_base(openai_client, regex_param) assert re.fullmatch( r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response ), f"regex non_streaming response: {response} is not as expected" @@ -872,7 +872,7 @@ def test_structured_outputs_grammar(openai_client): import re pattern = r'^[A-Za-z0-9 ]+$' - response = test_streaming_chat_base(openai_client, grammar_param) + response = streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected" - response = test_non_streaming_chat_base(openai_client, grammar_param) + response = non_streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar non_streaming response: {response} is not as expected" From f0ea9993737c8e5b6ea17169f443d485ada1e7f8 Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 11 Aug 2025 14:20:22 +0800 Subject: [PATCH 13/20] update code --- fastdeploy/worker/gpu_model_runner.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 4fbbfb00f2..4e7f344ee6 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ 
b/fastdeploy/worker/gpu_model_runner.py @@ -272,7 +272,9 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = ) input_ids = request.prompt_token_ids + request.output_token_ids - logger.debug(f"Handle prefill request {request} at idx {idx} prefill_start_index {prefill_start_index} prefill_end_index {prefill_end_index} need_prefilled_token_num {len(input_ids)}") + logger.debug( + f"Handle prefill request {request} at idx {idx} prefill_start_index {prefill_start_index} prefill_end_index {prefill_end_index} need_prefilled_token_num {len(input_ids)}" + ) self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array( input_ids[prefill_start_index:prefill_end_index] ) From 1fe01e7a8110e07260b3dba17b689ff37bb69b58 Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 15 Aug 2025 10:22:13 +0800 Subject: [PATCH 14/20] add error traceback info --- fastdeploy/cache_manager/cache_messager.py | 3 +- .../cache_manager/cache_transfer_manager.py | 3 +- .../cache_manager/prefix_cache_manager.py | 13 +++++---- fastdeploy/engine/engine.py | 8 ++--- fastdeploy/engine/expert_service.py | 4 +-- .../engine/sched/resource_manager_v1.py | 5 ++-- fastdeploy/entrypoints/api_server.py | 3 +- fastdeploy/entrypoints/engine_client.py | 5 ++-- fastdeploy/entrypoints/llm.py | 2 +- fastdeploy/entrypoints/openai/api_server.py | 5 ++-- fastdeploy/entrypoints/openai/serving_chat.py | 26 ++++++++++++----- .../entrypoints/openai/serving_completion.py | 29 +++++++++++++++---- .../tool_parsers/ernie_x1_tool_parser.py | 19 ++++++++---- fastdeploy/input/ernie_vl_processor.py | 4 ++- .../inter_communicator/engine_cache_queue.py | 3 +- fastdeploy/inter_communicator/zmq_client.py | 9 +++--- .../guided_decoding/base_guided_decoding.py | 3 +- .../guided_decoding/xgrammar_backend.py | 9 +++--- fastdeploy/output/token_processor.py | 4 +-- fastdeploy/platforms/cuda.py | 5 +++- fastdeploy/platforms/dcu.py | 5 +++- fastdeploy/platforms/gcu.py | 5 +++- fastdeploy/platforms/maca.py | 4 ++- fastdeploy/platforms/xpu.py | 5 +++- fastdeploy/scheduler/global_scheduler.py | 2 +- fastdeploy/scheduler/splitwise_scheduler.py | 24 ++++++++------- fastdeploy/splitwise/splitwise_connector.py | 7 +++-- fastdeploy/worker/utils.py | 3 +- test/ce/accuracy_cases/gsm8k.py | 3 +- test/ce/deploy/deploy.py | 27 ++++++++++++----- 30 files changed, 164 insertions(+), 83 deletions(-) diff --git a/fastdeploy/cache_manager/cache_messager.py b/fastdeploy/cache_manager/cache_messager.py index 456ba1c342..409941f7d8 100644 --- a/fastdeploy/cache_manager/cache_messager.py +++ b/fastdeploy/cache_manager/cache_messager.py @@ -17,6 +17,7 @@ import math import threading import time +import traceback import numpy as np import paddle @@ -309,4 +310,4 @@ def _prefill_layerwise_send_cache_thread(self): self.last_layer_idx = prefilled_layer_idx except Exception as e: - logger.error(f"prefill layerwise send cache thread has exception: {e}") + logger.error(f"prefill layerwise send cache thread has exception: {e}, {str(traceback.format_exc())}") diff --git a/fastdeploy/cache_manager/cache_transfer_manager.py b/fastdeploy/cache_manager/cache_transfer_manager.py index 34ccf144ca..5078a513dd 100644 --- a/fastdeploy/cache_manager/cache_transfer_manager.py +++ b/fastdeploy/cache_manager/cache_transfer_manager.py @@ -19,6 +19,7 @@ import json import queue import time +import traceback import numpy as np import paddle @@ -342,7 +343,7 @@ def do_data_transfer(self): if self.rank == 0: self.cache_task_queue.barrier3.reset() except Exception as e: - 
logger.info(f"do_data_transfer: error: {e}") + logger.info(f"do_data_transfer: error: {e}, {str(traceback.format_exc())}") def _transfer_data( self, diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index f033a565c9..e57f0f43b8 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -20,6 +20,7 @@ import sys import threading import time +import traceback import uuid from collections import defaultdict from concurrent.futures import ThreadPoolExecutor @@ -469,7 +470,7 @@ def update_cache_blocks(self, task, block_size): self.leaf_req_map[leaf_node].add(req_id) self.cache_info[req_id] = (leaf_node, input_ids) except Exception as e: - logger.error(f"update_cache_blocks, error: {type(e)} {e}") + logger.error(f"update_cache_blocks, error: {type(e)} {e}, {str(traceback.format_exc())}") raise e def request_match_blocks(self, task, block_size, *args): @@ -555,7 +556,7 @@ def request_match_blocks(self, task, block_size, *args): ) return common_block_ids, matched_token_num, hit_info except Exception as e: - logger.error(f"request_block_ids: error: {type(e)} {e}") + logger.error(f"request_block_ids: error: {type(e)} {e}, {str(traceback.format_exc())}") raise e def request_block_ids(self, task, block_size, dec_token_num, *args): @@ -660,7 +661,7 @@ def request_block_ids(self, task, block_size, dec_token_num, *args): ) return common_block_ids, unique_block_ids, hit_info except Exception as e: - logger.error(f"request_block_ids: error: {type(e)} {e}") + logger.error(f"request_block_ids: error: {type(e)} {e}, {str(traceback.format_exc())}") raise e def release_block_ids_async(self, task): @@ -709,7 +710,7 @@ def release_block_ids(self, task): ) return except Exception as e: - logger.error(f"release_block_ids: error: {type(e)} {e}") + logger.error(f"release_block_ids: error: {type(e)} {e}, {str(traceback.format_exc())}") raise e def _handle_free_gpu_node_without_cpu(self, node): @@ -899,7 +900,7 @@ def free_block_ids_async(self, need_block_num): else: self.gpu_free_task_future = None except Exception as e: - logger.error(f"free_block_ids_async: error: {type(e)} {e}") + logger.error(f"free_block_ids_async: error: {type(e)} {e}, {str(traceback.format_exc())}") raise e def free_cpu_block_ids(self, need_block_num): @@ -1218,5 +1219,5 @@ def recv_data_transfer_result(self): + f"task_cpu_block_id {task_cpu_block_id} event_type {event_type} done" ) except Exception as e: - logger.warning(f"recv_data_transfer_result: error: {e}") + logger.warning(f"recv_data_transfer_result: error: {e}, {str(traceback.format_exc())}") raise e diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index db3bdefffe..c3149b55d2 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -600,7 +600,7 @@ def receiver_loop(): time.sleep(0.001) except Exception as e: - llm_logger.error(f"Error in main loop: {e}") + llm_logger.error(f"Error in main loop: {e}, {str(traceback.format_exc())}") time.sleep(0.1) threading.Thread(target=receiver_loop, daemon=True).start() @@ -987,7 +987,7 @@ def _exit_sub_services(self): try: os.killpg(p.pid, signal.SIGTERM) except Exception as e: - print(f"Error extracting file: {e}") + print(f"Error extracting file: {e}, {str(traceback.format_exc())}") self.worker_ready_signal.clear() self.exist_task_signal.clear() self.exist_swapped_task_signal.clear() @@ -1000,7 +1000,7 @@ def _exit_sub_services(self): try: os.killpg(self.worker_proc.pid, 
signal.SIGTERM) except Exception as e: - print(f"Error extracting sub services: {e}") + print(f"Error extracting sub services: {e}, {str(traceback.format_exc())}") self.engine_worker_queue.cleanup() if hasattr(self, "zmq_server") and self.zmq_server is not None: @@ -1175,7 +1175,7 @@ def generate(self, prompts, stream): try: req_id = self._format_and_add_data(prompts) except Exception as e: - llm_logger.error(f"Error happend while adding request, details={e}") + llm_logger.error(f"Error happend while adding request, details={e}, {str(traceback.format_exc())}") raise EngineError(str(e), error_code=400) # Get the result of the current request diff --git a/fastdeploy/engine/expert_service.py b/fastdeploy/engine/expert_service.py index 9cf5f97f7f..2ed5f8924a 100644 --- a/fastdeploy/engine/expert_service.py +++ b/fastdeploy/engine/expert_service.py @@ -269,7 +269,7 @@ def receiver_loop(): time.sleep(0.001) continue except Exception as e: - llm_logger.error(f"get decode tasks error: {e}") + llm_logger.error(f"get decode tasks error: {e}, {str(traceback.format_exc())}") threading.Thread(target=receiver_loop, daemon=True).start() @@ -378,4 +378,4 @@ def start_expert_service(cfg, local_data_parallel_id, ipc_signal_suffix): expert_service.start(ipc_signal_suffix, local_data_parallel_id) expert_service.split_connector.start_receiver() except Exception as e: - llm_logger.exception(f"Expert service failed to start: {e}") + llm_logger.exception(f"Expert service failed to start: {e}, {str(traceback.format_exc())}") diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 26eba4ae09..ec8703ee0d 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -16,6 +16,7 @@ import threading import time +import traceback from collections import deque from collections.abc import Iterable from concurrent.futures import ThreadPoolExecutor @@ -389,7 +390,7 @@ def get_prefix_cached_blocks(self, request: Request): request.cache_prepare_time = time.time() - cache_prepare_time return True except Exception as e: - llm_logger.error(f"prefix match blocks error: {e}, waiting reschedule...") + llm_logger.error(f"prefix match blocks error: {e}, {str(traceback.format_exc())} waiting reschedule...") return False def add_request(self, request: Request) -> None: @@ -441,4 +442,4 @@ def finish_requests(self, request_ids: Union[str, Iterable[str]]): self.stop_flags[request.idx] = True del self.requests[req_id] except Exception as e: - llm_logger.error(e) + llm_logger.error(f"finish_request err: {e}, {str(traceback.format_exc())}") diff --git a/fastdeploy/entrypoints/api_server.py b/fastdeploy/entrypoints/api_server.py index f27c008314..4f4d7f2250 100644 --- a/fastdeploy/entrypoints/api_server.py +++ b/fastdeploy/entrypoints/api_server.py @@ -15,6 +15,7 @@ """ import json +import traceback import uvicorn from fastapi import FastAPI @@ -114,7 +115,7 @@ def launch_api_server(args) -> None: log_level="info", ) # set log level to error to avoid log except Exception as e: - api_server_logger.error(f"launch sync http server error, {e}") + api_server_logger.error(f"launch sync http server error, {e}, {str(traceback.format_exc())}") def main(): diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index daed93b8f9..cf1ebdd297 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -15,6 +15,7 @@ """ import time +import traceback import uuid 
import numpy as np @@ -141,7 +142,7 @@ def add_requests(self, task): work_process_metrics.prompt_tokens_total.inc(input_ids_len) work_process_metrics.request_prompt_tokens.observe(input_ids_len) except Exception as e: - api_server_logger.error(e) + api_server_logger.error(f"add_requests error: {e}, {str(traceback.format_exc())}") raise EngineError(str(e), error_code=400) if input_ids_len + min_tokens >= self.max_model_len: @@ -194,7 +195,7 @@ def add_requests(self, task): else: self.zmq_client.send_pyobj(task) except Exception as e: - api_server_logger.error(e) + api_server_logger.error(f"zmq_client send task error: {e}, {str(traceback.format_exc())}") raise EngineError(str(e), error_code=400) def vaild_parameters(self, data): diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index 001cfad3e0..dd48e6d00e 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -341,7 +341,7 @@ def _build_sample_logprobs(self, logprobs_lists: LogprobsLists, topk_logprobs: i return result except Exception as e: - llm_logger.error(f"Error building sample logprobs from LogprobsLists: {e}") + llm_logger.error(f"Error building sample logprobs from LogprobsLists: {e}, {str(traceback.format_exc())}") def _run_engine(self, req_ids: list[str], use_tqdm: bool, topk_logprobs: Optional[int] = None): """ diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index 2a4c0e7aba..6a5355f102 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -18,6 +18,7 @@ import os import threading import time +import traceback from collections.abc import AsyncGenerator from contextlib import asynccontextmanager from multiprocessing import current_process @@ -155,7 +156,7 @@ async def lifespan(app: FastAPI): multiprocess.mark_process_dead(os.getpid()) api_server_logger.info(f"Closing metrics client pid: {pid}") except Exception as e: - api_server_logger.warning(e) + api_server_logger.warning(f"exit error: {e}, {str(traceback.format_exc())}") app = FastAPI(lifespan=lifespan) @@ -349,7 +350,7 @@ def launch_api_server() -> None: log_level="info", ) # set log level to error to avoid log except Exception as e: - api_server_logger.error(f"launch sync http server error, {e}") + api_server_logger.error(f"launch sync http server error, {e}, {str(traceback.format_exc())}") metrics_app = FastAPI() diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index b14f28e627..91751fd1c0 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -92,7 +92,9 @@ async def create_chat_completion(self, request: ChatCompletionRequest): if isinstance(prompt_token_ids, np.ndarray): prompt_token_ids = prompt_token_ids.tolist() except Exception as e: - return ErrorResponse(code=400, message=str(e)) + error_msg = f"request[{request_id}] send to infer error: {str(e)}, {str(traceback.format_exc())}" + api_server_logger.error(error_msg) + return ErrorResponse(code=400, message=error_msg) del current_req_dict try: @@ -101,8 +103,13 @@ async def create_chat_completion(self, request: ChatCompletionRequest): await self.engine_client.semaphore.acquire() else: await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time) - except Exception: - return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}") + except Exception as e: + error_msg = ( + 
f"request[{request_id}] waiting error: {str(e)}, {str(traceback.format_exc())}, " + f"max waiting time: {self.max_waiting_time}" + ) + api_server_logger.error(error_msg) + return ErrorResponse(code=408, message=error_msg) if request.stream: return self.chat_completion_stream_generator( @@ -114,9 +121,12 @@ async def create_chat_completion(self, request: ChatCompletionRequest): request, request_id, request.model, prompt_token_ids, text_after_process ) except Exception as e: - return ErrorResponse(code=400, message=str(e)) + error_msg = f"request[{request_id}] generator error: {str(e)}, {str(traceback.format_exc())}" + api_server_logger.error(error_msg) + return ErrorResponse(code=400, message=error_msg) def _create_streaming_error_response(self, message: str) -> str: + api_server_logger.error(message) error_response = ErrorResponse( code=400, message=message, @@ -334,7 +344,9 @@ async def chat_completion_stream_generator( yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" except Exception as e: - error_data = self._create_streaming_error_response(str(e)) + error_data = self._create_streaming_error_response( + f"equest[{request_id}] generate stream error: {str(e)}, {str(traceback.format_exc())}" + ) yield f"data: {error_data}\n\n" finally: dealer.close() @@ -553,6 +565,6 @@ def _build_logprobs_response( return LogProbs(content=[sampled_entry]) except Exception as e: - api_server_logger.error("Error in _build_logprobs_response: %s", e) - api_server_logger.error(traceback.format_exc()) + error_msg = f"Error in _build_logprobs_response: {e}, {str(traceback.format_exc())}" + api_server_logger.error(error_msg) return None diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index a6aadcf060..896fb6aa32 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -16,6 +16,7 @@ import asyncio import time +import traceback import uuid from typing import List, Optional @@ -92,7 +93,9 @@ async def create_completion(self, request: CompletionRequest): else: raise ValueError("Prompt must be a string, a list of strings or a list of integers.") except Exception as e: - return ErrorResponse(message=str(e), code=400) + error_msg = f"OpenAIServingCompletion create_completion: {e}, {str(traceback.format_exc())}" + api_server_logger.error(error_msg) + return ErrorResponse(message=error_msg, code=400) if request_prompt_ids is not None: request_prompts = request_prompt_ids @@ -113,6 +116,8 @@ async def create_completion(self, request: CompletionRequest): text_after_process_list.append(current_req_dict.get("text_after_process")) prompt_batched_token_ids.append(prompt_token_ids) except Exception as e: + error_msg = f"OpenAIServingCompletion format error: {e}, {str(traceback.format_exc())}" + api_server_logger.error(error_msg) return ErrorResponse(message=str(e), code=400) del current_req_dict @@ -122,8 +127,13 @@ async def create_completion(self, request: CompletionRequest): await self.engine_client.semaphore.acquire() else: await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time) - except Exception: - return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}") + except Exception as e: + error_msg = ( + f"OpenAIServingCompletion waiting error: {e}, {str(traceback.format_exc())}, " + f"max waiting time: {self.max_waiting_time}" + ) + api_server_logger.error(error_msg) + return ErrorResponse(code=408, 
message=error_msg) if request.stream: return self.completion_stream_generator( @@ -147,10 +157,16 @@ async def create_completion(self, request: CompletionRequest): text_after_process_list=text_after_process_list, ) except Exception as e: - return ErrorResponse(code=400, message=str(e)) + error_msg = ( + f"OpenAIServingCompletion completion_full_generator error: {e}, {str(traceback.format_exc())}" + ) + api_server_logger.error(error_msg) + return ErrorResponse(code=400, message=error_msg) except Exception as e: - return ErrorResponse(message=str(e), code=400) + error_msg = f"OpenAIServingCompletion create_completion error: {e}, {str(traceback.format_exc())}" + api_server_logger.error(error_msg) + return ErrorResponse(message=error_msg, code=400) async def completion_full_generator( self, @@ -422,6 +438,7 @@ async def completion_stream_generator( choices = [] except Exception as e: + api_server_logger.error(f"Error in completion_stream_generator: {e}, {str(traceback.format_exc())}") yield f"data: {ErrorResponse(message=str(e), code=400).model_dump_json(exclude_unset=True)}\n\n" finally: del request @@ -607,5 +624,5 @@ def _build_logprobs_response( ) except Exception as e: - api_server_logger.error("Error in _build_logprobs_response: %s", e) + api_server_logger.error(f"Error in _build_logprobs_response: {str(e)}, {str(traceback.format_exc())}") return None diff --git a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py index cec1f68401..6f0534cf1e 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py @@ -14,6 +14,7 @@ import json import re +import traceback import uuid from collections.abc import Sequence from typing import Union @@ -162,10 +163,12 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) } ) except Exception as e: - data_processor_logger.debug(f"Failed to parse tool call: {str(e)}") + data_processor_logger.error( + f"Failed to parse tool call: {str(e)}, {str(traceback.format_exc())}" + ) continue except Exception as e: - data_processor_logger.debug(f"Failed to parse tool call: {str(e)}") + data_processor_logger.error(f"Failed to parse tool call: {str(e)}, {str(traceback.format_exc())}") continue if not function_call_arr: @@ -211,7 +214,9 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) ) except Exception as e: - data_processor_logger.error(f"Error in extracting tool call from response: {str(e)}") + data_processor_logger.error( + f"Error in extracting tool call from response: {str(e)}, {str(traceback.format_exc())}" + ) return ExtractedToolCallInformation(tools_called=False, tool_calls=None, content=model_output) def extract_tool_calls_streaming( @@ -302,7 +307,9 @@ def extract_tool_calls_streaming( self.streamed_args_for_tool[self.current_tool_id] = args_json return delta except Exception as e: - data_processor_logger.debug(f"Partial arguments parsing: {str(e)}") + data_processor_logger.error( + f"Partial arguments parsing: {str(e)}, {str(traceback.format_exc())}" + ) if "" in self.buffer: end_pos = self.buffer.find("") @@ -316,5 +323,7 @@ def extract_tool_calls_streaming( return delta except Exception as e: - data_processor_logger.error(f"Error in streaming tool call extraction: {str(e)}") + data_processor_logger.error( + f"Error in streaming tool call extraction: {str(e)}, {str(traceback.format_exc())}" + ) return None diff --git 
a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py index e8239f7adb..11472fe7aa 100644 --- a/fastdeploy/input/ernie_vl_processor.py +++ b/fastdeploy/input/ernie_vl_processor.py @@ -14,6 +14,8 @@ # limitations under the License. """ +import traceback + import numpy as np from paddleformers.generation import GenerationConfig @@ -151,7 +153,7 @@ def _parse_processor_kwargs(self, kwargs): return kwargs except Exception as e: - data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}") + data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}, {str(traceback.format_exc())}") return {} def _parse_limits(self, limits): diff --git a/fastdeploy/inter_communicator/engine_cache_queue.py b/fastdeploy/inter_communicator/engine_cache_queue.py index 03fae97d7d..6f56550386 100644 --- a/fastdeploy/inter_communicator/engine_cache_queue.py +++ b/fastdeploy/inter_communicator/engine_cache_queue.py @@ -16,6 +16,7 @@ import threading import time +import traceback from multiprocessing.managers import ( AcquirerProxy, BaseManager, @@ -275,5 +276,5 @@ def empty(self): try: return len(self.transfer_task_queue) == 0 except Exception as e: - logger.error(f"empty function meets error: {e}") + logger.error(f"empty function meets error: {e}, {str(traceback.format_exc())}") raise e diff --git a/fastdeploy/inter_communicator/zmq_client.py b/fastdeploy/inter_communicator/zmq_client.py index 05e55929dd..2703efe3a4 100644 --- a/fastdeploy/inter_communicator/zmq_client.py +++ b/fastdeploy/inter_communicator/zmq_client.py @@ -17,6 +17,7 @@ import os import threading import time +import traceback import msgpack import zmq @@ -135,7 +136,7 @@ def send_multipart(self, req_id, data): llm_logger.debug(f"send_multipart result: {req_id} len {len(data)} elapse: {time.time()-start_send}") except Exception as e: - llm_logger.error(f"Send result to zmq client failed: {e}") + llm_logger.error(f"Send result to zmq client failed: {e}, {str(traceback.format_exc())}") if data[-1].finished: with self.mutex: @@ -155,7 +156,7 @@ def receive_json_once(self, block=False): return None, None except Exception as e: self.close() - llm_logger.warning(f"{e}") + llm_logger.warning(f"{e}, {str(traceback.format_exc())}") return str(e), None def receive_pyobj_once(self, block=False): @@ -171,7 +172,7 @@ def receive_pyobj_once(self, block=False): return None, None except Exception as e: self.close() - llm_logger.warning(f"{e}") + llm_logger.warning(f"{e}, {str(traceback.format_exc())}") return str(e), None def _clear_ipc(self, name): @@ -206,7 +207,7 @@ def close(self): self._clear_ipc(self.file_name) self._clear_ipc(self.router_path) except Exception as e: - llm_logger.warning(f"Failed to close ZMQ connection - {e}") + llm_logger.warning(f"Failed to close ZMQ connection - {e}, {str(traceback.format_exc())}") return def __exit__(self, exc_type, exc_val, exc_tb): diff --git a/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py b/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py index 7baf2fe971..b23d0c85d8 100644 --- a/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py +++ b/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py @@ -15,6 +15,7 @@ """ import os +import traceback from concurrent.futures import ThreadPoolExecutor from fastdeploy.config import ErnieArchitectures, FDConfig @@ -300,7 +301,7 @@ def _get_tokenizer_hf(self): return tokenizer except Exception as e: - raise Exception(f"Fail to initialize hf tokenizer: {e}") + raise 
Exception(f"Fail to initialize hf tokenizer: {e}, {str(traceback.format_exc())}") def add_cache(self, schemata_key: tuple[str, str], processor: LogitsProcessorBase) -> None: """ diff --git a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py index f702a1085e..0d448d4293 100644 --- a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py +++ b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py @@ -16,6 +16,7 @@ import json import re +import traceback from typing import Any, List, Optional import paddle @@ -263,7 +264,7 @@ def _json_processor(self, schemata: str) -> Optional[XGrammarProcessor]: try: compiled_grammar = self.grammar_compiler.compile_json_schema(schemata, any_whitespace=self.any_whitespace) except Exception as e: - llm_logger.error(f"Failed to compile json schema: {e}") + llm_logger.error(f"Failed to compile json schema: {e}, {str(traceback.format_exc())}") return None return self._create_processor(compiled_grammar) @@ -280,7 +281,7 @@ def _regex_processor(self, schemata: str) -> Optional[XGrammarProcessor]: try: compiled_grammar = self.grammar_compiler.compile_regex(schemata) except Exception as e: - llm_logger.error(f"Failed to compile regex schema: {e}") + llm_logger.error(f"Failed to compile regex schema: {e}, {str(traceback.format_exc())}") return None return self._create_processor(compiled_grammar) @@ -297,7 +298,7 @@ def _grammar_processor(self, schemata: str) -> Optional[XGrammarProcessor]: try: compiled_grammar = self.grammar_compiler.compile_grammar(schemata) except Exception as e: - llm_logger.error(f"Failed to compile ebnf schema: {e}") + llm_logger.error(f"Failed to compile ebnf schema: {e}, {str(traceback.format_exc())}") return None return self._create_processor(compiled_grammar) @@ -324,7 +325,7 @@ def _structural_tag_processor(self, schemata: str) -> Optional[XGrammarProcessor compiled_grammar = self.grammar_compiler.compile_structural_tag(tags, structural_tag["triggers"]) except Exception as e: - llm_logger.error(f"Failed to compile structural tags schema: {e}") + llm_logger.error(f"Failed to compile structural tags schema: {e}, {str(traceback.format_exc())}") return None return self._create_processor(compiled_grammar) diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index ebb64cebc7..36ab0c362b 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -201,7 +201,7 @@ def process_metrics(): self.prefill_time_signal.value[current_index] = 0 current_index += 1 except Exception as e: - llm_logger.error(f"Error processing prefill metrics: {e}") + llm_logger.error(f"Error processing prefill metrics: {e}, {str(traceback.format_exc())}") self.executor.submit(process_metrics) @@ -215,7 +215,7 @@ def postprocess(self, batch_result): try: self.cached_generated_tokens.put_results(batch_result) except Exception as e: - llm_logger.error(f"Error in TokenProcessor's postprocess: {e}") + llm_logger.error(f"Error in TokenProcessor's postprocess: {e}, {str(traceback.format_exc())}") def _recycle_resources(self, task_id, index, task, result=None, is_prefill=False): """ diff --git a/fastdeploy/platforms/cuda.py b/fastdeploy/platforms/cuda.py index 6676d3c0f5..38504134a1 100644 --- a/fastdeploy/platforms/cuda.py +++ b/fastdeploy/platforms/cuda.py @@ -14,6 +14,8 @@ # limitations under the License. 
""" +import traceback + import paddle from fastdeploy.utils import console_logger as logger @@ -40,7 +42,8 @@ def available(self): logger.warning( "You are using GPU version PaddlePaddle, but there is no GPU " "detected on your machine. Maybe CUDA devices is not set properly." - f"\n Original Error is {e}" + f"\n Original Error is {e}, " + f"{str(traceback.format_exc())}" ) return False diff --git a/fastdeploy/platforms/dcu.py b/fastdeploy/platforms/dcu.py index bfd848335c..c18c45aca4 100644 --- a/fastdeploy/platforms/dcu.py +++ b/fastdeploy/platforms/dcu.py @@ -14,6 +14,8 @@ """ dcu platform file """ +import traceback + import paddle from paddleformers.utils.log import logger @@ -39,7 +41,8 @@ def available(self): logger.warning( "You are using GPU version PaddlePaddle, but there is no GPU " "detected on your machine. Maybe CUDA devices is not set properly." - f"\n Original Error is {e}" + f"\n Original Error is {e}, " + f"{str(traceback.format_exc())}" ) return False diff --git a/fastdeploy/platforms/gcu.py b/fastdeploy/platforms/gcu.py index e812113e1e..76bb170b54 100644 --- a/fastdeploy/platforms/gcu.py +++ b/fastdeploy/platforms/gcu.py @@ -14,6 +14,8 @@ # limitations under the License. """ +import traceback + import paddle from fastdeploy.utils import console_logger as logger @@ -40,7 +42,8 @@ def available(self): logger.warning( "You are using GCUPlatform, but there is no GCU " "detected on your machine. Maybe GCU devices is not set properly." - f"\n Original Error is {e}" + f"\n Original Error is {e}, " + f"{str(traceback.format_exc())}" ) return False diff --git a/fastdeploy/platforms/maca.py b/fastdeploy/platforms/maca.py index f695a3d01a..250cebf6e1 100644 --- a/fastdeploy/platforms/maca.py +++ b/fastdeploy/platforms/maca.py @@ -17,6 +17,7 @@ """ maca platform file """ +import traceback import paddle from paddleformers.utils.log import logger @@ -43,7 +44,8 @@ def available(self): logger.warning( "You are using GPU version PaddlePaddle, but there is no GPU " "detected on your machine. Maybe CUDA devices is not set properly." - f"\n Original Error is {e}" + f"\n Original Error is {e}, " + f"{str(traceback.format_exc())}" ) return False diff --git a/fastdeploy/platforms/xpu.py b/fastdeploy/platforms/xpu.py index 2f31107423..8bc8236359 100644 --- a/fastdeploy/platforms/xpu.py +++ b/fastdeploy/platforms/xpu.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import traceback + import paddle from fastdeploy.utils import console_logger as logger @@ -38,7 +40,8 @@ def available(self): logger.warning( "You are using XPU version PaddlePaddle, but there is no XPU " "detected on your machine. Maybe CUDA devices is not set properly." 
- f"\n Original Error is {e}" + f"\n Original Error is {e}, " + f"{str(traceback.format_exc())}" ) return False diff --git a/fastdeploy/scheduler/global_scheduler.py b/fastdeploy/scheduler/global_scheduler.py index 8d9b67a6a8..f3962992cc 100644 --- a/fastdeploy/scheduler/global_scheduler.py +++ b/fastdeploy/scheduler/global_scheduler.py @@ -237,7 +237,7 @@ def _keep_alive(self): ) time.sleep(self.keep_alive_duration / 2) except Exception as e: - scheduler_logger.error(f"Scheduler keep alive failed: {e}") + scheduler_logger.error(f"Scheduler keep alive failed: {e}, {str(traceback.format_exc())}") time.sleep(min(3, self.keep_alive_duration / 4)) def _scheduler_name_from_request_queue(self, request_queue: str) -> str: diff --git a/fastdeploy/scheduler/splitwise_scheduler.py b/fastdeploy/scheduler/splitwise_scheduler.py index 61dbd22309..ab1799f440 100644 --- a/fastdeploy/scheduler/splitwise_scheduler.py +++ b/fastdeploy/scheduler/splitwise_scheduler.py @@ -20,6 +20,7 @@ import random import threading import time +import traceback from collections import deque from typing import List @@ -379,7 +380,7 @@ def run(self): if total == 0: time.sleep(0.01) except Exception as e: - logger.error(f"ResultsReader{self.idx} sync results error: {e!s}") + logger.error(f"ResultsReader{self.idx} sync results error: {e!s}, {str(traceback.format_exc())}") def sync_results(self, keys): """ @@ -402,7 +403,7 @@ def sync_results(self, keys): result = RequestOutput.from_dict(data) self.data.appendleft(result) except Exception as e: - logger.error(f"Parse Result Error:{e}, {result}") + logger.error(f"Parse Result Error:{e}, {str(traceback.format_exc())}, {result}") return total @@ -498,7 +499,7 @@ def loop_schedule(self): except IndexError: continue except Exception as e: - logger.error(f"APIScheduler Schedule req error: {e!s}") + logger.error(f"APIScheduler Schedule req error: {e!s}, {str(traceback.format_exc())}") def schedule(self, req, pnodes, dnodes, mnodes, group=""): """ @@ -573,8 +574,8 @@ def loop_clear_expired_nodes(self): # logger.info(f"clear expired nodes: {nodeid}") self.client.hdel(self.cluster_key, nodeid) time.sleep(self.clear_expired_nodes_period) - except Exception: - logger.error("APIScheduler clear expired nodes error: {str(e)}") + except Exception as e: + logger.error(f"APIScheduler clear expired nodes error: {str(e)}, {str(traceback.format_exc())}") def select_pd(self, req, nodes, role): """ @@ -664,7 +665,7 @@ def run(self): # e = time.time() # logger.info(f"Lpush {self.idx}: {key} used {e-s} {len(items)} items") except Exception as e: - logger.error(f"ResultWriter write error: {e!s}") + logger.error(f"ResultWriter write error: {e!s}, {str(traceback.format_exc())}") class InferScheduler: @@ -723,7 +724,7 @@ def routine_report(self): self.client.hset(self.cluster_key, self.nodeid, info) time.sleep(self.sync_period / 1000.0) except Exception as e: - logger.error(f"InferScheduler routine report error: {e!s}") + logger.error(f"InferScheduler routine report error: {e!s}, {str(traceback.format_exc())}") def loop_expire_reqs(self): """ @@ -733,8 +734,8 @@ def loop_expire_reqs(self): try: self.node.expire_reqs(self.release_load_expire_period) time.sleep(60) - except Exception: - logger.error("InferScheduler expire reqs error: {e}") + except Exception as e: + logger.error(f"InferScheduler expire reqs error: {e}, {str(traceback.format_exc())}") def loop_get_reqs(self): """ @@ -772,7 +773,7 @@ def select_writer(req): else: self.node.add_req(req.request_id, 1) except Exception as e: - 
logger.error(f"InferScheduler loop get reqs error: {e!s}") + logger.error(f"InferScheduler loop get reqs error: {e!s}, {str(traceback.format_exc())}") def get_requests( self, @@ -807,7 +808,8 @@ def get_requests( return reqs # logger.info(f"Get Requests from Scheduler: {req.request_id}") reqs.append(req) - except Exception: + except Exception as e: + logger.error(f"InferScheduler get requests error: {e}, {str(traceback.format_exc())}") return reqs return reqs diff --git a/fastdeploy/splitwise/splitwise_connector.py b/fastdeploy/splitwise/splitwise_connector.py index 6b4c8ce04d..8924c00f56 100644 --- a/fastdeploy/splitwise/splitwise_connector.py +++ b/fastdeploy/splitwise/splitwise_connector.py @@ -16,6 +16,7 @@ import json import time +import traceback from concurrent.futures import ThreadPoolExecutor from typing import Dict @@ -97,7 +98,7 @@ def start_receiver(self): time.sleep(0.001) except Exception as e: - logger.error(f"Receiver error: {e}") + logger.error(f"Receiver error: {e}, {str(traceback.format_exc())}") time.sleep(1) def _get_push_socket(self, addr): @@ -152,7 +153,7 @@ def _send_message(self, addr, msg_type: str, payload): except zmq.Again: logger.warning(f"Send queue full for {addr}") except Exception as e: - logger.error(f"Send to {addr} failed: {e}") + logger.error(f"Send to {addr} failed: {e}, {str(traceback.format_exc())}") self._close_connection(addr) except Exception as e: @@ -433,7 +434,7 @@ def _process_message(self, message: bytes): self.engine_worker_queue.put_cache_info(payload) except Exception as e: - logger.error(f"Message processing failed: {e}") + logger.error(f"Message processing failed: {e}, {str(traceback.format_exc())}") def _handle_prefill(self, tasks): """ diff --git a/fastdeploy/worker/utils.py b/fastdeploy/worker/utils.py index bf727c3bbf..7554c7c08a 100644 --- a/fastdeploy/worker/utils.py +++ b/fastdeploy/worker/utils.py @@ -15,6 +15,7 @@ """ import os +import traceback def check_safetensors_model(model_dir: str): @@ -45,5 +46,5 @@ def check_safetensors_model(model_dir: str): sum(flags) == safetensors_num ), f"Number of safetensor files should be {len(model_files)}, but now it's {sum(flags)}" except Exception as e: - raise Exception(f"Failed to check unified checkpoint, details: {e}.") + raise Exception(f"Failed to check unified checkpoint, details: {e}, {str(traceback.format_exc())}.") return is_safetensors diff --git a/test/ce/accuracy_cases/gsm8k.py b/test/ce/accuracy_cases/gsm8k.py index f156f58c7f..b02e4c9f1a 100644 --- a/test/ce/accuracy_cases/gsm8k.py +++ b/test/ce/accuracy_cases/gsm8k.py @@ -6,6 +6,7 @@ import os import re +import traceback from concurrent.futures import ThreadPoolExecutor, as_completed from urllib.parse import urlparse, urlunparse @@ -120,7 +121,7 @@ def query_model(prompt): ) return response.choices[0].message.content.strip() except Exception as e: - return f"[Error] {e}" + return f"[Error] {e}, {str(traceback.format_exc())}" # ========== 评估函数 ========== diff --git a/test/ce/deploy/deploy.py b/test/ce/deploy/deploy.py index aa305360b8..50e540a997 100644 --- a/test/ce/deploy/deploy.py +++ b/test/ce/deploy/deploy.py @@ -7,6 +7,7 @@ import subprocess import sys import time +import traceback import requests import yaml @@ -175,7 +176,7 @@ def stop_server(signum=None, frame=None): # 终止进程组(包括所有子进程) os.killpg(os.getpgid(pid_port["PID"]), signal.SIGTERM) except Exception as e: - print(f"Failed to stop server: {e}") + print(f"Failed to stop server: {e}, {str(traceback.format_exc())}") for port in [FD_API_PORT, FD_ENGINE_QUEUE_PORT, 
FD_METRICS_PORT]: try: @@ -184,7 +185,7 @@ def stop_server(signum=None, frame=None): os.kill(int(pid), signal.SIGKILL) print(f"Killed process on port {port}, pid={pid}") except Exception as e: - print(f"Failed to killed process on port: {e}") + print(f"Failed to killed process on port: {e}, {str(traceback.format_exc())}") # 若log目录存在,则重命名为log_timestamp if os.path.isdir("./log"): os.rename("./log", "./log_{}".format(time.strftime("%Y%m%d%H%M%S"))) @@ -229,8 +230,10 @@ def start_service(): # 构建命令 cmd = build_command(final_config) except Exception as e: + error_msg = f"Failed to start service: {e}, {str(traceback.format_exc())}" + print(error_msg) return Response( - json.dumps({"status": "error", "message": str(e)}, ensure_ascii=False), + json.dumps({"status": "error", "message": error_msg}, ensure_ascii=False), status=500, content_type="application/json", ) @@ -264,8 +267,10 @@ def start_service(): return Response(json.dumps(json_data, ensure_ascii=False), status=200, content_type="application/json") except Exception as e: + error_msg = f"Failed to start service: {e}, {str(traceback.format_exc())}" + print(error_msg) return Response( - json.dumps({"status": "error", "message": str(e)}, ensure_ascii=False), + json.dumps({"status": "error", "message": error_msg}, ensure_ascii=False), status=500, content_type="application/json", ) @@ -295,8 +300,10 @@ def switch_service(): # 构建命令 cmd = build_command(final_config) except Exception as e: + error_msg = f"Failed to switch service: {e}, {str(traceback.format_exc())}" + print(error_msg) return Response( - json.dumps({"status": "error", "message": str(e)}, ensure_ascii=False), + json.dumps({"status": "error", "message": error_msg}, ensure_ascii=False), status=500, content_type="application/json", ) @@ -330,8 +337,10 @@ def switch_service(): return Response(json.dumps(json_data, ensure_ascii=False), status=200, content_type="application/json") except Exception as e: + error_msg = f"Failed to switch service: {e}, {str(traceback.format_exc())}" + print(error_msg) return Response( - json.dumps({"status": "error", "message": str(e)}, ensure_ascii=False), + json.dumps({"status": "error", "message": error_msg}, ensure_ascii=False), status=500, content_type="application/json", ) @@ -406,8 +415,10 @@ def get_config(): ) except Exception as e: + error_msg = f"{e}, {str(traceback.format_exc())}" + print(error_msg) return Response( - json.dumps({"message": "api_server.log解析失败,请检查log", "error": str(e)}, ensure_ascii=False), + json.dumps({"message": "api_server.log解析失败,请检查log", "error": error_msg}, ensure_ascii=False), status=500, content_type="application/json", ) @@ -447,7 +458,7 @@ def tail_file(path, lines=50): with open(path, "r", encoding="utf-8", errors="ignore") as f: return "".join(f.readlines()[-lines:]) except Exception as e: - return f"[无法读取 {path}]: {e}\n" + return f"[无法读取 {path}]: {e}, {str(traceback.format_exc())}\n" result = f"服务启动超时,耗时:[{timeout}s]\n\n" result += "==== server.log tail 50 ====\n" From eea387726132a79065d988b7197b8970e836cd2b Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 19 Aug 2025 11:22:25 +0800 Subject: [PATCH 15/20] update error msg --- fastdeploy/engine/engine.py | 8 ++++++-- fastdeploy/entrypoints/openai/serving_chat.py | 2 +- test/ce/deploy/deploy.py | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 47e1bfd323..d4cc4f6fb6 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -984,7 +984,9 @@ def _exit_sub_services(self): 
try: os.killpg(p.pid, signal.SIGTERM) except Exception as e: - print(f"Error extracting file: {e}, {str(traceback.format_exc())}") + error_msg = f"Error killing cache manager process {p.pid}: {e}, {str(traceback.format_exc())}" + print(error_msg) + llm_logger.error(error_msg) self.worker_ready_signal.clear() self.exist_task_signal.clear() self.exist_swapped_task_signal.clear() @@ -997,7 +999,9 @@ def _exit_sub_services(self): try: os.killpg(self.worker_proc.pid, signal.SIGTERM) except Exception as e: - print(f"Error extracting sub services: {e}, {str(traceback.format_exc())}") + error_msg = f"Error extracting sub services: {e}, {str(traceback.format_exc())}" + print(error_msg) + llm_logger.error(error_msg) self.engine_worker_queue.cleanup() if hasattr(self, "zmq_server") and self.zmq_server is not None: diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index a0d28eedaa..28f4cb41bd 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -348,7 +348,7 @@ async def chat_completion_stream_generator( except Exception as e: error_data = self._create_streaming_error_response( - f"equest[{request_id}] generate stream error: {str(e)}, {str(traceback.format_exc())}" + f"request[{request_id}] generate stream error: {str(e)}, {str(traceback.format_exc())}" ) yield f"data: {error_data}\n\n" finally: diff --git a/test/ce/deploy/deploy.py b/test/ce/deploy/deploy.py index 50e540a997..3947d22288 100644 --- a/test/ce/deploy/deploy.py +++ b/test/ce/deploy/deploy.py @@ -185,7 +185,7 @@ def stop_server(signum=None, frame=None): os.kill(int(pid), signal.SIGKILL) print(f"Killed process on port {port}, pid={pid}") except Exception as e: - print(f"Failed to killed process on port: {e}, {str(traceback.format_exc())}") + print(f"Failed to kill process on port: {e}, {str(traceback.format_exc())}") # 若log目录存在,则重命名为log_timestamp if os.path.isdir("./log"): os.rename("./log", "./log_{}".format(time.strftime("%Y%m%d%H%M%S"))) From 4d8d46af76a158a6c50d81dd02b77d65ed26ddbf Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 19 Aug 2025 17:23:59 +0800 Subject: [PATCH 16/20] update structred output code --- fastdeploy/engine/engine.py | 2 ++ fastdeploy/engine/request.py | 12 +++++------- test/ci_use/EB_Lite/test_EB_Lite_serving.py | 15 +++++++++------ test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py | 15 +++++++++------ 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index a403017fae..1522fd7acd 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -492,8 +492,10 @@ def _has_guided_input(self, request): for x in ( request.guided_json, request.guided_regex, + request.guided_choice, request.structural_tag, request.guided_grammar, + request.guided_json_object, ) ) diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index a703ac010d..57abf71086 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -221,13 +221,11 @@ def set(self, key, value): setattr(self, key, value) def __repr__(self) -> str: - return ( - f"Request(request_id={self.request_id}, " - f"prompt={self.prompt!r}, " - f"prompt_token_ids={self.prompt_token_ids}, " - f"draft_token_ids={self.draft_token_ids}, " - f"sampling_params={self.sampling_params})" - ) + non_none_fields = [] + for attr, value in vars(self).items(): + if value is not None and not attr.startswith("_"): + non_none_fields.append(f"{attr}={value!r}") 
+ return f"Request({', '.join(non_none_fields)})" @dataclass(slots=True) diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py index 9c1689fcb3..bb00259ee1 100644 --- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -1162,12 +1162,15 @@ def test_structured_outputs_structural_tag(openai_client): }, } - expect_str = '{"timezone": "Asia/Shanghai"}' + expect_str1 = "get_current_date" + expect_str2 = "Asia/Shanghai" response = streaming_chat_base(openai_client, structural_tag_param) - assert response == expect_str, f"structural_tag streaming response: {response} is not as expected" + assert expect_str1 in response, f"structural_tag streaming response: {response} is not as expected" + assert expect_str2 in response, f"structural_tag streaming response: {response} is not as expected" response = non_streaming_chat_base(openai_client, structural_tag_param) - assert response == expect_str, f"structural_tag non_streaming response: {response} is not as expected" + assert expect_str1 in response, f"structural_tag non_streaming response: {response} is not as expected" + assert expect_str2 in response, f"structural_tag non_streaming response: {response} is not as expected" def test_structured_outputs_choice(openai_client): @@ -1219,11 +1222,11 @@ def test_structured_outputs_regex(openai_client): response = streaming_chat_base(openai_client, regex_param) assert re.fullmatch( - r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response + r"^https:\/\/www\.[a-zA-Z]+\.com\/?$\n", response ), f"regex streaming response: {response} is not as expected" response = non_streaming_chat_base(openai_client, regex_param) assert re.fullmatch( - r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response + r"^https:\/\/www\.[a-zA-Z]+\.com\/?$\n", response ), f"regex non_streaming response: {response} is not as expected" @@ -1267,7 +1270,7 @@ def test_structured_outputs_grammar(openai_client): import re - pattern = r'^[A-Za-z0-9 ]+$' + pattern = r'^[A-Za-z0-9 ]+$' response = streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected" response = non_streaming_chat_base(openai_client, grammar_param) diff --git a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index 42dd91ee3e..819a2fdeec 100644 --- a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -766,12 +766,15 @@ def test_structured_outputs_structural_tag(openai_client): }, } - expect_str = '{"timezone": "Asia/Shanghai"}' + expect_str1 = "get_current_date" + expect_str2 = "Asia/Shanghai" response = streaming_chat_base(openai_client, structural_tag_param) - assert response == expect_str, f"structural_tag streaming response: {response} is not as expected" + assert expect_str1 in response, f"structural_tag streaming response: {response} is not as expected" + assert expect_str2 in response, f"structural_tag streaming response: {response} is not as expected" response = non_streaming_chat_base(openai_client, structural_tag_param) - assert response == expect_str, f"structural_tag non_streaming response: {response} is not as expected" + assert expect_str1 in response, f"structural_tag non_streaming response: {response} is not as expected" + assert expect_str2 in response, f"structural_tag non_streaming response: {response} is not as expected" def test_structured_outputs_choice(openai_client): @@ 
-823,11 +826,11 @@ def test_structured_outputs_regex(openai_client): response = streaming_chat_base(openai_client, regex_param) assert re.fullmatch( - r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response + r"^https:\/\/www\.[a-zA-Z]+\.com\/?$\n", response ), f"regex streaming response: {response} is not as expected" response = non_streaming_chat_base(openai_client, regex_param) assert re.fullmatch( - r"^https:\/\/www\.[a-zA-Z]+\.com\/?$", response + r"^https:\/\/www\.[a-zA-Z]+\.com\/?$\n", response ), f"regex non_streaming response: {response} is not as expected" @@ -871,7 +874,7 @@ def test_structured_outputs_grammar(openai_client): import re - pattern = r'^[A-Za-z0-9 ]+$' + pattern = r'^[A-Za-z0-9 ]+$' response = streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected" response = non_streaming_chat_base(openai_client, grammar_param) From 4d0b1e45126f6e3e8c9c78bcba1f6871579e4919 Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 19 Aug 2025 17:25:13 +0800 Subject: [PATCH 17/20] update code --- fastdeploy/engine/engine.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index b6c52b80de..3494186fa4 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -985,9 +985,9 @@ def _exit_sub_services(self): try: os.killpg(p.pid, signal.SIGTERM) except Exception as e: - error_msg = f"Error killing cache manager process {p.pid}: {e}, {str(traceback.format_exc())}" - print(error_msg) - llm_logger.error(error_msg) + console_logger.error( + f"Error killing cache manager process {p.pid}: {e}, {str(traceback.format_exc())}" + ) self.worker_ready_signal.clear() self.exist_task_signal.clear() self.exist_swapped_task_signal.clear() @@ -1000,9 +1000,7 @@ def _exit_sub_services(self): try: os.killpg(self.worker_proc.pid, signal.SIGTERM) except Exception as e: - error_msg = f"Error extracting sub services: {e}, {str(traceback.format_exc())}" - print(error_msg) - llm_logger.error(error_msg) + console_logger.error(f"Error extracting sub services: {e}, {str(traceback.format_exc())}") self.engine_worker_queue.cleanup() if hasattr(self, "zmq_server") and self.zmq_server is not None: From 7ad87f6ee09be75a7aca857f3be70db41f6fbd0a Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 19 Aug 2025 20:33:38 +0800 Subject: [PATCH 18/20] update code --- fastdeploy/engine/request.py | 4 +-- fastdeploy/input/ernie_processor.py | 5 ++-- fastdeploy/input/ernie_vl_processor.py | 8 +++--- fastdeploy/input/text_processor.py | 36 +++++--------------------- fastdeploy/worker/gpu_model_runner.py | 8 ++++-- 5 files changed, 21 insertions(+), 40 deletions(-) diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index c274b9bed9..0131188e06 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -70,7 +70,7 @@ def __init__( guided_grammar: Optional[Any] = None, structural_tag: Optional[Any] = None, guided_json_object: Optional[bool] = None, - enable_thinking: Optional[bool] = None, + enable_thinking: Optional[bool] = True, trace_carrier: dict = dict(), chat_template: Optional[str] = None, ) -> None: @@ -153,7 +153,7 @@ def from_dict(cls, d: dict): guided_grammar=d.get("guided_grammar", None), structural_tag=d.get("structural_tag", None), guided_json_object=d.get("guided_json_object", None), - enable_thinking=d.get("enable_thinking", None), + enable_thinking=d.get("enable_thinking", True), 
trace_carrier=d.get("trace_carrier", {}), chat_template=d.get("chat_template", None), ) diff --git a/fastdeploy/input/ernie_processor.py b/fastdeploy/input/ernie_processor.py index 6d7d2896bf..07e0d6cbaf 100644 --- a/fastdeploy/input/ernie_processor.py +++ b/fastdeploy/input/ernie_processor.py @@ -257,6 +257,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ + enable_thinking = kwargs.get("enable_thinking") token_ids = response_dict["outputs"]["token_ids"] is_end = response_dict["finished"] req_id = response_dict["request_id"] @@ -266,7 +267,6 @@ def process_response_dict_normal(self, response_dict, **kwargs): delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id) if is_end: full_text = previous_texts + delta_text - enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) if self.reasoning_parser and ( enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser" ): @@ -296,6 +296,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ + enable_thinking = kwargs.get("enable_thinking") is_end = response_dict["finished"] req_id = response_dict["request_id"] token_ids = response_dict["outputs"]["token_ids"] @@ -304,9 +305,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): if token_ids[-1] == self.tokenizer.eos_token_id: token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) - response_dict["outputs"]["raw_prediction"] = delta_text - enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) if self.reasoning_parser and ( enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser" ): diff --git a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py index 9e824bdadc..5c64952a14 100644 --- a/fastdeploy/input/ernie_vl_processor.py +++ b/fastdeploy/input/ernie_vl_processor.py @@ -105,9 +105,6 @@ def set_value(req, key, value): set_value(request, "repetition_penalty", 1.0) set_value(request, "frequency_penalty", 0.0) set_value(request, "presence_penalty", 0.0) - - enable_thinking = self.get_enable_thinking(request.get("enable_thinking", None)) - set_value(request, "enable_thinking", enable_thinking) return request def process_request(self, request, max_model_len=None, **kwargs): @@ -198,6 +195,7 @@ def _check_mm_limits(self, item): def process_request_dict(self, request, max_model_len=None): """process the input data""" + request = self._apply_default_parameters(request) if not request.get("eos_token_ids"): request["eos_token_ids"] = self.eos_token_ids @@ -292,7 +290,9 @@ def process_response_dict(self, response_dict, stream, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = self.get_enable_thinking(kwargs.pop("enable_thinking", None)) + enable_thinking = kwargs.pop("enable_thinking", True) + if enable_thinking is None: + enable_thinking = True if stream: return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) else: diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index 1ebc4cf846..dc2d91cb87 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -50,25 +50,6 @@ def __init__(self): ) ) - def get_enable_thinking(self, enable_thinking=None): - """ - get enable_thinking param - - 1. 
if enable_thinking is None: - 1.1 if reasoning_parser is not None, set enable_thinking to True. - 1.2 if reasoning_parser is None, set enable_thinking to False. - 2. if reasoning_parser is None but enable_thinking is True, set enable_thinking to False and print warning. - - """ - if enable_thinking is None: - enable_thinking = False if self.reasoning_parser is None else True - if enable_thinking and self.reasoning_parser is None: - enable_thinking = False - data_processor_logger.warning( - "enable_thinking is True, but reasoning_parser is None. " "enable_thinking will be set to False." - ) - return enable_thinking - def _apply_default_parameters(self, request): """ Apply default value for parameters in request @@ -88,10 +69,6 @@ def set_value(req, key, value): set_value(request, "repetition_penalty", 1.0) set_value(request, "frequency_penalty", 0.0) set_value(request, "presence_penalty", 0.0) - - enable_thinking = self.get_enable_thinking(request.get("enable_thinking")) - set_value(request, "enable_thinking", enable_thinking) - return request @abstractmethod @@ -237,7 +214,6 @@ def process_request(self, request, max_model_len=None, **kwargs): request.set("stop_token_ids", stop_seqs) request.set("stop_seqs_len", stop_seqs_len) - request.set("enable_thinking", self.get_enable_thinking(kwargs.get("enable_thinking"))) if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0: if request.prompt is not None: request.prompt_token_ids = self.text2ids(request.prompt, max_model_len) @@ -253,6 +229,7 @@ def process_request(self, request, max_model_len=None, **kwargs): task[k] = v else: raise ValueError("Invalid input: chat_template_kwargs must be a dict") + task.setdefault("enable_thinking", True) request.prompt_token_ids = self.messages2ids(task) else: raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.") @@ -283,7 +260,6 @@ def process_request_dict(self, request, max_model_len=None, **kwargs): str: error message """ request = self._apply_default_parameters(request) - request["enable_thinking"] = self.get_enable_thinking(kwargs.get("enable_thinking")) if not request.get("eos_token_ids"): request["eos_token_ids"] = self.eos_token_ids @@ -311,6 +287,7 @@ def process_request_dict(self, request, max_model_len=None, **kwargs): request[k] = v else: raise ValueError("Invalid input: chat_template_kwargs must be a dict") + request.setdefault("enable_thinking", True) request["prompt_token_ids"] = self.messages2ids(request) else: raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}") @@ -374,6 +351,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ + enable_thinking = kwargs.get("enable_thinking") token_ids = response_dict["outputs"]["token_ids"] is_end = response_dict["finished"] req_id = response_dict["request_id"] @@ -383,7 +361,6 @@ def process_response_dict_normal(self, response_dict, **kwargs): delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id) if is_end: full_text = previous_texts + delta_text - enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) response_dict["outputs"]["raw_prediction"] = full_text if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) @@ -411,6 +388,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ + enable_thinking = 
kwargs.get("enable_thinking") is_end = response_dict["finished"] req_id = response_dict["request_id"] token_ids = response_dict["outputs"]["token_ids"] @@ -419,8 +397,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs): if token_ids[-1] == self.tokenizer.eos_token_id: token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) - - enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking")) response_dict["outputs"]["raw_prediction"] = delta_text if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming( @@ -466,7 +442,9 @@ def process_response_dict(self, response_dict, **kwargs): Returns: Dict: response contain text fields """ - enable_thinking = self.get_enable_thinking(kwargs.pop("enable_thinking", None)) + enable_thinking = kwargs.pop("enable_thinking", True) + if enable_thinking is None: + enable_thinking = True stream = kwargs.get("stream", True) if stream: return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 9ecdc27b53..4ef329491e 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -202,10 +202,13 @@ def _init_logits_processor(self, request): elif request.structural_tag is not None: schemata_key = ("structural_tag", request.structural_tag) + enable_thinking = request.get("enable_thinking", True) + enable_thinking = enable_thinking if enable_thinking is not None else True + return ( self.guided_backend.get_logits_processor( schemata_key=schemata_key, - enable_thinking=request.get("enable_thinking"), + enable_thinking=enable_thinking, ), schemata_key, ) @@ -468,7 +471,8 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: self.share_inputs["prompt_lens"][idx : idx + 1] = length if self.enable_mm: - enable_thinking = request.get("enable_thinking") + enable_thinking = request.get("enable_thinking", True) + enable_thinking = enable_thinking if enable_thinking is not None else True self.share_inputs["enable_thinking"][:] = enable_thinking self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) From 5ad6432ed1fabca28e5bd25c54ce07f1270ed809 Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 28 Aug 2025 11:59:33 +0800 Subject: [PATCH 19/20] update config --- fastdeploy/config.py | 12 +- fastdeploy/engine/config.py | 438 ------------------------------------ 2 files changed, 6 insertions(+), 444 deletions(-) delete mode 100644 fastdeploy/engine/config.py diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 20e196b243..198868e6c6 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1186,7 +1186,8 @@ def postprocess(self): self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size) if self.guided_decoding_backend == "auto": - if self.model_config.enable_mm: + if current_platform.is_xpu() or self.speculative_config.method is not None: + logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.") self.guided_decoding_backend = "off" else: self.guided_decoding_backend = "xgrammar" @@ -1256,12 +1257,10 @@ def check(self): ], f"Only support xgrammar、auto guided decoding backend, but got 
{self.guided_decoding_backend}." if self.guided_decoding_backend != "off": - # TODO: mm support guided_decoding - assert ( - self.model_config.enable_mm is False - ), "Multimodal model currently do not support guided_decoding" - # TODO: speculative decoding support guided_decoding + assert ( + self.speculative_config.method is None + ), "speculative decoding currently do not support guided_decoding" # TODO: xpu support guided_decoding assert not current_platform.is_xpu(), "XPU currently do not support guided_decoding" @@ -1272,6 +1271,7 @@ def check(self): raise Exception( f"import XGrammar failed, please install XGrammar use `pip install xgrammar==0.1.19`. \n\t {e}" ) + if self.scheduler_config is not None: self.scheduler_config.check() diff --git a/fastdeploy/engine/config.py b/fastdeploy/engine/config.py deleted file mode 100644 index eb3595b69c..0000000000 --- a/fastdeploy/engine/config.py +++ /dev/null @@ -1,438 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import json -import os -from datetime import datetime -from typing import Any, Dict, List, Optional - -from fastdeploy.config import ( - CacheConfig, - CommitConfig, - LoadConfig, - ModelConfig, - ParallelConfig, -) -from fastdeploy.multimodal.registry import MultimodalRegistry -from fastdeploy.platforms import current_platform -from fastdeploy.scheduler import SchedulerConfig -from fastdeploy.utils import ceil_div, get_host_ip, is_port_available, llm_logger - - -class Config: - """ - Initial configuration class. - - Attributes: - model_config (ModelConfig): Model configuration object. - cache_config (CacheConfig): Cache configuration object. - model_name_or_path (str): Directory path to the model or the model name. - tokenizer (Optional[str]): Default is the model. - max_num_batched_tokens (Optional[int]): Maximum number of batched tokens. - tensor_parallel_size (int): Tensor parallel size. - nnode (int): Number of nodes. - max_model_len (int): Maximum model length. Default is 8192. - max_num_seqs (int): Maximum number of sequences. Default is 8. - mm_processor_kwargs (Optional[Dict[str, Any]]): Additional arguments for multi-modal processor. - speculative_config (Optional[Dict[str, Any]]): Speculative execution configuration. - use_warmup (bool): Flag to use warmup. - engine_worker_queue_port (int): Port for engine worker queue. - enable_mm (bool): Flag to enable multi-modal processing. - reasoning_parser(str): Flag specifies the reasoning parser to use for - extracting reasoning content from the model output - splitwise_role (str): Splitwise role. - innode_prefill_ports (Optional[List[int]]): Innode prefill ports. - Temporary configuration, will be removed in the future. - load_choices(str):The format of the model weights to load. 
.Default is default - """ - - def __init__( - self, - model_config: ModelConfig, - cache_config: CacheConfig, - scheduler_config: SchedulerConfig, - parallel_config: ParallelConfig, - load_config: LoadConfig, - commit_config: CommitConfig = CommitConfig(), - model_name_or_path: str = None, - tokenizer: str = None, - tensor_parallel_size: int = 8, - max_model_len: int = 8192, - max_num_seqs: int = 8, - max_num_batched_tokens: Optional[int] = None, - ips: str = None, - speculative_config: Optional[Dict[str, Any]] = None, - graph_optimization_config: Optional[Dict[str, Any]] = None, - use_warmup: bool = False, - engine_worker_queue_port: int = 8002, - limit_mm_per_prompt: Optional[Dict[str, Any]] = None, - mm_processor_kwargs: Optional[Dict[str, Any]] = None, - # enable_mm: bool = False, - splitwise_role: str = "mixed", - innode_prefill_ports: Optional[List[int]] = None, - max_num_partial_prefills: int = 1, - max_long_partial_prefills: int = 1, - long_prefill_token_threshold: int = 0, - reasoning_parser: str = None, - tool_parser: str = None, - guided_decoding_backend: Optional[str] = None, - disable_any_whitespace: bool = False, - enable_logprob: bool = False, - early_stop_config: Optional[Dict[str, Any]] = None, - load_choices: str = "default", - ): - """ - Initialize the Config class. - - Args: - model_config (ModelConfig): Model configuration object. - cache_config (CacheConfig): Cache configuration object. - parallel_config (ParallelConfig): Parallel configuration object. - scheduler_config (SchedulerConfig): Scheduler configuration object. - model_name_or_path (str): Model directory path or model name. - tokenizer (str): Default is the model. - tensor_parallel_size (int): Tensor parallel size. Default is 8. - max_model_len (int): Maximum model length. Default is 8192. - max_num_seqs (int): Maximum number of sequences. Default is 8. - max_num_batched_tokens (Optional[int]): Maximum number of batched tokens. Default is None. - mm_processor_kwargs (Optional[Dict[str, Any]]): Additional arguments for multi-modal processor. Default is None. - speculative_config (Optional[Dict[str, Any]]): Speculative execution configuration. Default is None. - graph_optimization_config (Optional[Dict[str, Any]]): Graph optimizaion backend execution configuration. Default is None. - use_warmup (bool): Flag to use warmup. Default is False. - engine_worker_queue_port (int): Engine worker queue port. Default is 8002. - enable_mm (bool): Flag to enable multi-modal processing. Default is False. - splitwise_role (str): Splitwise role. Default is "mixed". - innode_prefill_ports (Optional[List[int]]): Innode prefill ports. Default is None. - reasoning_parser (str): Flag specifies the reasoning parser to use for - extracting reasoning content from the model output. Default is None. - guided_decoding_backend(str): Guided decoding backend. Default is None. - disable_any_whitespace(bool): Disable any whitespace when using guided decoding. - Default is False. - enable_logprob(bool): Enable logprob. Default is False. - early_stop_config (Optional[Dict[str, Any]]): Early stop configuration. Default is None. - load_choices(str):The format of the model weights to load. 
.Default is default - """ - self.model_config = model_config - self.cache_config = cache_config - self.scheduler_config = scheduler_config - self.parallel_config = parallel_config - self.load_config = load_config - self.commit_config = commit_config - self.model_name_or_path = model_name_or_path - self.tokenizer = tokenizer - self.max_num_batched_tokens = max_num_batched_tokens - self.tensor_parallel_size = tensor_parallel_size - self.ips = ips - - if self.ips is None: - self.master_ip = "0.0.0.0" - elif isinstance(self.ips, list): - self.master_ip = self.ips[0] - else: - self.ips = self.ips.split(",") - self.master_ip = self.ips[0] - - if self.ips is None: - self.nnode = 1 - self.node_rank = 0 - else: - self.nnode = len(self.ips) - - for idx, ip in enumerate(self.ips): - if ip == self.master_ip: - self.node_rank = idx - - self.max_model_len = max_model_len - self.max_num_seqs = max_num_seqs - self.limit_mm_per_prompt = limit_mm_per_prompt - self.mm_processor_kwargs = mm_processor_kwargs - # self.enable_mm = enable_mm - self.speculative_config = speculative_config - self.use_warmup = use_warmup - self.splitwise_role = splitwise_role - self.innode_prefill_ports = innode_prefill_ports - self.max_num_partial_prefills = max_num_partial_prefills - self.max_long_partial_prefills = max_long_partial_prefills - self.long_prefill_token_threshold = long_prefill_token_threshold - self.reasoning_parser = reasoning_parser - self.tool_parser = tool_parser - self.graph_optimization_config = graph_optimization_config - self.early_stop_config = early_stop_config - self.guided_decoding_backend = guided_decoding_backend - self.disable_any_whitespace = disable_any_whitespace - self._str_to_list("innode_prefill_ports", int) - self.load_choices = load_choices - - assert self.splitwise_role in ["mixed", "prefill", "decode"] - - import fastdeploy.model_executor.models # noqa: F401 - - architectures = self.model_config.architectures[0] - if MultimodalRegistry.contains_model(architectures): - self.enable_mm = True - else: - self.enable_mm = False - - # TODO - self.max_prefill_batch = 3 - if current_platform.is_xpu(): - self.max_prefill_batch = 1 - if self.enable_mm: - self.max_prefill_batch = 1 # TODO:当前多模prefill阶段只支持并行度为1,待优化 - - # TODO(@wufeisheng): TP and EP need to be supported simultaneously. 
- assert (self.tensor_parallel_size == 1 and self.parallel_config.expert_parallel_size >= 1) or ( - self.tensor_parallel_size >= 1 and self.parallel_config.expert_parallel_size == 1 - ), "TP and EP cannot be enabled at the same time" - - num_ranks = self.tensor_parallel_size * self.parallel_config.expert_parallel_size - self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8 - if num_ranks > self.max_chips_per_node: - self.worker_num_per_node = self.max_chips_per_node - nnode = ceil_div(num_ranks, self.worker_num_per_node) - assert nnode == self.nnode, f"nnode: {nnode}, but got {self.nnode}" - else: - self.worker_num_per_node = num_ranks - - self.engine_worker_queue_port = engine_worker_queue_port - self.device_ids = ",".join([str(i) for i in range(self.worker_num_per_node)]) - self.device_ids = os.getenv("CUDA_VISIBLE_DEVICES", self.device_ids) - if current_platform.is_xpu(): - self.device_ids = os.getenv("XPU_VISIBLE_DEVICES", self.device_ids) - - self.enable_logprob = enable_logprob - - self.read_from_config() - self.postprocess() - self.check() - self.print() - - def postprocess(self): - """ - calculate some parameters - """ - assert ( - self.device_ids.split(",").__len__() == self.worker_num_per_node - ), f"invalid CUDA_VISIBLE_DEVICES, should be equal to {self.worker_num_per_node}" - - self.local_device_ids = self.device_ids.split(",")[: self.tensor_parallel_size] - - self.host_ip = get_host_ip() - - if self.ips is None or self.host_ip == self.master_ip: - self.is_master = True - else: - self.is_master = False - - if self.tensor_parallel_size <= self.worker_num_per_node: - self.is_master = True - - import paddle - - self.paddle_commit_id = paddle.version.commit - - if self.max_num_batched_tokens is None: - if self.cache_config.enable_chunked_prefill: - self.max_num_batched_tokens = 2048 - else: - if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")): - self.max_num_batched_tokens = self.max_model_len - else: - self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM - - if self.long_prefill_token_threshold == 0: - self.long_prefill_token_threshold = int(self.max_model_len * 0.04) - - self.cache_config.postprocess(self.max_num_batched_tokens, self.max_num_seqs) - self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size) - - if self.guided_decoding_backend == "auto": - if current_platform.is_xpu() or self.speculative_config.method is not None: - llm_logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.") - self.guided_decoding_backend = "off" - else: - self.guided_decoding_backend = "xgrammar" - - def check(self): - """ - check the legality of config - """ - assert self.max_num_seqs <= 256, ( - "The parameter `max_num_seqs` is not allowed to exceed 256, " f"but now it's {self.max_num_seqs}." - ) - assert is_port_available( - "0.0.0.0", self.engine_worker_queue_port - ), f"The parameter `engine_worker_queue_port`:{self.engine_worker_queue_port} is already in use." 
- assert self.nnode >= 1, f"nnode: {self.nnode} should no less than 1" - assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be larger than 16" - assert self.max_num_seqs >= 1, f"max_num_seqs: {self.max_num_seqs} should be larger than 1" - assert self.max_num_batched_tokens >= self.max_num_seqs, ( - f"max_num_batched_tokens: {self.max_num_batched_tokens} " - f"should be larger than or equal to max_num_seqs: {self.max_num_seqs}" - ) - assert self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs, ( - f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger" - f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}" - ) - assert ( - self.max_num_partial_prefills >= 1 - ), f"max_num_partial_prefills: {self.max_num_partial_prefills} should be larger than or equal to 1" - - assert ( - self.max_long_partial_prefills >= 1 - ), f"max_long_partial_prefills: {self.max_long_partial_prefills} should be larger than or equal to 1" - assert self.max_long_partial_prefills <= self.max_num_partial_prefills, ( - f"max_long_partial_prefills: {self.max_long_partial_prefills} should " - f"be less than or equal to max_num_partial_prefills: {self.max_num_partial_prefills}" - ) - - if not self.cache_config.enable_chunked_prefill: - if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")): - assert self.max_num_batched_tokens >= self.max_model_len, ( - f"max_num_batched_tokens: {self.max_num_batched_tokens} " - f"should be larger than or equal to max_model_len: {self.max_model_len}" - ) - else: - assert self.max_num_batched_tokens >= self.cache_config.block_size, ( - f"max_num_batched_tokens: {self.max_num_batched_tokens} " - f"should be larger than or equal to block_size: {self.cache_config.block_size}" - ) - - if self.max_num_partial_prefills > 1: - assert ( - self.cache_config.enable_chunked_prefill is True - ), "Chunked prefill must be enabled to set max_num_partial_prefills > 1" - assert self.long_prefill_token_threshold < self.max_model_len, ( - f"long_prefill_token_threshold: {self.long_prefill_token_threshold} should be less than" - f" max_model_len: {self.max_model_len}" - ) - - if self.guided_decoding_backend is not None: - assert self.guided_decoding_backend in [ - "xgrammar", - "XGrammar", - "auto", - "off", - ], f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}." - - if self.guided_decoding_backend != "off": - - # TODO: speculative decoding support guided_decoding - assert ( - self.speculative_config.method is None - ), "speculative decoding currently do not support guided_decoding" - - # TODO: xpu support guided_decoding - assert not current_platform.is_xpu(), "XPU currently do not support guided_decoding" - - try: - import xgrammar # noqa - except Exception as e: - raise Exception( - f"import XGrammar failed, please install XGrammar use `pip install xgrammar==0.1.19`. 
\n\t {e}" - ) - - self.scheduler_config.check() - - def print(self, file=None): - """ - print all config - - Args: - file (str): the path of file to save config - """ - llm_logger.info("=================== Configuration Information ===============") - for k, v in self.__dict__.items(): - if k == "generation_config" and v is not None: - for gck, gcv in v.to_dict().items(): - llm_logger.info("{:<20}:{:<6}{}".format(gck, "", gcv)) - elif ( - k == "cache_config" - or k == "model_config" - or k == "scheduler_config" - or k == "parallel_config" - or k == "commit_config" - or k == "speculative_config" - ): - v.print() - else: - llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) - llm_logger.info("=============================================================") - if file is not None: - f = open(file, "a") - now_time = datetime.now() - f.write(f"{now_time} configuration information as below,\n") - for k, v in self.__dict__.items(): - f.write("{:<20}:{:<6}{}\n".format(k, "", v)) - f.close() - - def init_cache_info(self): - """ - initialize cache info - """ - disaggregate_info = {} - if self.splitwise_role != "mixed": - disaggregate_info["role"] = self.splitwise_role - disaggregate_info["cache_info"] = dict() - current_protocol = self.cache_config.cache_transfer_protocol.split(",") - disaggregate_info["transfer_protocol"] = current_protocol - for protocol in current_protocol: - if protocol == "ipc": - disaggregate_info["cache_info"][protocol] = { - "ip": self.host_ip, - "port": self.engine_worker_queue_port, - "device_ids": self.local_device_ids, - } - elif protocol == "rdma": - disaggregate_info["cache_info"][protocol] = { - "ip": self.host_ip, - "port": self.cache_config.pd_comm_port[0], - "rdma_port": self.cache_config.rdma_comm_ports, - } - self.disaggregate_info = disaggregate_info - llm_logger.info(f"disaggregate_info: {self.disaggregate_info}") - - def read_from_config(self): - """ - reset model config from json file - """ - - def reset_value(cls, value_name, key): - if hasattr(cls, key): - value = getattr(cls, key) - setattr(cls, value_name, value) - llm_logger.info(f"Reset parameter {value_name} = {value} from configuration.") - - reset_value(self.cache_config, "block_size", "infer_model_block_size") - reset_value( - self.model_config, - "return_full_hidden_states", - "return_full_hidden_states", - ) - reset_value(self.cache_config, "cache_dtype", "infer_model_dtype") - - def _check_master(self): - return self.is_master - - def _str_to_list(self, attr_name, default_type): - if hasattr(self, attr_name): - val = getattr(self, attr_name) - if type(val) is str: - setattr(self, attr_name, [default_type(i) for i in val.split(",")]) - else: - setattr(self, attr_name, val) - - def __str__(self) -> str: - return json.dumps(self.__dict__, indent=4) From da8d37a29f076cbd432000609872b60397aa9d61 Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 28 Aug 2025 12:03:27 +0800 Subject: [PATCH 20/20] update torch version --- scripts/run_pre_ce.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_pre_ce.sh b/scripts/run_pre_ce.sh index 8ec9af730a..ab36dac961 100644 --- a/scripts/run_pre_ce.sh +++ b/scripts/run_pre_ce.sh @@ -7,7 +7,7 @@ python -m pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/p python -m pip install -r requirements.txt python -m pip install jsonschema aistudio_sdk==0.3.5 -python -m pip install xgrammar==0.1.19 +python -m pip install xgrammar==0.1.19 torch==2.6.0 failed_files=() run_path="$DIR/../tests/ci_use/"
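
---

The bulk of the hunks above apply a single convention: every `except` block logs `str(e)` together with `traceback.format_exc()` in one log line, instead of the bare exception message. Below is a minimal standalone sketch of that pattern for reference; the logger, file path, and helper names here are hypothetical illustrations, not FastDeploy code.

```python
# Illustrative sketch of the logging convention used in the patches above:
# record both the exception message and the full traceback in a single entry.
import logging
import traceback

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("example")


def load_config(path: str) -> dict:
    """Hypothetical helper, used only to trigger an exception for the demo."""
    with open(path) as f:
        return {"raw": f.read()}


def safe_load(path: str):
    try:
        return load_config(path)
    except Exception as e:
        # Same shape as the patched handlers: message plus traceback in one line.
        error_msg = f"load_config failed: {e}, {str(traceback.format_exc())}"
        logger.error(error_msg)
        return None


if __name__ == "__main__":
    safe_load("/nonexistent/config.yaml")
```

Keeping the message and the stack trace in the same entry means a single grep on the request id or error text surfaces the full context, at the cost of longer log lines; that trade-off is what the serving, scheduler, and platform hunks above standardize on.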