Merged

Changes from all 37 commits:
- a829d0e: mm support structured output (kevincheng2, Jul 17, 2025)
- 6bd3676: update code (kevincheng2, Jul 18, 2025)
- 65458b3: update code (kevincheng2, Aug 1, 2025)
- 06df709: Merge branch 'develop' into mm_structred_output (kevincheng2, Aug 1, 2025)
- f1141fb: update format (kevincheng2, Aug 1, 2025)
- b8f8d71: update code (kevincheng2, Aug 4, 2025)
- ce01f29: update code (kevincheng2, Aug 4, 2025)
- c2d64b9: add enable_thinking default (kevincheng2, Aug 4, 2025)
- da81a94: update code (kevincheng2, Aug 5, 2025)
- 3e9bba5: Merge remote-tracking branch 'origin/develop' into mm_structred_output (kevincheng2, Aug 5, 2025)
- 2557839: add structured_outputs test case (kevincheng2, Aug 8, 2025)
- 278d3bd: Merge branch 'develop' into mm_structred_output (kevincheng2, Aug 8, 2025)
- 3ff2a4d: add ci install xgrammar (kevincheng2, Aug 8, 2025)
- 83df9a4: add ci timeout time (kevincheng2, Aug 8, 2025)
- 9a41035: update test for structured_outputs (kevincheng2, Aug 11, 2025)
- b1c6b0f: Merge branch 'develop' into mm_structred_output (kevincheng2, Aug 11, 2025)
- f0ea999: update code (kevincheng2, Aug 11, 2025)
- 1fe01e7: add error traceback info (kevincheng2, Aug 15, 2025)
- 0104df7: Merge branch 'develop' into develop (kevincheng2, Aug 15, 2025)
- 0c69ebf: Merge branch 'develop' into develop (kevincheng2, Aug 18, 2025)
- bb95bb5: Merge branch 'PaddlePaddle:develop' into develop (kevincheng2, Aug 19, 2025)
- eea3877: update error msg (kevincheng2, Aug 19, 2025)
- d1390ee: Merge remote-tracking branch 'upstream/develop' into develop (kevincheng2, Aug 19, 2025)
- 1a1bcb2: Merge branch 'develop' into develop (Jiang-Jia-Jun, Aug 19, 2025)
- 4d8d46a: update structred output code (kevincheng2, Aug 19, 2025)
- 4d0b1e4: update code (kevincheng2, Aug 19, 2025)
- 0c03b69: Merge branch 'develop' into develop (Jiang-Jia-Jun, Aug 19, 2025)
- db0aa7a: Merge remote-tracking branch 'upstream/develop' into mm_structred_output (kevincheng2, Aug 19, 2025)
- a9a523f: Merge branch 'PaddlePaddle:develop' into develop (kevincheng2, Aug 19, 2025)
- 7ad87f6: update code (kevincheng2, Aug 19, 2025)
- 70b95a4: Merge branch 'PaddlePaddle:develop' into develop (kevincheng2, Aug 20, 2025)
- 25b38fa: Merge branch 'develop' into mm_structred_output (kevincheng2, Aug 20, 2025)
- 9ba1d41: Merge remote-tracking branch 'upstream/develop' into mm_structred_output (kevincheng2, Aug 28, 2025)
- 5ad6432: update config (kevincheng2, Aug 28, 2025)
- da8d37a: update torch version (kevincheng2, Aug 28, 2025)
- d8810d1: Merge branch 'develop' into mm_structred_output (Jiang-Jia-Jun, Aug 29, 2025)
- bd5d7a6: Merge branch 'develop' into mm_structred_output (kevincheng2, Sep 2, 2025)
62 changes: 62 additions & 0 deletions docs/features/structured_outputs.md
@@ -330,3 +330,65 @@ ParsedChatCompletionMessage[Info](content='{"addr": "No.1 Century Avenue, Pudong
Address: No.1 Century Avenue, Pudong New Area, Shanghai
Height: 468
```

### Offline Inference

Offline inference restricts the model's output format through pre-specified constraints. In `FastDeploy`, constraints can be specified through the `GuidedDecodingParams` class in `SamplingParams`. `GuidedDecodingParams` supports the following constraint types, and usage mirrors online inference:

```python
json: Optional[Union[str, dict]] = None
regex: Optional[str] = None
choice: Optional[List[str]] = None
grammar: Optional[str] = None
json_object: Optional[bool] = None
structural_tag: Optional[str] = None
```

The following example demonstrates how to use offline inference to generate structured JSON output:

```python
from fastdeploy import LLM, SamplingParams
from fastdeploy.engine.sampling_params import GuidedDecodingParams
from pydantic import BaseModel
from enum import Enum

class BookType(str, Enum):
    romance = "Romance"
    historical = "Historical"
    adventure = "Adventure"
    mystery = "Mystery"
    dystopian = "Dystopian"

class BookDescription(BaseModel):
    author: str
    title: str
    genre: BookType

# Constrained decoding parameters
guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())

# Sampling parameters
sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=6400,
    guided_decoding=guided_decoding_params,
)

# Load model
llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")

outputs = llm.generate(
    prompts="Generate a JSON describing a literary work, including author, title and book type.",
    sampling_params=sampling_params,
)

# Output results
for output in outputs:
    print(output.outputs.text)
```

Output:

```
{"author": "George Orwell", "title": "1984", "genre": "Dystopian"}
```
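
The other constraint types follow the same pattern. As a minimal sketch (reusing the model setup above; the prompt and choice list are illustrative assumptions, not from the original docs), a `choice` constraint restricts generation to one of a fixed set of strings:

```python
from fastdeploy import LLM, SamplingParams
from fastdeploy.engine.sampling_params import GuidedDecodingParams

# Restrict generation to exactly one of the listed strings
guided_decoding_params = GuidedDecodingParams(choice=["Romance", "Historical", "Adventure", "Mystery", "Dystopian"])

sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=64,
    guided_decoding=guided_decoding_params,
)

llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")

outputs = llm.generate(
    prompts="Which genre best describes the novel '1984'?",
    sampling_params=sampling_params,
)

for output in outputs:
    print(output.outputs.text)  # expected to be one of the five choices, e.g. "Dystopian"
```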
64 changes: 64 additions & 0 deletions docs/zh/features/structured_outputs.md
@@ -330,3 +330,67 @@ ParsedChatCompletionMessage[Info](content='{"addr": "上海市浦东新区世纪
Address: No.1 Century Avenue, Pudong New Area, Shanghai
Height: 468
```

### Offline Inference

Offline inference restricts the model's output format through pre-specified constraints. In `FastDeploy`, constraints can be specified through the `GuidedDecodingParams` class in `SamplingParams`. `GuidedDecodingParams` supports the following constraint types, and usage can follow the online inference examples:

```python
json: Optional[Union[str, dict]] = None
regex: Optional[str] = None
choice: Optional[List[str]] = None
grammar: Optional[str] = None
json_object: Optional[bool] = None
structural_tag: Optional[str] = None
```

The following example demonstrates how to use offline inference to generate structured JSON output:

```python
from fastdeploy import LLM, SamplingParams
from fastdeploy.engine.sampling_params import GuidedDecodingParams
from pydantic import BaseModel
from enum import Enum

class BookType(str, Enum):
    romance = "Romance"
    historical = "Historical"
    adventure = "Adventure"
    mystery = "Mystery"
    dystopian = "Dystopian"

class BookDescription(BaseModel):
    author: str
    title: str
    genre: BookType

# Constrained decoding parameters
guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())

# Sampling parameters
sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=6400,
    guided_decoding=guided_decoding_params,
)

# Load model
llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")

outputs = llm.generate(
    prompts="Generate a JSON describing a famous Chinese literary work, including author, title and book type.",
    sampling_params=sampling_params,
)

# Output results
for output in outputs:
    print(output.outputs.text)
```

Output:

```
{"author": "曹雪芹", "title": "红楼梦", "genre": "Historical"}
```
15 changes: 8 additions & 7 deletions fastdeploy/config.py
@@ -127,12 +127,13 @@ def __init__(
        self.redundant_experts_num = 0
        self.seed = 0
        self.quantization = None
        self.reasoning_parser = None
        self.pad_token_id: int = -1
        self.eos_tokens_lens: int = 2
        self.lm_head_fp32: bool = False
        self.model_format = "auto"
        for key, value in args.items():
            if hasattr(self, key) and value != "None":
                setattr(self, key, value)

        assert self.model != ""
@@ -1249,7 +1250,8 @@ def postprocess(self):
        self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)

        if self.guided_decoding_backend == "auto":
            if self.model_config.enable_mm:
            if current_platform.is_xpu() or self.speculative_config.method is not None:
                logger.warning("Speculative decoding and XPU currently do not support guided decoding; setting it off.")
                self.guided_decoding_backend = "off"
            else:
                self.guided_decoding_backend = "xgrammar"
@@ -1319,12 +1321,10 @@ def check(self):
], f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}."

if self.guided_decoding_backend != "off":
# TODO: mm support guided_decoding
assert (
self.model_config.enable_mm is False
), "Multimodal model currently do not support guided_decoding"

# TODO: speculative decoding support guided_decoding
assert (
self.speculative_config.method is None
), "speculative decoding currently do not support guided_decoding"

# TODO: xpu support guided_decoding
assert not current_platform.is_xpu(), "XPU currently do not support guided_decoding"
@@ -1335,6 +1335,7 @@
            raise Exception(
                f"import XGrammar failed, please install XGrammar using `pip install xgrammar==0.1.19`. \n\t {e}"
            )

        if self.scheduler_config is not None:
            self.scheduler_config.check()

28 changes: 26 additions & 2 deletions fastdeploy/engine/engine.py
@@ -178,6 +178,22 @@ def _get_generated_result(self):

    # _insert_task_to_worker moved to CommonEngine

    def _has_guided_input(self, request):
        """
        Check if the request has any guided input.
        """
        return any(
            x is not None
            for x in (
                request.guided_json,
                request.guided_regex,
                request.guided_choice,
                request.structural_tag,
                request.guided_grammar,
                request.guided_json_object,
            )
        )

    def add_requests(self, task, sampling_params=None, **kwargs):
        """
        Add a new request to the queue.
@@ -249,8 +265,15 @@ def add_requests(self, task, sampling_params=None, **kwargs):
            llm_logger.error(error_msg)
            raise EngineError(error_msg, error_code=400)

        if self.engine.guided_decoding_checker is not None:
            request, err_msg = self.engine.guided_decoding_checker.schema_format(request)
        if self._has_guided_input(request):
            err_msg = None
            if self.guided_decoding_checker is None:
                err_msg = (
                    "guided_backend is None, use --guided-decoding-backend to specify the backend at server startup."
                )
            else:
                request, err_msg = self.guided_decoding_checker.schema_format(request)

            if err_msg is not None:
                llm_logger.error(err_msg)
                raise EngineError(err_msg, error_code=400)
@@ -469,6 +492,7 @@ def _start_worker_service(self):
f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
f" --load_strategy {self.cfg.load_config.load_strategy}"
f" --early_stop_config '{self.cfg.early_stop_config.to_json_string()}'"
f" --reasoning_parser {self.cfg.reasoning_parser}"
f" --load_choices {self.cfg.load_config.load_choices}"
f" --moba_attention_config '{self.cfg.moba_attention_config.to_json_string()}'"
f" --ips {ips}"
12 changes: 5 additions & 7 deletions fastdeploy/engine/request.py
@@ -263,13 +263,11 @@ def set(self, key, value):
        setattr(self, key, value)

    def __repr__(self) -> str:
        return (
            f"Request(request_id={self.request_id}, "
            f"prompt={self.prompt!r}, "
            f"prompt_token_ids={self.prompt_token_ids}, "
            f"draft_token_ids={self.draft_token_ids}, "
            f"sampling_params={self.sampling_params})"
        )
        non_none_fields = []
        for attr, value in vars(self).items():
            if value is not None and not attr.startswith("_"):
                non_none_fields.append(f"{attr}={value!r}")
        return f"Request({', '.join(non_none_fields)})"


@dataclass(slots=True)
51 changes: 51 additions & 0 deletions fastdeploy/engine/sampling_params.py
@@ -100,6 +100,7 @@ class SamplingParams:
    temp_scaled_logprobs: bool = False
    top_p_normalized_logprobs: bool = False
    bad_words: Optional[List[str]] = None
    guided_decoding: Optional[GuidedDecodingParams] = None
    bad_words_token_ids: Optional[List[int]] = None

    @classmethod
@@ -132,6 +133,7 @@ def from_optional(
        min_tokens=1,
        logprobs=None,
        bad_words=None,
        guided_decoding=None,
        bad_words_token_ids=None,
    ) -> SamplingParams:
        """Create instance from command line arguments"""
@@ -153,6 +155,7 @@
            min_tokens=min_tokens,
            logprobs=logprobs,
            bad_words=bad_words,
            guided_decoding=guided_decoding,
            bad_words_token_ids=bad_words_token_ids,
        )

@@ -217,3 +220,51 @@ class BeamSearchParams:
    temperature: float = 0.0
    length_penalty: float = 1.0
    include_stop_str_in_output: bool = False


@dataclass
class GuidedDecodingParams:
    """Guided decoding parameters for text generation."""

    json: Optional[Union[str, dict]] = None
    regex: Optional[str] = None
    choice: Optional[List[str]] = None
    grammar: Optional[str] = None
    json_object: Optional[bool] = None
    structural_tag: Optional[str] = None

    def to_dict(self):
        """convert to dict"""
        key_dict = {
            "guided_json": self.json,
            "guided_regex": self.regex,
            "guided_choice": self.choice,
            "guided_grammar": self.grammar,
            "structural_tag": self.structural_tag,
            "guided_json_object": self.json_object,
        }

        guided_dict = {}
        for key, value in key_dict.items():
            if value is not None:
                guided_dict[key] = value
        return guided_dict

    def __post_init__(self):
        """Verify the arguments."""
        guided_count = sum(
            [
                self.json is not None,
                self.regex is not None,
                self.choice is not None,
                self.grammar is not None,
                self.json_object is not None,
                self.structural_tag is not None,
            ]
        )

        if guided_count > 1:
            raise ValueError(
                "You can only use one kind of guided decoding "
                "('json', 'json_object', 'regex', 'choice', 'grammar', 'structural_tag')."
            )
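
A brief usage sketch of the dataclass above (the regex value is an arbitrary illustration; the printed dict reflects the key mapping in `to_dict`):

```python
from fastdeploy.engine.sampling_params import GuidedDecodingParams

# A single constraint is accepted and remapped to its request-level key
params = GuidedDecodingParams(regex=r"\d{4}-\d{2}-\d{2}")
print(params.to_dict())  # {'guided_regex': '\\d{4}-\\d{2}-\\d{2}'}

# Setting more than one constraint kind fails fast in __post_init__
try:
    GuidedDecodingParams(json_object=True, choice=["a", "b"])
except ValueError as err:
    print(err)
```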
3 changes: 3 additions & 0 deletions fastdeploy/entrypoints/llm.py
@@ -295,6 +295,9 @@ def _add_request(
                current_sampling_params = sampling_params[i]
            else:
                current_sampling_params = sampling_params
            if current_sampling_params.guided_decoding is not None:
                guided_decoding_dict = current_sampling_params.guided_decoding.to_dict()
                tasks.update(guided_decoding_dict)
            self.llm_engine.add_requests(tasks, current_sampling_params, **kwargs)
        return req_ids

7 changes: 6 additions & 1 deletion fastdeploy/model_executor/guided_decoding/__init__.py
@@ -15,8 +15,13 @@
"""

# from fastdeploy.config import FDConfig
from fastdeploy.model_executor.guided_decoding.base_guided_decoding import (
    BackendBase,
    BaseChecker,
    LogitsProcessorBase,
)

__all__ = ["get_guided_backend", "schema_checker"]
__all__ = ["get_guided_backend", "schema_checker", "LogitsProcessorBase", "BackendBase", "BaseChecker"]


def get_guided_backend(