Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
reasoning-parser: ernie_x1
tool_call_parser: ernie_x1
reasoning-parser: ernie-x1
tool_call_parser: ernie-x1
tensor_parallel_size: 4
max_model_len: 65536
max_num_seqs: 128
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
tensor_parallel_size: 1
max_model_len: 131072
max_num_seqs: 32
reasoning_parser: ernie_x1
tool_call_parser: ernie_x1
reasoning_parser: ernie-x1
tool_call_parser: ernie-x1
load_choices: "default_v1"
quantization: wint8
4 changes: 2 additions & 2 deletions docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ python -m fastdeploy.entrypoints.openai.api_server \
--tensor-parallel-size 1 \
--max-model-len 131072 \
--quantization wint8 \
--reasoning-parser ernie_x1 \
--tool-call-parser ernie_x1 \
--reasoning-parser ernie-x1 \
--tool-call-parser ernie-x1 \
--max-num-seqs 32
```
- `--quantization`: Specifies the quantization strategy used by the model. Different quantization strategies yield different model performance and accuracy. It can be one of `wint8` / `wint4` / `block_wise_fp8` (requires Hopper).
Expand Down
2 changes: 1 addition & 1 deletion docs/usage/environment_variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Whether to use Machete for wint4 dense GEMM.
"FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"),

# Used to truncate the string inserted during thinking when reasoning in a model. (</think> for ernie4_5_vl, \n</think>\n\n for ernie_x1)
# String inserted to cut off the thinking content when its length limit is reached during reasoning. (</think> for ernie-45-vl, \n</think>\n\n for ernie-x1)
"FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),

# Timeout for cache_transfer_manager process exit
Expand Down
4 changes: 2 additions & 2 deletions docs/zh/best_practices/ERNIE-4.5-21B-A3B-Thinking.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ python -m fastdeploy.entrypoints.openai.api_server \
--tensor-parallel-size 1 \
--max-model-len 131072 \
--quantization wint8 \
--reasoning-parser ernie_x1 \
--tool-call-parser ernie_x1 \
--reasoning-parser ernie-x1 \
--tool-call-parser ernie-x1 \
--max-num-seqs 32
```
其中:
Expand Down
2 changes: 1 addition & 1 deletion docs/zh/usage/environment_variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# 是否使用 Machete 后端的 wint4 GEMM.
"FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"),

# Used to truncate the string inserted during thinking when reasoning in a model. (</think> for ernie4_5_vl, \n</think>\n\n for ernie_x1)
# String inserted to cut off the thinking content when its length limit is reached during reasoning. (</think> for ernie-45-vl, \n</think>\n\n for ernie-x1)
"FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),

# cache_transfer_manager 进程残留时退出等待超时时间
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ def get_tool_parser(cls, name) -> type:

Raise a KeyError exception if the name is not registered.
"""
name = name.replace("_", "-")
if name in cls.tool_parsers:
return cls.tool_parsers[name]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def random_tool_call_id() -> str:
from fastdeploy.utils import data_processor_logger


@ToolParserManager.register_module("ernie_45-vl-thinking")
@ToolParserManager.register_module("ernie-45-vl-thinking")
class Ernie45VLThinkingToolParser(ToolParser):
"""
Tool parser for Ernie model version 4.5.1.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def random_tool_call_id() -> str:
from fastdeploy.utils import data_processor_logger


@ToolParserManager.register_module("ernie_x1")
@ToolParserManager.register_module("ernie-x1")
class ErnieX1ToolParser(ToolParser):
"""
Tool parser for Ernie model version 4.5.1.
Expand Down
2 changes: 1 addition & 1 deletion fastdeploy/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@
"FD_ENABLE_SWAP_SPACE_CLEARING": lambda: int(os.getenv("FD_ENABLE_SWAP_SPACE_CLEARING", "0")),
# enable return text, used when FD_ENABLE_INTERNAL_ADAPTER=1
"FD_ENABLE_RETURN_TEXT": lambda: bool(int(os.getenv("FD_ENABLE_RETURN_TEXT", "0"))),
# Used to truncate the string inserted during thinking when reasoning in a model. (</think> for ernie4_5_vl, \n</think>\n\n for ernie_x1)
# String inserted to cut off the thinking content when its length limit is reached during reasoning. (</think> for ernie-45-vl, \n</think>\n\n for ernie-x1)
"FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),
# Timeout for cache_transfer_manager process exit
"FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")),
Expand Down
8 changes: 4 additions & 4 deletions fastdeploy/model_executor/pre_and_post_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def limit_thinking_content_length(
line_break_id: int = None,
):
if limit_strategy == "</think>":
# for ernie4_5_vl
# for ernie-45-vl
limit_thinking_content_length_v1(
sampled_token_ids,
max_think_lens,
Expand All @@ -110,7 +110,7 @@ def limit_thinking_content_length(
think_end_id,
)
elif limit_strategy == "\n</think>\n\n":
# for ernie_x1
# for ernie-x1
assert line_break_id > 0
limit_thinking_content_length_v2(
sampled_token_ids,
Expand All @@ -136,7 +136,7 @@ def speculate_limit_thinking_content_length(
line_break_id: int = None,
):
if limit_strategy == "</think>":
# for ernie4_5_vl
# for ernie-45-vl
speculate_limit_thinking_content_length_v1(
accept_tokens,
max_think_lens,
Expand All @@ -147,7 +147,7 @@ def speculate_limit_thinking_content_length(
think_end_id,
)
elif limit_strategy == "\n</think>\n\n":
# for ernie_x1
# for ernie-x1
assert line_break_id > 0
speculate_limit_thinking_content_length_v2(
accept_tokens,
Expand Down
1 change: 1 addition & 0 deletions fastdeploy/reasoning/abs_reasoning_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def get_reasoning_parser(cls, name: Optional[str]) -> type[ReasoningParser]:

Raise a KeyError exception if the name is not registered.
"""
name = name.replace("_", "-")
if name in cls.reasoning_parsers:
return cls.reasoning_parsers[name]

Expand Down
4 changes: 2 additions & 2 deletions fastdeploy/reasoning/ernie_x1_reasoning_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
from fastdeploy.reasoning import ReasoningParser, ReasoningParserManager


@ReasoningParserManager.register_module("ernie_x1")
@ReasoningParserManager.register_module("ernie-x1")
class ErnieX1ReasoningParser(ReasoningParser):
"""
Reasoning parser for ernie_x1 model with stricter boundary checking.
Reasoning parser for ernie-x1 model with stricter boundary checking.

Unified rules:
- Do not strip newline before </think>
Expand Down
4 changes: 2 additions & 2 deletions fastdeploy/worker/xpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def xpu_post_process(
step_idx = share_inputs["step_idx"]
limit_think_status = share_inputs["limit_think_status"]
if limit_strategy == "</think>":
# for ernie4_5_vl
# for ernie-45-vl
limit_thinking_content_length_v1(
sampled_token_ids,
max_think_lens,
Expand All @@ -212,7 +212,7 @@ def xpu_post_process(
think_end_id,
)
elif limit_strategy == "\n</think>\n\n":
# for ernie_x1
# for ernie-x1
assert line_break_id > 0
limit_thinking_content_length_v2(
sampled_token_ids,
Expand Down
8 changes: 4 additions & 4 deletions tests/entrypoints/openai/test_serving_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ def test_check_master_tp16_dp1_master(self):
self.assertTrue(serving_completion._check_master())

def test_calc_finish_reason_tool_calls(self):
# 创建一个模拟的engine_client,并设置reasoning_parser为"ernie_x1"
# 创建一个模拟的engine_client,并设置reasoning_parser为"ernie-x1"
engine_client = Mock()
engine_client.reasoning_parser = "ernie_x1"
engine_client.reasoning_parser = "ernie-x1"
# 创建一个OpenAIServingCompletion实例
serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360)
# 创建一个模拟的output,并设置finish_reason为"tool_call"
Expand All @@ -86,9 +86,9 @@ def test_calc_finish_reason_tool_calls(self):
assert result == "tool_calls"

def test_calc_finish_reason_stop(self):
# 创建一个模拟的engine_client,并设置reasoning_parser为"ernie_x1"
# 创建一个模拟的engine_client,并设置reasoning_parser为"ernie-x1"
engine_client = Mock()
engine_client.reasoning_parser = "ernie_x1"
engine_client.reasoning_parser = "ernie-x1"
# 创建一个OpenAIServingCompletion实例
serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360)
# 创建一个模拟的output,并设置finish_reason为其他值
Expand Down
2 changes: 1 addition & 1 deletion tests/reasoning/test_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def test_register_and_get_parser(self):
Test that a parser can be registered and retrieved successfully.
Verifies normal registration and retrieval functionality.
"""
ReasoningParserManager.register_module(module=TestReasoningParser, name="test_parser", force=True)
ReasoningParserManager.register_module(module=TestReasoningParser, name="test-parser", force=True)
parser_cls = ReasoningParserManager.get_reasoning_parser("test_parser")
self.assertIs(parser_cls, TestReasoningParser)

Expand Down
Loading