diff --git a/benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml b/benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml index 30a50170bd2..a5bb750ba90 100644 --- a/benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml +++ b/benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml @@ -1,5 +1,5 @@ -reasoning-parser: ernie_x1 -tool_call_parser: ernie_x1 +reasoning-parser: ernie-x1 +tool_call_parser: ernie-x1 tensor_parallel_size: 4 max_model_len: 65536 max_num_seqs: 128 diff --git a/benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml b/benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml index 09236610af3..4476a55a9fd 100644 --- a/benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml +++ b/benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml @@ -1,7 +1,7 @@ tensor_parallel_size: 1 max_model_len: 131072 max_num_seqs: 32 -reasoning_parser: ernie_x1 -tool_call_parser: ernie_x1 +reasoning_parser: ernie-x1 +tool_call_parser: ernie-x1 load_choices: "default_v1" quantization: wint8 diff --git a/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md b/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md index 05328ff08f7..a67be76fef3 100644 --- a/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md +++ b/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md @@ -33,8 +33,8 @@ python -m fastdeploy.entrypoints.openai.api_server \ --tensor-parallel-size 1 \ --max-model-len 131072 \ --quantization wint8 \ - --reasoning-parser ernie_x1 \ - --tool-call-parser ernie_x1 \ + --reasoning-parser ernie-x1 \ + --tool-call-parser ernie-x1 \ --max-num-seqs 32 ``` - `--quantization`: Indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8`(Hopper is needed). 
diff --git a/docs/usage/environment_variables.md b/docs/usage/environment_variables.md index 378a80a7b61..c4c319f83aa 100644 --- a/docs/usage/environment_variables.md +++ b/docs/usage/environment_variables.md @@ -80,7 +80,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # Whether to use Machete for wint4 dense GEMM. "FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"), - # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie4_5_vl, \n\n\n for ernie_x1) + # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie-45-vl, \n\n\n for ernie-x1) "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), # Timeout for cache_transfer_manager process exit diff --git a/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Thinking.md b/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Thinking.md index 0dc0db52771..c2648ceb337 100644 --- a/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Thinking.md +++ b/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Thinking.md @@ -33,8 +33,8 @@ python -m fastdeploy.entrypoints.openai.api_server \ --tensor-parallel-size 1 \ --max-model-len 131072 \ --quantization wint8 \ - --reasoning-parser ernie_x1 \ - --tool-call-parser ernie_x1 \ + --reasoning-parser ernie-x1 \ + --tool-call-parser ernie-x1 \ --max-num-seqs 32 ``` 其中: diff --git a/docs/zh/usage/environment_variables.md b/docs/zh/usage/environment_variables.md index f778735eebd..b0a162a8aa8 100644 --- a/docs/zh/usage/environment_variables.md +++ b/docs/zh/usage/environment_variables.md @@ -80,7 +80,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # 是否使用 Machete 后端的 wint4 GEMM. "FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"), - # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie4_5_vl, \n\n\n for ernie_x1) + # Used to truncate the string inserted during thinking when reasoning in a model. 
( for ernie-45-vl, \n\n\n for ernie-x1) "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), # cache_transfer_manager 进程残留时退出等待超时时间 diff --git a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py index d6ac8f81aae..906483f445a 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -95,6 +95,7 @@ def get_tool_parser(cls, name) -> type: Raise a KeyError exception if the name is not registered. """ + name = name.replace("_", "-") if name else name if name in cls.tool_parsers: return cls.tool_parsers[name] diff --git a/fastdeploy/entrypoints/openai/tool_parsers/ernie_45_vl_thinking_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/ernie_45_vl_thinking_tool_parser.py index 131c17e6abc..1cb8c0ab71a 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/ernie_45_vl_thinking_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/ernie_45_vl_thinking_tool_parser.py @@ -44,7 +44,7 @@ def random_tool_call_id() -> str: from fastdeploy.utils import data_processor_logger -@ToolParserManager.register_module("ernie_45-vl-thinking") +@ToolParserManager.register_module("ernie-45-vl-thinking") class Ernie45VLThinkingToolParser(ToolParser): """ Tool parser for Ernie model version 4.5.1. 
diff --git a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py index 14a784f174e..8a14abee875 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py @@ -44,7 +44,7 @@ def random_tool_call_id() -> str: from fastdeploy.utils import data_processor_logger -@ToolParserManager.register_module("ernie_x1") +@ToolParserManager.register_module("ernie-x1") class ErnieX1ToolParser(ToolParser): """ Tool parser for Ernie model version 4.5.1. diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 05b042d7a91..d60750d6a9e 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -122,7 +122,7 @@ "FD_ENABLE_SWAP_SPACE_CLEARING": lambda: int(os.getenv("FD_ENABLE_SWAP_SPACE_CLEARING", "0")), # enable return text, used when FD_ENABLE_INTERNAL_ADAPTER=1 "FD_ENABLE_RETURN_TEXT": lambda: bool(int(os.getenv("FD_ENABLE_RETURN_TEXT", "0"))), - # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie4_5_vl, \n\n\n for ernie_x1) + # Used to truncate the string inserted during thinking when reasoning in a model. 
( for ernie-45-vl, \n\n\n for ernie-x1) "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), # Timeout for cache_transfer_manager process exit "FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")), diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index bddb12b496b..bcbd25dbf15 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -101,7 +101,7 @@ def limit_thinking_content_length( line_break_id: int = None, ): if limit_strategy == "": - # for ernie4_5_vl + # for ernie-45-vl limit_thinking_content_length_v1( sampled_token_ids, max_think_lens, @@ -110,7 +110,7 @@ def limit_thinking_content_length( think_end_id, ) elif limit_strategy == "\n\n\n": - # for ernie_x1 + # for ernie-x1 assert line_break_id > 0 limit_thinking_content_length_v2( sampled_token_ids, @@ -136,7 +136,7 @@ def speculate_limit_thinking_content_length( line_break_id: int = None, ): if limit_strategy == "": - # for ernie4_5_vl + # for ernie-45-vl speculate_limit_thinking_content_length_v1( accept_tokens, max_think_lens, @@ -147,7 +147,7 @@ def speculate_limit_thinking_content_length( think_end_id, ) elif limit_strategy == "\n\n\n": - # for ernie_x1 + # for ernie-x1 assert line_break_id > 0 speculate_limit_thinking_content_length_v2( accept_tokens, diff --git a/fastdeploy/reasoning/abs_reasoning_parsers.py b/fastdeploy/reasoning/abs_reasoning_parsers.py index 50e01e5a9f6..0f3e6e3183b 100644 --- a/fastdeploy/reasoning/abs_reasoning_parsers.py +++ b/fastdeploy/reasoning/abs_reasoning_parsers.py @@ -125,6 +125,7 @@ def get_reasoning_parser(cls, name: Optional[str]) -> type[ReasoningParser]: Raise a KeyError exception if the name is not registered. 
""" + name = name.replace("_", "-") if name else name if name in cls.reasoning_parsers: return cls.reasoning_parsers[name] diff --git a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py index 54b72a0eb5e..77fc1d5ada8 100644 --- a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py @@ -5,10 +5,10 @@ from fastdeploy.reasoning import ReasoningParser, ReasoningParserManager -@ReasoningParserManager.register_module("ernie_x1") +@ReasoningParserManager.register_module("ernie-x1") class ErnieX1ReasoningParser(ReasoningParser): """ - Reasoning parser for ernie_x1 model with stricter boundary checking. + Reasoning parser for ernie-x1 model with stricter boundary checking. Unified rules: - Do not strip newline before diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index f88a1147792..73538b28520 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -203,7 +203,7 @@ def xpu_post_process( step_idx = share_inputs["step_idx"] limit_think_status = share_inputs["limit_think_status"] if limit_strategy == "": - # for ernie4_5_vl + # for ernie-45-vl limit_thinking_content_length_v1( sampled_token_ids, max_think_lens, @@ -212,7 +212,7 @@ def xpu_post_process( think_end_id, ) elif limit_strategy == "\n\n\n": - # for ernie_x1 + # for ernie-x1 assert line_break_id > 0 limit_thinking_content_length_v2( sampled_token_ids, diff --git a/tests/entrypoints/openai/test_serving_completion.py b/tests/entrypoints/openai/test_serving_completion.py index d48ce4d5b74..c2a36d18555 100644 --- a/tests/entrypoints/openai/test_serving_completion.py +++ b/tests/entrypoints/openai/test_serving_completion.py @@ -73,9 +73,9 @@ def test_check_master_tp16_dp1_master(self): self.assertTrue(serving_completion._check_master()) def test_calc_finish_reason_tool_calls(self): - # 创建一个模拟的engine_client,并设置reasoning_parser为"ernie_x1" + # 
创建一个模拟的engine_client,并设置reasoning_parser为"ernie-x1" engine_client = Mock() - engine_client.reasoning_parser = "ernie_x1" + engine_client.reasoning_parser = "ernie-x1" # 创建一个OpenAIServingCompletion实例 serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360) # 创建一个模拟的output,并设置finish_reason为"tool_call" @@ -86,9 +86,9 @@ def test_calc_finish_reason_tool_calls(self): assert result == "tool_calls" def test_calc_finish_reason_stop(self): - # 创建一个模拟的engine_client,并设置reasoning_parser为"ernie_x1" + # 创建一个模拟的engine_client,并设置reasoning_parser为"ernie-x1" engine_client = Mock() - engine_client.reasoning_parser = "ernie_x1" + engine_client.reasoning_parser = "ernie-x1" # 创建一个OpenAIServingCompletion实例 serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360) # 创建一个模拟的output,并设置finish_reason为其他值 diff --git a/tests/reasoning/test_reasoning_parser.py b/tests/reasoning/test_reasoning_parser.py index 9e06523b026..26a7457db9a 100644 --- a/tests/reasoning/test_reasoning_parser.py +++ b/tests/reasoning/test_reasoning_parser.py @@ -91,7 +91,7 @@ def test_register_and_get_parser(self): Test that a parser can be registered and retrieved successfully. Verifies normal registration and retrieval functionality. """ - ReasoningParserManager.register_module(module=TestReasoningParser, name="test_parser", force=True) + ReasoningParserManager.register_module(module=TestReasoningParser, name="test-parser", force=True) parser_cls = ReasoningParserManager.get_reasoning_parser("test_parser") self.assertIs(parser_cls, TestReasoningParser)