From cd22a30e3b19c9218ac303691ca5d717b28132da Mon Sep 17 00:00:00 2001
From: yyssys
Date: Fri, 24 Oct 2025 10:07:11 +0000
Subject: [PATCH 1/3] [XPU] MoE uses a new operator

---
 fastdeploy/engine/args_utils.py                          | 9 +++------
 .../model_executor/layers/backends/xpu/moe/fused_moe.py  | 7 +------
 tests/ci_use/XPU_45T/run_45T.py                          | 2 +-
 3 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 176683c9861..ee1d05b3d9a 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -20,8 +20,6 @@
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional, Union
 
-import paddle
-
 from fastdeploy import envs
 from fastdeploy.config import (
     CacheConfig,
@@ -422,6 +420,8 @@ def __post_init__(self):
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
         if self.guided_decoding_backend != "off":
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
+        if current_platform.is_xpu():
+            self.enable_chunked_prefill = True
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
@@ -1025,10 +1025,7 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig:
 
         if self.max_num_batched_tokens is None:
             if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
-                if paddle.is_compiled_with_xpu():
-                    self.max_num_batched_tokens = self.max_model_len
-                else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
                 if self.enable_chunked_prefill:
                     self.max_num_batched_tokens = 2048
diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
index e95e1b1a55f..9407849f588 100644
--- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
+++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
@@ -14,8 +14,6 @@
 # limitations under the License.
 """
 
-import os
-
 import paddle
 from paddle import nn
 
@@ -246,11 +244,8 @@ def apply_tp(
         """
         if self.moe_quant_type in ["w16a16"]:
            using_ep_moe_algo = False
-        elif self.moe_quant_type in ["w4a8"]:
-            using_ep_moe_algo = True
         else:
-            using_ep_moe_algo = int(os.environ.get("USING_EP_MOE_ALGO", 0)) != 0
-            print(f"using_ep_moe_algo: {using_ep_moe_algo}")
+            using_ep_moe_algo = True
 
         if using_ep_moe_algo:
             fused_moe_out = self.apply_tp_scatter_op(layer, x, gate)
diff --git a/tests/ci_use/XPU_45T/run_45T.py b/tests/ci_use/XPU_45T/run_45T.py
index d64452bd827..cee6957de23 100644
--- a/tests/ci_use/XPU_45T/run_45T.py
+++ b/tests/ci_use/XPU_45T/run_45T.py
@@ -19,7 +19,7 @@ def test_45t():
     ip = "0.0.0.0"
     service_http_port = "8188"  # 服务配置的
     client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
-    base_response = "你好!我是一个基于人工智能技术构建的助手,可以帮你解答问题、提供建议、辅助创作,或者陪你聊天解闷~😊 无论是学习、工作还是生活中的疑问,都可以随时告诉我哦!你今天有什么想聊的吗?"
+    base_response = "你好!我是一个基于人工智能技术打造的助手,可以帮你解答问题、提供建议、分享知识,或者陪你聊聊天~😊 无论是学习、工作、生活还是娱乐相关的问题,都可以随时告诉我哦!你今天有什么想聊的吗?"
     # 非流式对话
     response = client.chat.completions.create(
         model="default",

From fc8f17e50819e1689336ea76c01e1a4c70e2b87f Mon Sep 17 00:00:00 2001
From: yyssys
Date: Fri, 24 Oct 2025 10:25:24 +0000
Subject: [PATCH 2/3] [XPU] MoE uses a new operator

---
 fastdeploy/engine/args_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index ee1d05b3d9a..ee2807fdc9c 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -420,8 +420,6 @@ def __post_init__(self):
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
         if self.guided_decoding_backend != "off":
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
-        if current_platform.is_xpu():
-            self.enable_chunked_prefill = True
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:

From 2244c8ce65410cd689187b7c0cc1b095053c26a5 Mon Sep 17 00:00:00 2001
From: yyssys
Date: Fri, 24 Oct 2025 11:06:49 +0000
Subject: [PATCH 3/3] Update expected test response

---
 tests/ci_use/XPU_45T/run_45T.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/ci_use/XPU_45T/run_45T.py b/tests/ci_use/XPU_45T/run_45T.py
index cee6957de23..3a0f1048600 100644
--- a/tests/ci_use/XPU_45T/run_45T.py
+++ b/tests/ci_use/XPU_45T/run_45T.py
@@ -19,7 +19,8 @@ def test_45t():
     ip = "0.0.0.0"
     service_http_port = "8188"  # 服务配置的
     client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
-    base_response = "你好!我是一个基于人工智能技术打造的助手,可以帮你解答问题、提供建议、分享知识,或者陪你聊聊天~😊 无论是学习、工作、生活还是娱乐相关的问题,都可以随时告诉我哦!你今天有什么想聊的吗?"
+    base_response_110 = "你好!我是一个基于人工智能技术开发的助手,可以帮你解答问题、提供建议、聊天交流或者完成一些任务。无论是学习、工作还是生活中的疑问,都可以随时告诉我哦~😊 你有什么想聊的吗?"
+    base_response_104 = "你好!我是一个基于人工智能技术打造的助手,可以帮你解答问题、提供建议、分享知识,或者陪你聊聊天~😊 无论是学习、工作、生活还是娱乐相关的问题,都可以随时告诉我哦!你今天有什么想聊的吗?"
     # 非流式对话
     response = client.chat.completions.create(
         model="default",
@@ -32,8 +33,11 @@ def test_45t():
         stream=False,
     )
     print(response.choices[0].message.content)
-    print(base_response)
-    assert response.choices[0].message.content == base_response
+    # print(base_response)
+    assert (
+        response.choices[0].message.content == base_response_110
+        or response.choices[0].message.content == base_response_104
+    )
 
 
 if __name__ == "__main__":
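A note on the first patch: after it, apply_tp in the XPU fused MoE backend routes
every quantized type through the EP MoE algorithm; only w16a16 keeps the dense
tensor-parallel path, and both the USING_EP_MOE_ALGO environment switch and the
w4a8 special case are gone. A minimal sketch of that selection rule, assuming a
standalone helper rather than the real class method (w8a8 is an illustrative
extra, not taken from the diff):

def uses_ep_moe_algo(moe_quant_type: str) -> bool:
    """Post-patch rule: only w16a16 stays on the dense TP path."""
    # Every other quant type (e.g. w4a8) takes the EP-style scatter path,
    # i.e. apply_tp_scatter_op in the real layer.
    return moe_quant_type not in ["w16a16"]

if __name__ == "__main__":
    for quant in ["w16a16", "w4a8", "w8a8"]:  # w8a8 is hypothetical here
        path = "EP scatter" if uses_ep_moe_algo(quant) else "dense TP"
        print(f"{quant}: {path}")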
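A note on the third patch: the CI check now accepts either of two known-good
completions instead of one exact string, since the served model can return
slightly different phrasings. A tiny sketch of the same pattern with
hypothetical placeholder strings (the real test ors two equality checks;
membership in a set of golden outputs is an equivalent idiom):

golden_outputs = {"golden response A", "golden response B"}  # placeholders
actual = "golden response A"  # stand-in for response.choices[0].message.content
assert actual in golden_outputs, "output matched no known-good response"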