diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 176683c9861..ee2807fdc9c 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -20,8 +20,6 @@
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional, Union
 
-import paddle
-
 from fastdeploy import envs
 from fastdeploy.config import (
     CacheConfig,
@@ -1025,10 +1023,7 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig:
 
         if self.max_num_batched_tokens is None:
             if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
-                if paddle.is_compiled_with_xpu():
-                    self.max_num_batched_tokens = self.max_model_len
-                else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                self.max_num_batched_tokens = 8192  # setting this to max_model_len can easily cause OOM
             else:
                 if self.enable_chunked_prefill:
                     self.max_num_batched_tokens = 2048
diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
index e95e1b1a55f..9407849f588 100644
--- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
+++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
@@ -14,8 +14,6 @@
 # limitations under the License.
 """
 
-import os
-
 import paddle
 from paddle import nn
 
@@ -246,11 +244,8 @@ def apply_tp(
         """
         if self.moe_quant_type in ["w16a16"]:
             using_ep_moe_algo = False
-        elif self.moe_quant_type in ["w4a8"]:
-            using_ep_moe_algo = True
         else:
-            using_ep_moe_algo = int(os.environ.get("USING_EP_MOE_ALGO", 0)) != 0
-            print(f"using_ep_moe_algo: {using_ep_moe_algo}")
+            using_ep_moe_algo = True
 
         if using_ep_moe_algo:
             fused_moe_out = self.apply_tp_scatter_op(layer, x, gate)
diff --git a/tests/ci_use/XPU_45T/run_45T.py b/tests/ci_use/XPU_45T/run_45T.py
index d64452bd827..3a0f1048600 100644
--- a/tests/ci_use/XPU_45T/run_45T.py
+++ b/tests/ci_use/XPU_45T/run_45T.py
@@ -19,7 +19,8 @@ def test_45t():
     ip = "0.0.0.0"
     service_http_port = "8188"  # the port configured for the service
     client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
-    base_response = "你好!我是一个基于人工智能技术构建的助手,可以帮你解答问题、提供建议、辅助创作,或者陪你聊天解闷~😊 无论是学习、工作还是生活中的疑问,都可以随时告诉我哦!你今天有什么想聊的吗?"
+    base_response_110 = "你好!我是一个基于人工智能技术开发的助手,可以帮你解答问题、提供建议、聊天交流或者完成一些任务。无论是学习、工作还是生活中的疑问,都可以随时告诉我哦~😊 你有什么想聊的吗?"
+    base_response_104 = "你好!我是一个基于人工智能技术打造的助手,可以帮你解答问题、提供建议、分享知识,或者陪你聊聊天~😊 无论是学习、工作、生活还是娱乐相关的问题,都可以随时告诉我哦!你今天有什么想聊的吗?"
     # non-streaming chat
     response = client.chat.completions.create(
         model="default",
@@ -32,8 +33,11 @@
         stream=False,
     )
     print(response.choices[0].message.content)
-    print(base_response)
-    assert response.choices[0].message.content == base_response
+    # print(base_response_110, base_response_104)
+    assert (
+        response.choices[0].message.content == base_response_110
+        or response.choices[0].message.content == base_response_104
+    )
 
 
 if __name__ == "__main__":
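
Note on the fused_moe.py hunk: after this patch, every quantization mode except w16a16 takes the EP MoE scatter path, and the USING_EP_MOE_ALGO environment toggle (along with its debug print) is gone. Below is a minimal sketch of the new selection rule, distilled from the hunk above; the helper name selects_ep_moe_algo is illustrative only and not part of the patch.

    def selects_ep_moe_algo(moe_quant_type: str) -> bool:
        # w16a16 keeps the plain tensor-parallel fused-MoE path;
        # every other quant type (e.g. w4a8) now uses the EP scatter algorithm.
        return moe_quant_type not in ["w16a16"]

    assert selects_ep_moe_algo("w4a8") is True
    assert selects_ep_moe_algo("w16a16") is False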