From cd22a30e3b19c9218ac303691ca5d717b28132da Mon Sep 17 00:00:00 2001
From: yyssys
Date: Fri, 24 Oct 2025 10:07:11 +0000
Subject: [PATCH 1/3] [XPU] MoE uses a new operator

---
 fastdeploy/engine/args_utils.py                          | 9 +++------
 .../model_executor/layers/backends/xpu/moe/fused_moe.py  | 7 +------
 tests/ci_use/XPU_45T/run_45T.py                          | 2 +-
 3 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 176683c9861..ee1d05b3d9a 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -20,8 +20,6 @@
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional, Union
 
-import paddle
-
 from fastdeploy import envs
 from fastdeploy.config import (
     CacheConfig,
@@ -422,6 +420,8 @@ def __post_init__(self):
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
         if self.guided_decoding_backend != "off":
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
+        if current_platform.is_xpu():
+            self.enable_chunked_prefill = True
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
@@ -1025,10 +1025,7 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig:
 
         if self.max_num_batched_tokens is None:
             if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
-                if paddle.is_compiled_with_xpu():
-                    self.max_num_batched_tokens = self.max_model_len
-                else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
                 if self.enable_chunked_prefill:
                     self.max_num_batched_tokens = 2048
diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
index e95e1b1a55f..9407849f588 100644
--- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
+++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
@@ -14,8 +14,6 @@
 # limitations under the License.
 """
 
-import os
-
 import paddle
 from paddle import nn
 
@@ -246,11 +244,8 @@ def apply_tp(
         """
         if self.moe_quant_type in ["w16a16"]:
            using_ep_moe_algo = False
-        elif self.moe_quant_type in ["w4a8"]:
-            using_ep_moe_algo = True
         else:
-            using_ep_moe_algo = int(os.environ.get("USING_EP_MOE_ALGO", 0)) != 0
-            print(f"using_ep_moe_algo: {using_ep_moe_algo}")
+            using_ep_moe_algo = True
 
         if using_ep_moe_algo:
             fused_moe_out = self.apply_tp_scatter_op(layer, x, gate)
diff --git a/tests/ci_use/XPU_45T/run_45T.py b/tests/ci_use/XPU_45T/run_45T.py
index d64452bd827..cee6957de23 100644
--- a/tests/ci_use/XPU_45T/run_45T.py
+++ b/tests/ci_use/XPU_45T/run_45T.py
@@ -19,7 +19,7 @@ def test_45t():
     ip = "0.0.0.0"
     service_http_port = "8188"  # 服务配置的
     client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
-    base_response = "你好!我是一个基于人工智能技术构建的助手,可以帮你解答问题、提供建议、辅助创作,或者陪你聊天解闷~😊 无论是学习、工作还是生活中的疑问,都可以随时告诉我哦!你今天有什么想聊的吗?"
+    base_response = "你好!我是一个基于人工智能技术打造的助手,可以帮你解答问题、提供建议、分享知识,或者陪你聊聊天~😊 无论是学习、工作、生活还是娱乐相关的问题,都可以随时告诉我哦!你今天有什么想聊的吗?"
     # 非流式对话
     response = client.chat.completions.create(
         model="default",

From fc8f17e50819e1689336ea76c01e1a4c70e2b87f Mon Sep 17 00:00:00 2001
From: yyssys
Date: Fri, 24 Oct 2025 10:25:24 +0000
Subject: [PATCH 2/3] [XPU] MoE uses a new operator

---
 fastdeploy/engine/args_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index ee1d05b3d9a..ee2807fdc9c 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -420,8 +420,6 @@ def __post_init__(self):
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
         if self.guided_decoding_backend != "off":
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
-        if current_platform.is_xpu():
-            self.enable_chunked_prefill = True
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:

From 2244c8ce65410cd689187b7c0cc1b095053c26a5 Mon Sep 17 00:00:00 2001
From: yyssys
Date: Fri, 24 Oct 2025 11:06:49 +0000
Subject: [PATCH 3/3] Update expected test response

---
 tests/ci_use/XPU_45T/run_45T.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/ci_use/XPU_45T/run_45T.py b/tests/ci_use/XPU_45T/run_45T.py
index cee6957de23..3a0f1048600 100644
--- a/tests/ci_use/XPU_45T/run_45T.py
+++ b/tests/ci_use/XPU_45T/run_45T.py
@@ -19,7 +19,8 @@ def test_45t():
     ip = "0.0.0.0"
     service_http_port = "8188"  # 服务配置的
     client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
-    base_response = "你好!我是一个基于人工智能技术打造的助手,可以帮你解答问题、提供建议、分享知识,或者陪你聊聊天~😊 无论是学习、工作、生活还是娱乐相关的问题,都可以随时告诉我哦!你今天有什么想聊的吗?"
+    base_response_110 = "你好!我是一个基于人工智能技术开发的助手,可以帮你解答问题、提供建议、聊天交流或者完成一些任务。无论是学习、工作还是生活中的疑问,都可以随时告诉我哦~😊 你有什么想聊的吗?"
+    base_response_104 = "你好!我是一个基于人工智能技术打造的助手,可以帮你解答问题、提供建议、分享知识,或者陪你聊聊天~😊 无论是学习、工作、生活还是娱乐相关的问题,都可以随时告诉我哦!你今天有什么想聊的吗?"
     # 非流式对话
     response = client.chat.completions.create(
         model="default",
@@ -32,8 +33,11 @@ def test_45t():
         stream=False,
     )
     print(response.choices[0].message.content)
-    print(base_response)
-    assert response.choices[0].message.content == base_response
+    # print(base_response)
+    assert (
+        response.choices[0].message.content == base_response_110
+        or response.choices[0].message.content == base_response_104
+    )
 
 
 if __name__ == "__main__":
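A note on the first patch: after it, apply_tp in the XPU fused MoE backend routes
every quantized type through the EP MoE algorithm; only w16a16 keeps the dense
tensor-parallel path, and both the USING_EP_MOE_ALGO environment switch and the
w4a8 special case are gone. A minimal sketch of that selection rule, assuming a
standalone helper rather than the real class method (w8a8 is an illustrative
extra, not taken from the diff):

def uses_ep_moe_algo(moe_quant_type: str) -> bool:
    """Post-patch rule: only w16a16 stays on the dense TP path."""
    # Every other quant type (e.g. w4a8) takes the EP-style scatter path,
    # i.e. apply_tp_scatter_op in the real layer.
    return moe_quant_type not in ["w16a16"]

if __name__ == "__main__":
    for quant in ["w16a16", "w4a8", "w8a8"]:  # w8a8 is hypothetical here
        path = "EP scatter" if uses_ep_moe_algo(quant) else "dense TP"
        print(f"{quant}: {path}")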
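A note on the third patch: the CI check now accepts either of two known-good
completions instead of one exact string, since the served model can return
slightly different phrasings. A tiny sketch of the same pattern with
hypothetical placeholder strings (the real test ors two equality checks;
membership in a set of golden outputs is an equivalent idiom):

golden_outputs = {"golden response A", "golden response B"}  # placeholders
actual = "golden response A"  # stand-in for response.choices[0].message.content
assert actual in golden_outputs, "output matched no known-good response"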