PaddlePaddle · EmmonsCurse · Oct 24, 2025 · Oct 24, 2025 · Oct 24, 2025 · Oct 24, 2025
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
@@ -20,8 +20,6 @@
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional, Union
 
-import paddle
-
 from fastdeploy import envs
 from fastdeploy.config import (
     CacheConfig,
@@ -1025,10 +1023,7 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig:
 
         if self.max_num_batched_tokens is None:
             if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
-                if paddle.is_compiled_with_xpu():
-                    self.max_num_batched_tokens = self.max_model_len
-                else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                self.max_num_batched_tokens = 8192  # defaulting to max_model_len can easily cause OOM
             else:
                 if self.enable_chunked_prefill:
                     self.max_num_batched_tokens = 2048

diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
@@ -14,8 +14,6 @@
 # limitations under the License.
 """
 
-import os
-
 import paddle
 from paddle import nn
 
@@ -246,11 +244,8 @@ def apply_tp(
         """
         if self.moe_quant_type in ["w16a16"]:
             using_ep_moe_algo = False
-        elif self.moe_quant_type in ["w4a8"]:
-            using_ep_moe_algo = True
         else:
-            using_ep_moe_algo = int(os.environ.get("USING_EP_MOE_ALGO", 0)) != 0
-            print(f"using_ep_moe_algo: {using_ep_moe_algo}")
+            using_ep_moe_algo = True
 
         if using_ep_moe_algo:
             fused_moe_out = self.apply_tp_scatter_op(layer, x, gate)

diff --git a/tests/ci_use/XPU_45T/run_45T.py b/tests/ci_use/XPU_45T/run_45T.py
@@ -19,7 +19,8 @@ def test_45t():
     ip = "0.0.0.0"
     service_http_port = "8188"  # 服务配置的
     client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
-    base_response = "你好！我是一个基于人工智能技术构建的助手，可以帮你解答问题、提供建议、辅助创作，或者陪你聊天解闷～😊 无论是学习、工作还是生活中的疑问，都可以随时告诉我哦！你今天有什么想聊的吗？"
+    base_response_110 = "你好！我是一个基于人工智能技术开发的助手，可以帮你解答问题、提供建议、聊天交流或者完成一些任务。无论是学习、工作还是生活中的疑问，都可以随时告诉我哦～😊 你有什么想聊的吗？"
+    base_response_104 = "你好！我是一个基于人工智能技术打造的助手，可以帮你解答问题、提供建议、分享知识，或者陪你聊聊天～😊 无论是学习、工作、生活还是娱乐相关的问题，都可以随时告诉我哦！你今天有什么想聊的吗？"
     # 非流式对话
     response = client.chat.completions.create(
         model="default",
@@ -32,8 +33,11 @@ def test_45t():
         stream=False,
     )
     print(response.choices[0].message.content)
-    print(base_response)
-    assert response.choices[0].message.content == base_response
+    # The reply must exactly match one of the two accepted reference responses.
+    assert (
+        response.choices[0].message.content == base_response_110
+        or response.choices[0].message.content == base_response_104
+    )
 
 
 if __name__ == "__main__":