Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions fastdeploy/engine/args_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@
from dataclasses import fields as dataclass_fields
from typing import Any, Dict, List, Optional, Union

import paddle

from fastdeploy import envs
from fastdeploy.config import (
CacheConfig,
Expand Down Expand Up @@ -1025,10 +1023,7 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig:

if self.max_num_batched_tokens is None:
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
if paddle.is_compiled_with_xpu():
self.max_num_batched_tokens = self.max_model_len
else:
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
else:
if self.enable_chunked_prefill:
self.max_num_batched_tokens = 2048
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
# limitations under the License.
"""

import os

import paddle
from paddle import nn

Expand Down Expand Up @@ -246,11 +244,8 @@ def apply_tp(
"""
if self.moe_quant_type in ["w16a16"]:
using_ep_moe_algo = False
elif self.moe_quant_type in ["w4a8"]:
using_ep_moe_algo = True
else:
using_ep_moe_algo = int(os.environ.get("USING_EP_MOE_ALGO", 0)) != 0
print(f"using_ep_moe_algo: {using_ep_moe_algo}")
using_ep_moe_algo = True

if using_ep_moe_algo:
fused_moe_out = self.apply_tp_scatter_op(layer, x, gate)
Expand Down
10 changes: 7 additions & 3 deletions tests/ci_use/XPU_45T/run_45T.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ def test_45t():
ip = "0.0.0.0"
service_http_port = "8188" # 服务配置的
client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
base_response = "你好!我是一个基于人工智能技术构建的助手,可以帮你解答问题、提供建议、辅助创作,或者陪你聊天解闷~😊 无论是学习、工作还是生活中的疑问,都可以随时告诉我哦!你今天有什么想聊的吗?"
base_response_110 = "你好!我是一个基于人工智能技术开发的助手,可以帮你解答问题、提供建议、聊天交流或者完成一些任务。无论是学习、工作还是生活中的疑问,都可以随时告诉我哦~😊 你有什么想聊的吗?"
base_response_104 = "你好!我是一个基于人工智能技术打造的助手,可以帮你解答问题、提供建议、分享知识,或者陪你聊聊天~😊 无论是学习、工作、生活还是娱乐相关的问题,都可以随时告诉我哦!你今天有什么想聊的吗?"
# 非流式对话
response = client.chat.completions.create(
model="default",
Expand All @@ -32,8 +33,11 @@ def test_45t():
stream=False,
)
print(response.choices[0].message.content)
print(base_response)
assert response.choices[0].message.content == base_response
# print(base_response)
assert (
response.choices[0].message.content == base_response_110
or response.choices[0].message.content == base_response_104
)


if __name__ == "__main__":
Expand Down
Loading