diff --git a/fastdeploy/model_executor/layers/moe/ep.py b/fastdeploy/model_executor/layers/moe/ep.py
index 910c5dd87f6..eb5742d98f7 100644
--- a/fastdeploy/model_executor/layers/moe/ep.py
+++ b/fastdeploy/model_executor/layers/moe/ep.py
@@ -435,6 +435,7 @@ def dispatch(
         x: paddle.Tensor,
         topk_idx: paddle.Tensor,
         topk_weights: paddle.Tensor,
+        expert_alignment: int = 1,
         *args,
         **kwargs,
     ):
@@ -461,6 +462,7 @@ def dispatch(
             "async_finish": self.ep_engine.async_finish,
             "topk_idx": topk_idx,
             "topk_weights": topk_weights,
+            "expert_alignment": expert_alignment,
         }

         return buffer.dispatch(**dispatch_args)
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
index 06cc3294915..cd20d7eaf07 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
@@ -335,7 +335,9 @@ def apply_ep_prefill(
             recv_num_tokens_per_expert_list,
             handle,
             _,
-        ) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights, x_scale_tensor=x_scale_tensor)
+        ) = self.ep_prefill_runner.dispatch(
+            x, topk_idx, topk_weights, x_scale_tensor=x_scale_tensor, expert_alignment=128
+        )

         token_all_num = sum(recv_num_tokens_per_expert_list)

@@ -345,7 +347,6 @@ def apply_ep_prefill(
             (recv_x, recv_x_scale) = recv_x

             token_nums_this_rank = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)
-            token_nums_this_rank_padded = sum(token_nums_this_rank[1].numpy().tolist())

             (
                 permute_input,
@@ -365,7 +366,7 @@ def apply_ep_prefill(
                 token_nums_this_rank[0],
                 token_nums_this_rank[1],
                 True,  # use_in_ep
-                token_nums_this_rank_padded,
+                token_all_num,
             )
             permute_scale = permute_scale.transpose([1, 0]).contiguous()
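
Note (illustration, not part of the diff): the new expert_alignment argument is forwarded to DeepEP's Buffer.dispatch, which aligns the number of tokens received per local expert to a multiple of that value; 128 appears to match the per-group row alignment DeepGEMM's grouped GEMM expects. The sketch below uses hypothetical names (align_up, recv_counts) and only shows the round-up arithmetic an alignment of 128 implies; it is not FastDeploy or DeepEP code.

    # Illustration only: round per-expert token counts up to a multiple of 128,
    # mirroring the kind of padding expert_alignment=128 asks dispatch to apply.
    def align_up(n: int, alignment: int) -> int:
        """Round n up to the nearest multiple of alignment."""
        return ((n + alignment - 1) // alignment) * alignment

    # Hypothetical per-expert token counts after routing on one rank.
    recv_counts = [5, 130, 257]

    padded = [align_up(n, 128) for n in recv_counts]
    print(padded)       # [128, 256, 384]
    print(sum(padded))  # 768 -- the padded total the grouped GEMM is sized by

Presumably this is also why token_nums_this_rank_padded is dropped in favor of token_all_num: with alignment handled at dispatch time, the total reported by the runner already reflects the padded layout, so recomputing a padded sum on the receiver side would be redundant.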