diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
index a7ac1845a21..4636d82ddff 100644
--- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
+++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
@@ -262,7 +262,7 @@ def apply(
         #     layer.top_k,
         #     False,  # moe group, used in deepseek
         # )
-        # if layer.tp_size > 1:
+        # if layer.reduce_results and layer.tp_size > 1:
         #     from fastdeploy.distributed.communication import (
         #         tensor_model_parallel_all_reduce,
         #     )
@@ -271,48 +271,53 @@ def apply(
 
         #     return fused_moe_out
 
-        gate_out = paddle.matmul(x.cast("float32"), gate.weight.transpose([1, 0]), transpose_y=True)
-        topk_idx, topk_weights = moe_topk_select(gate_out, layer.gate_correction_bias, layer.top_k, True)
-        token_nums_per_expert_list = list(range(64))  # 填充做占位符
-        permute_input, permute_indices_per_token, token_num_lod, dst_weights, ffn1_act_scale_per_token = (
-            ep_moe_expert_dispatch(
-                x,
-                topk_idx,
-                topk_weights,
-                (layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
-                token_nums_per_expert_list,
-                x.shape[0] * layer.top_k,
+        token_num = x.shape[0]
+        if token_num > 0:
+            gate_out = paddle.matmul(x.cast("float32"), gate.weight.transpose([1, 0]), transpose_y=True)
+            topk_idx, topk_weights = moe_topk_select(gate_out, layer.gate_correction_bias, layer.top_k, True)
+            token_nums_per_expert_list = list(range(64))  # padding placeholder
+            permute_input, permute_indices_per_token, token_num_lod, dst_weights, ffn1_act_scale_per_token = (
+                ep_moe_expert_dispatch(
+                    x,
+                    topk_idx,
+                    topk_weights,
+                    (layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
+                    token_nums_per_expert_list,
+                    x.shape[0] * layer.top_k,
+                    self.moe_quant_type,
+                )
+            )
+
+            ffn_out = moe_expert_ffn(
+                permute_input,
+                token_num_lod,
+                layer.up_gate_proj_weight,
+                layer.down_proj_weight,
+                None,  # moe_ffn1_bias
+                None,  # moe_ffn2_bias
+                None,  # ffn1 in scale
+                None,  # ffn2 in scale
+                (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
+                (layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
+                None,  # moe_ffn2_shift
+                None,  # moe_ffn2_smooth
                 self.moe_quant_type,
+                -1,
+                x.shape[0] * layer.top_k,  # token_all_num
             )
-        )
+            topk_weights_bf16 = topk_weights.astype("bfloat16")
+            tmp_ffn_out = ep_moe_expert_combine(
+                ffn_out,
+                permute_indices_per_token,
+                topk_weights_bf16,
+                permute_indices_per_token.shape[0],
+                ffn_out.shape[0],
+                ffn_out.shape[1],
+                permute_indices_per_token.shape[1],
+            )
+        else:
+            tmp_ffn_out = paddle.empty(x.shape, x.dtype)
 
-        ffn_out = moe_expert_ffn(
-            permute_input,
-            token_num_lod,
-            layer.up_gate_proj_weight,
-            layer.down_proj_weight,
-            None,  # moe_ffn1_bias
-            None,  # moe_ffn2_bias
-            None,  # ffn1 in scale
-            None,  # ffn2 in scale
-            (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
-            (layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
-            None,  # moe_ffn2_shift
-            None,  # moe_ffn2_smooth
-            self.moe_quant_type,
-            -1,
-            x.shape[0] * layer.top_k,  # token_all_num
-        )
-        topk_weights_bf16 = topk_weights.astype("bfloat16")
-        tmp_ffn_out = ep_moe_expert_combine(
-            ffn_out,
-            permute_indices_per_token,
-            topk_weights_bf16,
-            permute_indices_per_token.shape[0],
-            ffn_out.shape[0],
-            ffn_out.shape[1],
-            permute_indices_per_token.shape[1],
-        )
         if layer.reduce_results and layer.tp_size > 1:
             from fastdeploy.distributed.communication import (
                 tensor_model_parallel_all_reduce,
@@ -858,7 +863,7 @@ def apply(
                 ffn_out.shape[1],
                 permute_indices_per_token.shape[1],
             )
-        if layer.tp_size > 1:
+        if layer.reduce_results and layer.tp_size > 1:
             from fastdeploy.distributed.communication import (
                 tensor_model_parallel_all_reduce,
             )