Merged
87 changes: 46 additions & 41 deletions fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
@@ -262,7 +262,7 @@ def apply(
 #     layer.top_k,
 #     False, # moe group, used in deepseek
 # )
-# if layer.tp_size > 1:
+# if layer.reduce_results and layer.tp_size > 1:
 #     from fastdeploy.distributed.communication import (
 #         tensor_model_parallel_all_reduce,
 #     )
@@ -271,48 +271,53 @@ def apply(
 
 # return fused_moe_out
 
-gate_out = paddle.matmul(x.cast("float32"), gate.weight.transpose([1, 0]), transpose_y=True)
-topk_idx, topk_weights = moe_topk_select(gate_out, layer.gate_correction_bias, layer.top_k, True)
-token_nums_per_expert_list = list(range(64))  # padding, used as a placeholder
-permute_input, permute_indices_per_token, token_num_lod, dst_weights, ffn1_act_scale_per_token = (
-    ep_moe_expert_dispatch(
-        x,
-        topk_idx,
-        topk_weights,
-        (layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
-        token_nums_per_expert_list,
-        x.shape[0] * layer.top_k,
-        self.moe_quant_type,
-    )
-)
-
-ffn_out = moe_expert_ffn(
-    permute_input,
-    token_num_lod,
-    layer.up_gate_proj_weight,
-    layer.down_proj_weight,
-    None,  # moe_ffn1_bias
-    None,  # moe_ffn2_bias
-    None,  # ffn1 in scale
-    None,  # ffn2 in scale
-    (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
-    (layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
-    None,  # moe_ffn2_shift
-    None,  # moe_ffn2_smooth
-    self.moe_quant_type,
-    -1,
-    x.shape[0] * layer.top_k,  # token_all_num
-)
-topk_weights_bf16 = topk_weights.astype("bfloat16")
-tmp_ffn_out = ep_moe_expert_combine(
-    ffn_out,
-    permute_indices_per_token,
-    topk_weights_bf16,
-    permute_indices_per_token.shape[0],
-    ffn_out.shape[0],
-    ffn_out.shape[1],
-    permute_indices_per_token.shape[1],
-)
+token_num = x.shape[0]
+if token_num > 0:
+    gate_out = paddle.matmul(x.cast("float32"), gate.weight.transpose([1, 0]), transpose_y=True)
+    topk_idx, topk_weights = moe_topk_select(gate_out, layer.gate_correction_bias, layer.top_k, True)
+    token_nums_per_expert_list = list(range(64))  # padding, used as a placeholder
+    permute_input, permute_indices_per_token, token_num_lod, dst_weights, ffn1_act_scale_per_token = (
+        ep_moe_expert_dispatch(
+            x,
+            topk_idx,
+            topk_weights,
+            (layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
+            token_nums_per_expert_list,
+            x.shape[0] * layer.top_k,
+            self.moe_quant_type,
+        )
+    )
+
+    ffn_out = moe_expert_ffn(
+        permute_input,
+        token_num_lod,
+        layer.up_gate_proj_weight,
+        layer.down_proj_weight,
+        None,  # moe_ffn1_bias
+        None,  # moe_ffn2_bias
+        None,  # ffn1 in scale
+        None,  # ffn2 in scale
+        (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
+        (layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
+        None,  # moe_ffn2_shift
+        None,  # moe_ffn2_smooth
+        self.moe_quant_type,
+        -1,
+        x.shape[0] * layer.top_k,  # token_all_num
+    )
+    topk_weights_bf16 = topk_weights.astype("bfloat16")
+    tmp_ffn_out = ep_moe_expert_combine(
+        ffn_out,
+        permute_indices_per_token,
+        topk_weights_bf16,
+        permute_indices_per_token.shape[0],
+        ffn_out.shape[0],
+        ffn_out.shape[1],
+        permute_indices_per_token.shape[1],
+    )
+else:
+    tmp_ffn_out = paddle.empty(x.shape, x.dtype)
 
-if layer.tp_size > 1:
+if layer.reduce_results and layer.tp_size > 1:
Collaborator:

What is the purpose of layer.reduce_results? Where is it defined?

Contributor (Author):

This is a parameter that was added when the GPU backend was adapted for the VL models. It defaults to True, so adding it to the condition does not affect text-only models. For VL models, reduce_results is False, because no all_reduce is needed after the MoE.
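To make that gating concrete, here is a minimal sketch (the function name is made up for illustration, and it assumes tensor_model_parallel_all_reduce returns the reduced tensor):

import paddle

def maybe_all_reduce(tmp_ffn_out: paddle.Tensor, layer) -> paddle.Tensor:
    # All-reduce the MoE output across tensor-parallel ranks only when the
    # layer requests it. VL models set reduce_results=False, so the
    # collective is skipped after the MoE even when tp_size > 1.
    if layer.reduce_results and layer.tp_size > 1:
        from fastdeploy.distributed.communication import (
            tensor_model_parallel_all_reduce,
        )

        # Assumption: the collective returns the reduced tensor.
        tmp_ffn_out = tensor_model_parallel_all_reduce(tmp_ffn_out)
    return tmp_ffn_out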

     from fastdeploy.distributed.communication import (
         tensor_model_parallel_all_reduce,
@@ -858,7 +863,7 @@ def apply(
     ffn_out.shape[1],
     permute_indices_per_token.shape[1],
 )
-if layer.tp_size > 1:
+if layer.reduce_results and layer.tp_size > 1:
     from fastdeploy.distributed.communication import (
         tensor_model_parallel_all_reduce,
     )
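For reference, the zero-token guard introduced in the second hunk reduces to the control flow below (a condensed sketch; guarded_moe_forward and moe_pipeline are illustrative names, not from the codebase):

import paddle

def guarded_moe_forward(x: paddle.Tensor, moe_pipeline) -> paddle.Tensor:
    # `moe_pipeline` stands in for the dispatch -> expert FFN -> combine
    # sequence shown in the diff above.
    if x.shape[0] > 0:
        out = moe_pipeline(x)
    else:
        # A zero-token input would otherwise reach the XPU kernels with
        # empty shapes; return an empty tensor of matching shape and dtype
        # so downstream ops (e.g. the optional all_reduce) still compose.
        out = paddle.empty(x.shape, x.dtype)
    return out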