From 8ac7918601a4e9107ecacef93ab6603c6893c2a5 Mon Sep 17 00:00:00 2001 From: ckl117 Date: Wed, 26 Nov 2025 21:20:41 +0800 Subject: [PATCH 1/3] check fake input shape --- fastdeploy/model_executor/models/glm4_moe.py | 2 +- fastdeploy/model_executor/models/qwen3moe.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py index a095b7b0435..1100347807d 100644 --- a/fastdeploy/model_executor/models/glm4_moe.py +++ b/fastdeploy/model_executor/models/glm4_moe.py @@ -499,7 +499,7 @@ def empty_input_forward(self): empty_input_forward """ fake_hidden_states = paddle.ones( - shape=[1, self.fd_config.model_config.hidden_size], + shape=[0, self.fd_config.model_config.hidden_size], dtype=paddle.get_default_dtype(), ) for i in range( diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 3e9a72d7688..74adb5cc3b9 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -421,7 +421,7 @@ def empty_input_forward(self): empty_input_forward """ fake_hidden_states = paddle.empty( - shape=[1, self.fd_config.model_config.hidden_size], + shape=[0, self.fd_config.model_config.hidden_size], dtype=paddle.get_default_dtype(), ) for i in range( From 5d2ba1cb27e790ac4adc1eb73b203c38f0114fba Mon Sep 17 00:00:00 2001 From: ckl117 Date: Fri, 28 Nov 2025 22:05:52 +0800 Subject: [PATCH 2/3] support glm 4.6 --- custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu | 5 +++++ .../model_executor/layers/moe/fused_moe_cutlass_backend.py | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu b/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu index 369a92ee2eb..4416d204517 100644 --- a/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu +++ b/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu @@ -58,6 +58,11 @@ __VA_ARGS__ \ break; \ } \ + case 20: { \ + constexpr size_t NUM_EXPERTS_PER_RANK = 20; \ + __VA_ARGS__ \ + break; \ + } \ case 32: { \ constexpr size_t NUM_EXPERTS_PER_RANK = 32; \ __VA_ARGS__ \ diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index d87894b8105..4802a6aab48 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -146,8 +146,10 @@ def apply_ep_prefill( recv_topk_weights, recv_num_tokens_per_expert_list, handle, - _, + event, ) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights) + if self.ep_prefill_runner.ep_engine.async_finish: + event.current_stream_wait() token_all_num = sum(recv_num_tokens_per_expert_list) # 3. Compute ffn From 2054605a744bcf39b0f25c44a937fc84a38836b1 Mon Sep 17 00:00:00 2001 From: ckl117 Date: Mon, 1 Dec 2025 15:28:24 +0800 Subject: [PATCH 3/3] check --- fastdeploy/model_executor/models/glm4_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py index 1100347807d..9cd0c7003f3 100644 --- a/fastdeploy/model_executor/models/glm4_moe.py +++ b/fastdeploy/model_executor/models/glm4_moe.py @@ -498,7 +498,7 @@ def empty_input_forward(self): """ empty_input_forward """ - fake_hidden_states = paddle.ones( + fake_hidden_states = paddle.empty( shape=[0, self.fd_config.model_config.hidden_size], dtype=paddle.get_default_dtype(), )