From 121a34339971f9ea987524137f5182adc258a926 Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <17801055074@163.com>
Date: Mon, 17 Nov 2025 20:29:48 +0800
Subject: [PATCH 01/11] clean code

---
 tests/layers/test_fusedmoe.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/tests/layers/test_fusedmoe.py b/tests/layers/test_fusedmoe.py
index 6831c146929..50b57fe76e2 100644
--- a/tests/layers/test_fusedmoe.py
+++ b/tests/layers/test_fusedmoe.py
@@ -535,12 +535,12 @@ def __init__(
 class TestFusedMoE(unittest.TestCase):
     def setUp(self) -> None:
         self.architectures = ["Ernie4_5_MoeForCausalLM"]
-        self.hidden_size = 7168
-        self.moe_intermediate_size = 3584
-        self.moe_num_experts = 64
+        self.hidden_size = 4096
+        self.moe_intermediate_size = 2048
+        self.moe_num_experts = 160
         self.moe_k = 8
-        self.hidden_act = "silu"
-        self.num_attention_heads = 64
+        self.num_layers = 46
+        self.num_attention_heads = -1
         self.model_config = self.build_model_config()

     def build_model_config(self) -> ModelConfig:
@@ -559,7 +559,6 @@ def build_config_json(self) -> str:
             "moe_intermediate_size": self.moe_intermediate_size,
             "moe_num_experts": self.moe_num_experts,
             "moe_k": self.moe_k,
-            "hidden_act": self.hidden_act,
             "num_attention_heads": self.num_attention_heads,
             "dtype": "bfloat16",
         }
@@ -590,8 +589,8 @@ def test_fused_moe(self):
         # This line must be kept, otherwise uniformity is affected!
         paddle.seed(ep_rank + 100)

-        num_layers = 80
-        real_weight_layers = 20
+        num_layers = self.num_layers
+        real_weight_layers = num_layers // 2
         fused_moe = [None] * real_weight_layers
         for i in range(real_weight_layers):
             fused_moe[i] = FuseMoEWrapper(self.model_config, tp_size, tp_rank, ep_size, ep_rank, nnodes=nnodes)
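A note on the seed line kept by patch 01 (comment translated above): each expert-parallel rank seeds Paddle with ep_rank + 100 so that different ranks generate different random tokens; with one shared seed, every rank would route identical tokens to the same experts and the measured expert load would be artificially skewed. A minimal illustrative sketch under the test's sizes (160 experts, top-k 8), using random gate logits as a stand-in for the real router — the 8-rank loop and the 256-token batch are assumptions, not values from the test:

    import paddle

    num_experts, top_k = 160, 8  # moe_num_experts / moe_k from the test config
    for ep_rank in range(8):  # hypothetical EP world size
        paddle.seed(ep_rank + 100)  # distinct seed per rank, as in the test
        gate_logits = paddle.rand([256, num_experts])  # fake router scores, not the real gate
        _, expert_ids = paddle.topk(gate_logits, k=top_k)
        load = paddle.bincount(expert_ids.flatten(), minlength=num_experts)
        print(f"rank {ep_rank}: max per-expert load {int(load.max())}")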
From 01f69e7c67822f618f047db21f0db893c42f992e Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <17801055074@163.com>
Date: Mon, 17 Nov 2025 20:32:19 +0800
Subject: [PATCH 02/11] clean code

---
 tests/layers/test_ffn.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/tests/layers/test_ffn.py b/tests/layers/test_ffn.py
index 5bce5250efb..e527e1820a5 100644
--- a/tests/layers/test_ffn.py
+++ b/tests/layers/test_ffn.py
@@ -46,7 +46,7 @@ def __init__(self, model_config: ModelConfig):
         super().__init__()

         self.model_config = model_config
-        self.intermediate_size = 3584
+        self.intermediate_size = self.model_config.intermediate_size
         self.hidden_size = self.model_config.hidden_size
         self.prefix = "hahahha"
         self.fd_config = FDConfig(
@@ -94,10 +94,9 @@ def __init__(self, model_config: ModelConfig):
 class TestFusedMoE(unittest.TestCase):
     def setUp(self) -> None:
         self.architectures = ["Ernie4_5_MoeForCausalLM"]
-        self.hidden_size = 7168
-        self.moe_intermediate_size = 1
-        self.moe_num_experts = 1
-        self.moe_k = 1
+        self.hidden_size = 4096
+        self.intermediate_size = 1408
+        self.num_layers = 46
         self.hidden_act = "silu"
         self.num_attention_heads = 64
         self.model_config = self.build_model_config()
@@ -115,9 +114,7 @@ def build_config_json(self) -> str:
         config_dict = {
             "architectures": self.architectures,
             "hidden_size": self.hidden_size,
-            "moe_intermediate_size": self.moe_intermediate_size,
-            "moe_num_experts": self.moe_num_experts,
-            "moe_k": self.moe_k,
+            "intermediate_size": self.intermediate_size,
             "hidden_act": self.hidden_act,
             "num_attention_heads": self.num_attention_heads,
             "dtype": "bfloat16",
         }
@@ -137,18 +134,18 @@ def test_ffn(self):

         # (ZKK): disable this test,
         # CI machine does not support deepgemm blockwise_fp8, compilation error.
-        return
+        # return

         moe_cuda_graphs = [None] * 100
         cache_hidden_states = [None] * 100
-        for idx, num_tokens in enumerate([10, 20, 40, 60, 80, 100, 128, 160, 192, 256, 512, 1024, 2048, 4096]):
+        for idx, num_tokens in enumerate([4096, 4096*2, 4096*4]):
             cache_hidden_states[idx] = paddle.rand((num_tokens, self.model_config.hidden_size), dtype=paddle.bfloat16)

             moe_cuda_graphs[idx] = graphs.CUDAGraph()
             moe_cuda_graphs[idx].capture_begin()

-            num_layers = 80
+            num_layers = self.num_layers
             for _ in range(num_layers):
                 out = ffn.ffn(cache_hidden_states[idx])

From ddfcc0c2d08252546b6867c679f301246c981905 Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <17801055074@163.com>
Date: Tue, 18 Nov 2025 12:46:25 +0800
Subject: [PATCH 03/11] clean code

---
 tests/layers/test_ffn.py      |  4 ++--
 tests/layers/test_fusedmoe.py | 21 ++++++++++++++-------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/tests/layers/test_ffn.py b/tests/layers/test_ffn.py
index e527e1820a5..3606da9367d 100644
--- a/tests/layers/test_ffn.py
+++ b/tests/layers/test_ffn.py
@@ -95,7 +95,7 @@ class TestFusedMoE(unittest.TestCase):
     def setUp(self) -> None:
         self.architectures = ["Ernie4_5_MoeForCausalLM"]
         self.hidden_size = 4096
-        self.intermediate_size = 1408
+        self.intermediate_size = 2048
         self.num_layers = 46
         self.hidden_act = "silu"
         self.num_attention_heads = 64
         self.model_config = self.build_model_config()
@@ -138,7 +138,7 @@ def test_ffn(self):

         moe_cuda_graphs = [None] * 100
         cache_hidden_states = [None] * 100
-        for idx, num_tokens in enumerate([4096, 4096*2, 4096*4]):
+        for idx, num_tokens in enumerate([4096, 4096 * 2, 4096 * 4]):
             cache_hidden_states[idx] = paddle.rand((num_tokens, self.model_config.hidden_size), dtype=paddle.bfloat16)

diff --git a/tests/layers/test_fusedmoe.py b/tests/layers/test_fusedmoe.py
index 50b57fe76e2..4660de4811d 100644
--- a/tests/layers/test_fusedmoe.py
+++ b/tests/layers/test_fusedmoe.py
@@ -539,7 +539,7 @@ def setUp(self) -> None:
         self.moe_intermediate_size = 2048
         self.moe_num_experts = 160
         self.moe_k = 8
-        self.num_layers = 46
+        self.num_layers = 1
         self.num_attention_heads = -1
         self.model_config = self.build_model_config()

@@ -597,8 +597,10 @@ def test_fused_moe(self):

         moe_cuda_graphs = [None] * 100
         cache_hidden_states = [None] * 100
-        test_token_nums = [10, 20, 40, 60, 80, 100, 128, 160, 192, 256]
-        # test_token_nums = [1024 * i for i in [1,2,4,8,16,32]]
+        is_decoder = fused_moe[0].fd_config.model_config.moe_phase.phase == "decode"
+        test_token_nums = [4096 * i for i in [1, 2, 4, 8, 16, 32]]
+        if is_decoder:
+            test_token_nums = [10, 20, 40, 60, 80, 100, 128, 160, 192, 256]
         for idx, num_tokens in enumerate(test_token_nums):
             cache_hidden_states[idx] = paddle.rand((num_tokens, self.model_config.hidden_size), dtype=paddle.bfloat16)

@@ -609,12 +611,14 @@ def fake_model_run():

                 return out

-            moe_cuda_graphs[idx] = graphs.CUDAGraph()
-            moe_cuda_graphs[idx].capture_begin()
+            if is_decoder:
+                moe_cuda_graphs[idx] = graphs.CUDAGraph()
+                moe_cuda_graphs[idx].capture_begin()

             fake_model_run()

-            moe_cuda_graphs[idx].capture_end()
+            if is_decoder:
+                moe_cuda_graphs[idx].capture_end()

             num_tests = 20
             start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)]
             end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)]
             for i in range(num_tests):
                 start_events[i].record()

-                moe_cuda_graphs[idx].replay()
+                if is_decoder:
+                    moe_cuda_graphs[idx].replay()
+                else:
+                    fake_model_run()

                 end_events[i].record()
             paddle.device.cuda.synchronize()
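Patch 03's is_decoder branch applies one measurement pattern in three spots: capture the forward into a CUDA graph for the decode phase, fall back to eager execution for prefill, and time either path with CUDA events. A self-contained sketch of that pattern; the time_kernel helper is hypothetical and not part of the test suite:

    import numpy as np
    import paddle
    from paddle.device.cuda import graphs

    def time_kernel(fn, use_cuda_graph: bool, num_tests: int = 20):
        graph = None
        if use_cuda_graph:
            graph = graphs.CUDAGraph()
            graph.capture_begin()
        fn()  # capture pass (or a plain warm-up when running eagerly)
        if use_cuda_graph:
            graph.capture_end()

        starts = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)]
        ends = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)]
        for i in range(num_tests):
            starts[i].record()
            if use_cuda_graph:
                graph.replay()
            else:
                fn()
            ends[i].record()
        paddle.device.cuda.synchronize()
        # Drop the first sample, as the tests do: it absorbs one-off launch overhead.
        return np.array([s.elapsed_time(e) for s, e in zip(starts, ends)])[1:]

Presumably the split exists because small decode batches are launch-bound and benefit most from graph replay, while the large prefill token counts are compute-bound and can simply be re-run eagerly.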
From 6798566d82ab15f1cd342f142d2a487142fe7089 Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <17801055074@163.com>
Date: Tue, 18 Nov 2025 12:48:47 +0800
Subject: [PATCH 04/11] clean code

---
 tests/layers/test_ffn.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tests/layers/test_ffn.py b/tests/layers/test_ffn.py
index 3606da9367d..28d8671b7f2 100644
--- a/tests/layers/test_ffn.py
+++ b/tests/layers/test_ffn.py
@@ -39,6 +39,7 @@
 from fastdeploy.worker.worker_process import init_distributed_environment

 paddle.set_default_dtype("bfloat16")
+os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")


 class FFNWrapper(paddle.nn.Layer):
@@ -96,7 +97,7 @@ def setUp(self) -> None:
         self.architectures = ["Ernie4_5_MoeForCausalLM"]
         self.hidden_size = 4096
         self.intermediate_size = 2048
-        self.num_layers = 46
+        self.num_layers = 1
         self.hidden_act = "silu"
         self.num_attention_heads = 64
         self.model_config = self.build_model_config()
@@ -132,10 +133,6 @@ def test_ffn(self):

         ffn = FFNWrapper(self.model_config)

-        # (ZKK): disable this test,
-        # CI machine does not support deepgemm blockwise_fp8, compilation error.
-        # return
-
         moe_cuda_graphs = [None] * 100
         cache_hidden_states = [None] * 100

From b77a514700f87cb63e7f543a71973f6ea1cb5290 Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <17801055074@163.com>
Date: Tue, 18 Nov 2025 12:51:05 +0800
Subject: [PATCH 05/11] clean code

---
 tests/layers/test_ffn.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/layers/test_ffn.py b/tests/layers/test_ffn.py
index 28d8671b7f2..3e61d169438 100644
--- a/tests/layers/test_ffn.py
+++ b/tests/layers/test_ffn.py
@@ -135,7 +135,8 @@ def test_ffn(self):

         moe_cuda_graphs = [None] * 100
         cache_hidden_states = [None] * 100
-        for idx, num_tokens in enumerate([4096, 4096 * 2, 4096 * 4]):
+        test_token_nums = [10, 20, 40, 60, 80, 100, 128, 160, 192, 256, 4096, 4096 * 4]
+        for idx, num_tokens in enumerate(test_token_nums):
             cache_hidden_states[idx] = paddle.rand((num_tokens, self.model_config.hidden_size), dtype=paddle.bfloat16)

From 6ba9ee6523e57d07df5ca1c0e79cb0e765872696 Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <17801055074@163.com>
Date: Tue, 18 Nov 2025 12:57:19 +0800
Subject: [PATCH 06/11] clean code

---
 tests/layers/test_ffn.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/layers/test_ffn.py b/tests/layers/test_ffn.py
index 3e61d169438..dc558b33f79 100644
--- a/tests/layers/test_ffn.py
+++ b/tests/layers/test_ffn.py
@@ -39,7 +39,9 @@
 from fastdeploy.worker.worker_process import init_distributed_environment

 paddle.set_default_dtype("bfloat16")
-os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")
+if "h20" in paddle.device.cuda.get_device_name().lower():
+    # (ZKK): CI machine.
+    os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")


 class FFNWrapper(paddle.nn.Layer):
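Patches 04 and 06 introduce, then gate, the DG_NVCC_OVERRIDE_CPP_STANDARD override. The DG_ prefix points at the DeepGEMM JIT compiler that patch 02's comment about "deepgemm blockwise_fp8, compilation error" also references, and os.environ.setdefault pins the C++ standard to 17 only when the variable is not already set, so an explicit user export still wins. A hedged sketch of the gating pattern factored into a helper (pin_deepgemm_cpp_standard is hypothetical; the tests inline these lines at module import, and patch 10 later swaps the matched device string):

    import os
    import paddle

    def pin_deepgemm_cpp_standard() -> None:
        device = paddle.device.cuda.get_device_name().lower()
        # Device strings taken from this series: "h20" (patches 06/07),
        # replaced by "nvidia graphics device" in patch 10.
        if "h20" in device or "nvidia graphics device" in device:
            # setdefault leaves any user-exported value untouched.
            os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")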
From 7876cb0be77f87cb63e7f543a71973f6ea1cb5290 Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <17801055074@163.com>
Date: Tue, 18 Nov 2025 13:57:17 +0800
Subject: [PATCH 07/11] clean code

---
 tests/layers/test_attention_layer.py | 89 ++++++++++++++--------------
 1 file changed, 45 insertions(+), 44 deletions(-)

diff --git a/tests/layers/test_attention_layer.py b/tests/layers/test_attention_layer.py
index 5e803b8a22d..9589e89a2af 100644
--- a/tests/layers/test_attention_layer.py
+++ b/tests/layers/test_attention_layer.py
@@ -50,7 +50,9 @@
 from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_Attention
 from fastdeploy.model_executor.ops.gpu import get_padding_offset

-os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")
+if "h20" in paddle.device.cuda.get_device_name().lower():
+    # (ZKK): CI machine.
+    os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")


 class TestAttentionPerformance(unittest.TestCase):
@@ -117,12 +119,12 @@ def create_model_config_json(self) -> str:
         config_dict = {
             "architectures": ["Ernie4_5_MoeForCausalLM"],
             "dtype": "bfloat16",
-            "hidden_size": 1536,
+            "hidden_size": 4096,
             "max_position_embeddings": 131072,
-            "max_model_len": 2 * (9000 + 128),
-            "num_attention_heads": 12,
+            "max_model_len": 5500,
+            "num_attention_heads": 32,
             "num_key_value_heads": 4,
-            "num_hidden_layers": 39,
+            "num_hidden_layers": 57,
         }
         model_dir = tempfile.mkdtemp(prefix="tmp_model_config_")
         config_path = os.path.join(model_dir, "config.json")
@@ -293,7 +295,6 @@ def test_decode_performance_with_prefill(self):
         # Test parameters
         test_steps = 100
         prefill_batch_size = 1
-        decode_batch_size = 100  # This can be configured as needed
         prefill_seq_len = 4096
         use_dynamic_quant = True
         act_tensor_dtype = paddle.bfloat16
@@ -317,8 +318,7 @@ def test_decode_performance_with_prefill(self):

         paddle.device.synchronize()

-        import paddle.profiler as profiler
-
+        # import paddle.profiler as profiler
         # p = profiler.Profiler(
         #     targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
         #     on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
@@ -341,56 +341,57 @@ def test_decode_performance_with_prefill(self):

         # p.stop()

-        decode_hidden_states = paddle.randn(
-            [decode_batch_size, self.fd_config.model_config.hidden_size], dtype=act_tensor_dtype
-        )
+        # p = profiler.Profiler(
+        #     targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+        #     on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
+        # )

-        forward_meta = self.create_forward_meta(
-            batch_size=decode_batch_size,
-            seq_len=5000,
-            mode=ForwardMode.DECODE,
-            fd_config=self.fd_config,
-            attn_backend=self.attn_backend,
-            use_dynamic_quant=use_dynamic_quant,
-        )
+        # p.start()
+        # p.step()

-        self.attn_backend.init_attention_metadata(forward_meta)
+        for decode_batch_size in [10, 20, 40, 60, 80, 100, 128]:
+            decode_hidden_states = paddle.randn(
+                [decode_batch_size, self.fd_config.model_config.hidden_size], dtype=act_tensor_dtype
+            )

-        p = profiler.Profiler(
-            targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
-            on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
-        )
+            forward_meta = self.create_forward_meta(
+                batch_size=decode_batch_size,
+                seq_len=5000,
+                mode=ForwardMode.DECODE,
+                fd_config=self.fd_config,
+                attn_backend=self.attn_backend,
+                use_dynamic_quant=use_dynamic_quant,
+            )

-        p.start()
-        p.step()
+            self.attn_backend.init_attention_metadata(forward_meta)

-        paddle.device.synchronize()
+            paddle.device.synchronize()

-        # A warm-up run is required first, because preprocessing is deferred to the first layer!
-        self.attn_forward(forward_meta, decode_hidden_states)
+            # A warm-up run is required first, because preprocessing is deferred to the first layer!
+            self.attn_forward(forward_meta, decode_hidden_states)

-        attn_cuda_graphs = graphs.CUDAGraph()
-        attn_cuda_graphs.capture_begin()
+            attn_cuda_graphs = graphs.CUDAGraph()
+            attn_cuda_graphs.capture_begin()

-        self.attn_forward(forward_meta, decode_hidden_states)
+            self.attn_forward(forward_meta, decode_hidden_states)

-        attn_cuda_graphs.capture_end()
+            attn_cuda_graphs.capture_end()

-        start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
-        end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
-        for i in range(test_steps):
-            start_events[i].record()
+            start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
+            end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
+            for i in range(test_steps):
+                start_events[i].record()

-            attn_cuda_graphs.replay()
-            # self.attn_forward(forward_meta, decode_hidden_states)
+                attn_cuda_graphs.replay()
+                # self.attn_forward(forward_meta, decode_hidden_states)

-            end_events[i].record()
-        paddle.device.synchronize()
+                end_events[i].record()
+            paddle.device.synchronize()

-        times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:]
-        print(times[-5:])
+            times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:]
+            print(times[-5:])

-        p.stop()
+        # p.stop()


 if __name__ == "__main__":
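The warm-up comment translated in patch 07 is load-bearing: attention preprocessing is deferred into the first layer's forward pass, so one eager call must complete before capture_begin, otherwise that one-off setup work gets baked into the captured graph. A minimal sketch of the required ordering; attn_forward, forward_meta, and hidden are placeholders for the test's own members, not a fixed API:

    from paddle.device.cuda import graphs

    def capture_attention(attn_forward, forward_meta, hidden):
        # Eager warm-up first: lazy preprocessing and allocations happen
        # here, outside the capture, exactly once.
        attn_forward(forward_meta, hidden)

        graph = graphs.CUDAGraph()
        graph.capture_begin()
        attn_forward(forward_meta, hidden)  # steady-state work only
        graph.capture_end()
        return graph  # caller benchmarks with graph.replay()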
From 9bfd4b1568bcbd78cbb66024e265c6d2aee35e14 Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <17801055074@163.com>
Date: Tue, 18 Nov 2025 14:04:10 +0800
Subject: [PATCH 08/11] clean code

---
 tests/layers/test_fusedmoe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/layers/test_fusedmoe.py b/tests/layers/test_fusedmoe.py
index 4660de4811d..bac24963b3e 100644
--- a/tests/layers/test_fusedmoe.py
+++ b/tests/layers/test_fusedmoe.py
@@ -539,7 +539,7 @@ def setUp(self) -> None:
         self.moe_intermediate_size = 2048
         self.moe_num_experts = 160
         self.moe_k = 8
-        self.num_layers = 1
+        self.num_layers = 2
         self.num_attention_heads = -1
         self.model_config = self.build_model_config()

From 9afbaad4cfda8731af5d2f788504dd181c6206c7 Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <17801055074@163.com>
Date: Tue, 18 Nov 2025 14:21:23 +0800
Subject: [PATCH 09/11] clean code

---
 tests/layers/test_attention_layer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/layers/test_attention_layer.py b/tests/layers/test_attention_layer.py
index 9589e89a2af..bf208f6ba30 100644
--- a/tests/layers/test_attention_layer.py
+++ b/tests/layers/test_attention_layer.py
@@ -124,7 +124,7 @@ def create_model_config_json(self) -> str:
             "max_model_len": 5500,
             "num_attention_heads": 32,
             "num_key_value_heads": 4,
-            "num_hidden_layers": 57,
+            "num_hidden_layers": 5,
         }
         model_dir = tempfile.mkdtemp(prefix="tmp_model_config_")
         config_path = os.path.join(model_dir, "config.json")
From 9499f4a50ca530840fbc2f253009696b52542bbe Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <17801055074@163.com>
Date: Tue, 18 Nov 2025 16:03:27 +0800
Subject: [PATCH 10/11] commit

---
 tests/layers/test_attention_layer.py | 2 +-
 tests/layers/test_ffn.py             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/layers/test_attention_layer.py b/tests/layers/test_attention_layer.py
index bf208f6ba30..32d579f74b6 100644
--- a/tests/layers/test_attention_layer.py
+++ b/tests/layers/test_attention_layer.py
@@ -50,7 +50,7 @@
 from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_Attention
 from fastdeploy.model_executor.ops.gpu import get_padding_offset

-if "h20" in paddle.device.cuda.get_device_name().lower():
+if "nvidia graphics device" in paddle.device.cuda.get_device_name().lower():
     # (ZKK): CI machine.
     os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")

diff --git a/tests/layers/test_ffn.py b/tests/layers/test_ffn.py
index dc558b33f79..eb551fd9eb1 100644
--- a/tests/layers/test_ffn.py
+++ b/tests/layers/test_ffn.py
@@ -39,7 +39,7 @@
 from fastdeploy.worker.worker_process import init_distributed_environment

 paddle.set_default_dtype("bfloat16")
-if "h20" in paddle.device.cuda.get_device_name().lower():
+if "nvidia graphics device" in paddle.device.cuda.get_device_name().lower():
     # (ZKK): CI machine.
     os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")

From fc5ad43e70730c1ea854eecf5f7dc9aa1b8474ce Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <17801055074@163.com>
Date: Tue, 18 Nov 2025 16:06:50 +0800
Subject: [PATCH 11/11] commit

---
 tests/layers/test_fusedmoe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/layers/test_fusedmoe.py b/tests/layers/test_fusedmoe.py
index bac24963b3e..8a603427876 100644
--- a/tests/layers/test_fusedmoe.py
+++ b/tests/layers/test_fusedmoe.py
@@ -598,7 +598,7 @@ def test_fused_moe(self):
         moe_cuda_graphs = [None] * 100
         cache_hidden_states = [None] * 100
         is_decoder = fused_moe[0].fd_config.model_config.moe_phase.phase == "decode"
-        test_token_nums = [4096 * i for i in [1, 2, 4, 8, 16, 32]]
+        test_token_nums = [4096 * i for i in [1, 2, 4, 8]]
         if is_decoder:
             test_token_nums = [10, 20, 40, 60, 80, 100, 128, 160, 192, 256]
         for idx, num_tokens in enumerate(test_token_nums):
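All three tests report latency the same way: times drops the first (warm-up-tainted) sample via [1:] and print(times[-5:]) shows the last five raw values in milliseconds. When eyeballing runs across the series, a small post-processing snippet over the same arrays gives steadier numbers; the values below are made up for illustration, not measured results:

    import numpy as np

    times = np.array([2.31, 2.29, 2.30, 2.28, 2.33])  # example ms samples only
    print(f"mean {times.mean():.2f} ms, "
          f"median {np.median(times):.2f} ms, "
          f"p95 {np.percentile(times, 95):.2f} ms")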