85 changes: 43 additions & 42 deletions tests/layers/test_attention_layer.py
@@ -50,7 +50,9 @@
 from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_Attention
 from fastdeploy.model_executor.ops.gpu import get_padding_offset

-os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")
+if "nvidia graphics device" in paddle.device.cuda.get_device_name().lower():
+    # (ZKK): CI machine.
+    os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")


 class TestAttentionPerformance(unittest.TestCase):
@@ -119,10 +121,10 @@ def create_model_config_json(self) -> str:
             "dtype": "bfloat16",
             "hidden_size": 4096,
             "max_position_embeddings": 131072,
-            "max_model_len": 2 * (9000 + 128),
+            "max_model_len": 5500,
             "num_attention_heads": 32,
             "num_key_value_heads": 4,
-            "num_hidden_layers": 39,
+            "num_hidden_layers": 5,
         }
         model_dir = tempfile.mkdtemp(prefix="tmp_model_config_")
         config_path = os.path.join(model_dir, "config.json")
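Review note: for reference, the write path behind create_model_config_json reduces to the following standalone sketch, using only the standard library and the values this PR sets (max_model_len 5500, num_hidden_layers 5):

```python
# Sketch only: materialize the trimmed test config as a temp config.json.
import json
import os
import tempfile

config_dict = {
    "dtype": "bfloat16",
    "hidden_size": 4096,
    "max_position_embeddings": 131072,
    "max_model_len": 5500,  # was 2 * (9000 + 128)
    "num_attention_heads": 32,
    "num_key_value_heads": 4,
    "num_hidden_layers": 5,  # was 39
}
model_dir = tempfile.mkdtemp(prefix="tmp_model_config_")
config_path = os.path.join(model_dir, "config.json")
with open(config_path, "w") as f:
    json.dump(config_dict, f)
```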
@@ -293,7 +295,6 @@ def test_decode_performance_with_prefill(self):
         # Test parameters
         test_steps = 100
         prefill_batch_size = 1
-        decode_batch_size = 100  # This can be configured as needed
         prefill_seq_len = 4096
         use_dynamic_quant = True
         act_tensor_dtype = paddle.bfloat16
@@ -317,8 +318,7 @@ def test_decode_performance_with_prefill(self):

         paddle.device.synchronize()

-        import paddle.profiler as profiler
-
+        # import paddle.profiler as profiler
         # p = profiler.Profiler(
         #     targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
         #     on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
@@ -341,56 +341,57 @@

         # p.stop()

-        decode_hidden_states = paddle.randn(
-            [decode_batch_size, self.fd_config.model_config.hidden_size], dtype=act_tensor_dtype
-        )
+        # p = profiler.Profiler(
+        #     targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+        #     on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
+        # )

-        forward_meta = self.create_forward_meta(
-            batch_size=decode_batch_size,
-            seq_len=5000,
-            mode=ForwardMode.DECODE,
-            fd_config=self.fd_config,
-            attn_backend=self.attn_backend,
-            use_dynamic_quant=use_dynamic_quant,
-        )
+        # p.start()
+        # p.step()

-        self.attn_backend.init_attention_metadata(forward_meta)
+        for decode_batch_size in [10, 20, 40, 60, 80, 100, 128]:
+            decode_hidden_states = paddle.randn(
+                [decode_batch_size, self.fd_config.model_config.hidden_size], dtype=act_tensor_dtype
+            )

-        p = profiler.Profiler(
-            targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
-            on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
-        )
+            forward_meta = self.create_forward_meta(
+                batch_size=decode_batch_size,
+                seq_len=5000,
+                mode=ForwardMode.DECODE,
+                fd_config=self.fd_config,
+                attn_backend=self.attn_backend,
+                use_dynamic_quant=use_dynamic_quant,
+            )

-        p.start()
-        p.step()
+            self.attn_backend.init_attention_metadata(forward_meta)

-        paddle.device.synchronize()
+            paddle.device.synchronize()

-        # Must warm up once first! Preprocessing has been moved into the first layer!
-        self.attn_forward(forward_meta, decode_hidden_states)
+            # Must warm up once first! Preprocessing has been moved into the first layer!
+            self.attn_forward(forward_meta, decode_hidden_states)

-        attn_cuda_graphs = graphs.CUDAGraph()
-        attn_cuda_graphs.capture_begin()
+            attn_cuda_graphs = graphs.CUDAGraph()
+            attn_cuda_graphs.capture_begin()

-        self.attn_forward(forward_meta, decode_hidden_states)
+            self.attn_forward(forward_meta, decode_hidden_states)

-        attn_cuda_graphs.capture_end()
+            attn_cuda_graphs.capture_end()

-        start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
-        end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
-        for i in range(test_steps):
-            start_events[i].record()
+            start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
+            end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
+            for i in range(test_steps):
+                start_events[i].record()

-            attn_cuda_graphs.replay()
-            # self.attn_forward(forward_meta, decode_hidden_states)
+                attn_cuda_graphs.replay()
+                # self.attn_forward(forward_meta, decode_hidden_states)

-            end_events[i].record()
-        paddle.device.synchronize()
+                end_events[i].record()
+            paddle.device.synchronize()

-        times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:]
-        print(times[-5:])
+            times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:]
+            print(times[-5:])

-        p.stop()
+        # p.stop()


 if __name__ == "__main__":
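Review note: the long hunk above turns a single-batch-size decode benchmark into a sweep. For each decode batch size it warms up once (preprocessing is deferred to the first layer), captures one CUDA graph, and times replays with CUDA events. A minimal sketch of that inner pattern, assuming a hypothetical `run_step` callable standing in for `self.attn_forward(forward_meta, decode_hidden_states)`:

```python
# Sketch only: per-batch-size CUDA graph timing, as in the loop above.
import numpy as np
import paddle
from paddle.device.cuda import graphs


def time_graph_replays(run_step, test_steps=100):
    run_step()  # warm up once: preprocessing happens inside the first layer

    g = graphs.CUDAGraph()
    g.capture_begin()
    run_step()  # the captured iteration
    g.capture_end()

    starts = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
    ends = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
    for i in range(test_steps):
        starts[i].record()
        g.replay()
        ends[i].record()
    paddle.device.synchronize()

    # Drop the first sample, which absorbs one-off launch overhead.
    return np.array([s.elapsed_time(e) for s, e in zip(starts, ends)])[1:]
```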
25 changes: 11 additions & 14 deletions tests/layers/test_ffn.py
@@ -39,14 +39,17 @@
 from fastdeploy.worker.worker_process import init_distributed_environment

 paddle.set_default_dtype("bfloat16")
+if "nvidia graphics device" in paddle.device.cuda.get_device_name().lower():
+    # (ZKK): CI machine.
+    os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")


 class FFNWrapper(paddle.nn.Layer):
     def __init__(self, model_config: ModelConfig):
         super().__init__()
         self.model_config = model_config

-        self.intermediate_size = 3584
+        self.intermediate_size = self.model_config.intermediate_size
         self.hidden_size = self.model_config.hidden_size
         self.prefix = "hahahha"
         self.fd_config = FDConfig(
@@ -94,10 +97,9 @@ def __init__(self, model_config: ModelConfig):
 class TestFusedMoE(unittest.TestCase):
     def setUp(self) -> None:
         self.architectures = ["Ernie4_5_MoeForCausalLM"]
-        self.hidden_size = 7168
-        self.moe_intermediate_size = 1
-        self.moe_num_experts = 1
-        self.moe_k = 1
+        self.hidden_size = 4096
+        self.intermediate_size = 2048
+        self.num_layers = 1
         self.hidden_act = "silu"
         self.num_attention_heads = 64
         self.model_config = self.build_model_config()
@@ -115,9 +117,7 @@ def build_config_json(self) -> str:
         config_dict = {
             "architectures": self.architectures,
             "hidden_size": self.hidden_size,
-            "moe_intermediate_size": self.moe_intermediate_size,
-            "moe_num_experts": self.moe_num_experts,
-            "moe_k": self.moe_k,
+            "intermediate_size": self.intermediate_size,
             "hidden_act": self.hidden_act,
             "num_attention_heads": self.num_attention_heads,
             "dtype": "bfloat16",
@@ -135,20 +135,17 @@ def test_ffn(self):

         ffn = FFNWrapper(self.model_config)

-        # (ZKK): disable this test,
-        # CI machine does not support deepgemm blockwise_fp8, compilation error.
-        return
-
         moe_cuda_graphs = [None] * 100
         cache_hidden_states = [None] * 100
-        for idx, num_tokens in enumerate([10, 20, 40, 60, 80, 100, 128, 160, 192, 256, 512, 1024, 2048, 4096]):
+        test_token_nums = [10, 20, 40, 60, 80, 100, 128, 160, 192, 256, 4096, 4096 * 4]
+        for idx, num_tokens in enumerate(test_token_nums):

             cache_hidden_states[idx] = paddle.rand((num_tokens, self.model_config.hidden_size), dtype=paddle.bfloat16)

             moe_cuda_graphs[idx] = graphs.CUDAGraph()
             moe_cuda_graphs[idx].capture_begin()

-            num_layers = 80
+            num_layers = self.num_layers
             for _ in range(num_layers):
                 out = ffn.ffn(cache_hidden_states[idx])

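Review note: test_ffn keeps one graph and one cached input tensor per token count because a captured CUDA graph replays fixed shapes and fixed device addresses; each distinct `num_tokens` therefore needs its own capture and its own persistent input buffer. A minimal sketch of that caching pattern, assuming the `FFNWrapper` instance from this test:

```python
# Sketch only: one CUDA graph (and one persistent input) per token count.
import paddle
from paddle.device.cuda import graphs

graph_cache = {}  # num_tokens -> captured CUDAGraph
input_cache = {}  # num_tokens -> input tensor reused on every replay


def run_ffn(ffn, num_tokens, hidden_size):
    if num_tokens not in graph_cache:
        # Replay reuses this exact memory, so the buffer must outlive the graph.
        input_cache[num_tokens] = paddle.rand((num_tokens, hidden_size), dtype=paddle.bfloat16)
        g = graphs.CUDAGraph()
        g.capture_begin()
        ffn.ffn(input_cache[num_tokens])
        g.capture_end()
        graph_cache[num_tokens] = g
    graph_cache[num_tokens].replay()
```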
34 changes: 20 additions & 14 deletions tests/layers/test_fusedmoe.py
@@ -535,12 +535,12 @@ def __init__(
 class TestFusedMoE(unittest.TestCase):
     def setUp(self) -> None:
         self.architectures = ["Ernie4_5_MoeForCausalLM"]
-        self.hidden_size = 7168
-        self.moe_intermediate_size = 3584
-        self.moe_num_experts = 64
+        self.hidden_size = 4096
+        self.moe_intermediate_size = 2048
+        self.moe_num_experts = 160
         self.moe_k = 8
-        self.hidden_act = "silu"
-        self.num_attention_heads = 64
+        self.num_layers = 2
+        self.num_attention_heads = -1
         self.model_config = self.build_model_config()

     def build_model_config(self) -> ModelConfig:
@@ -559,7 +559,6 @@ def build_config_json(self) -> str:
             "moe_intermediate_size": self.moe_intermediate_size,
             "moe_num_experts": self.moe_num_experts,
             "moe_k": self.moe_k,
-            "hidden_act": self.hidden_act,
             "num_attention_heads": self.num_attention_heads,
             "dtype": "bfloat16",
         }
@@ -590,16 +589,18 @@ def test_fused_moe(self):
         # This line must be kept, otherwise it breaks the uniformity of expert routing!
         paddle.seed(ep_rank + 100)

-        num_layers = 80
-        real_weight_layers = 20
+        num_layers = self.num_layers
+        real_weight_layers = num_layers // 2
         fused_moe = [None] * real_weight_layers
         for i in range(real_weight_layers):
             fused_moe[i] = FuseMoEWrapper(self.model_config, tp_size, tp_rank, ep_size, ep_rank, nnodes=nnodes)

         moe_cuda_graphs = [None] * 100
         cache_hidden_states = [None] * 100
-        test_token_nums = [10, 20, 40, 60, 80, 100, 128, 160, 192, 256]
-        # test_token_nums = [1024 * i for i in [1,2,4,8,16,32]]
+        is_decoder = fused_moe[0].fd_config.model_config.moe_phase.phase == "decode"
+        test_token_nums = [4096 * i for i in [1, 2, 4, 8]]
+        if is_decoder:
+            test_token_nums = [10, 20, 40, 60, 80, 100, 128, 160, 192, 256]
         for idx, num_tokens in enumerate(test_token_nums):

             cache_hidden_states[idx] = paddle.rand((num_tokens, self.model_config.hidden_size), dtype=paddle.bfloat16)
@@ -610,20 +611,25 @@ def fake_model_run():

                 return out

-            moe_cuda_graphs[idx] = graphs.CUDAGraph()
-            moe_cuda_graphs[idx].capture_begin()
+            if is_decoder:
+                moe_cuda_graphs[idx] = graphs.CUDAGraph()
+                moe_cuda_graphs[idx].capture_begin()

             fake_model_run()

-            moe_cuda_graphs[idx].capture_end()
+            if is_decoder:
+                moe_cuda_graphs[idx].capture_end()

             num_tests = 20
             start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)]
             end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)]
             for i in range(num_tests):
                 start_events[i].record()

-                moe_cuda_graphs[idx].replay()
+                if is_decoder:
+                    moe_cuda_graphs[idx].replay()
+                else:
+                    fake_model_run()

                 end_events[i].record()
             paddle.device.cuda.synchronize()
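Review note: the net effect of the test_fusedmoe changes is a phase-aware benchmark: in decode phase it sweeps small token counts and times CUDA-graph replays, while in prefill phase it sweeps large token counts (4096 to 32768) and times eager calls. A condensed sketch of the branching, with a hypothetical `run_moe` standing in for `fake_model_run`:

```python
# Sketch only: graph replay for decode, eager execution for prefill.
import paddle
from paddle.device.cuda import graphs


def time_moe(run_moe, is_decoder, num_tests=20):
    g = None
    if is_decoder:
        # Decode shapes are small and fixed, so capture once and replay.
        g = graphs.CUDAGraph()
        g.capture_begin()
        run_moe()
        g.capture_end()

    starts = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)]
    ends = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)]
    for i in range(num_tests):
        starts[i].record()
        if is_decoder:
            g.replay()
        else:
            run_moe()  # prefill runs eagerly, no graph capture
        ends[i].record()
    paddle.device.cuda.synchronize()
    return [s.elapsed_time(e) for s, e in zip(starts, ends)]
```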