diff --git a/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu b/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu
index e84f8281648..2b5c1fbc7d0 100644
--- a/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu
+++ b/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu
@@ -311,16 +311,17 @@ void GetBlockShapeAndSplitKVBlock(
     if (mla_backend && group_size <= 64) {
       const int set_chunk_size = get_mla_dec_chunk_size(bsz);
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
+      CUDA_CHECK(cudaMemsetAsync(
           decoder_chunk_size_device.data(), 64, sizeof(int32_t), stream));
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
+      CUDA_CHECK(cudaMemsetAsync(
           decoder_num_blocks_device.data(), 0, sizeof(int32_t), stream));
       int device;
-      cudaGetDevice(&device);
+      CUDA_CHECK(cudaGetDevice(&device));
       int sm_cout;
-      cudaDeviceGetAttribute(&sm_cout, cudaDevAttrMultiProcessorCount, device);
+      CUDA_CHECK(cudaDeviceGetAttribute(
+          &sm_cout, cudaDevAttrMultiProcessorCount, device));
       constexpr int config_size = 12;
       // search space for chunk size:[64, 128, 256, ... 131072]
@@ -341,16 +342,14 @@ void GetBlockShapeAndSplitKVBlock(
           decoder_chunk_size_device.copy_to(paddle::CPUPlace(), false);
       const int chunk_size = decoder_chunk_size_cpu.data()[0];
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          cudaMemsetAsync(decoder_batch_ids.data(),
-                          0,
-                          decoder_batch_ele_num * sizeof(int32_t),
-                          stream));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          cudaMemsetAsync(decoder_tile_ids_per_batch.data(),
-                          0,
-                          decoder_batch_ele_num * sizeof(int32_t),
-                          stream));
+      CUDA_CHECK(cudaMemsetAsync(decoder_batch_ids.data(),
+                                 0,
+                                 decoder_batch_ele_num * sizeof(int32_t),
+                                 stream));
+      CUDA_CHECK(cudaMemsetAsync(decoder_tile_ids_per_batch.data(),
+                                 0,
+                                 decoder_batch_ele_num * sizeof(int32_t),
+                                 stream));
       split_block_for_mla<<<1, 32, 0, stream>>>(
           seq_lens_this_time.data(),
@@ -362,17 +361,15 @@ void GetBlockShapeAndSplitKVBlock(
           chunk_size);
     } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          cudaMemsetAsync(decoder_batch_ids.data(),
-                          0,
-                          decoder_batch_ele_num * sizeof(int32_t),
-                          stream));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          cudaMemsetAsync(decoder_tile_ids_per_batch.data(),
-                          0,
-                          decoder_batch_ele_num * sizeof(int32_t),
-                          stream));
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
+      CUDA_CHECK(cudaMemsetAsync(decoder_batch_ids.data(),
+                                 0,
+                                 decoder_batch_ele_num * sizeof(int32_t),
+                                 stream));
+      CUDA_CHECK(cudaMemsetAsync(decoder_tile_ids_per_batch.data(),
+                                 0,
+                                 decoder_batch_ele_num * sizeof(int32_t),
+                                 stream));
+      CUDA_CHECK(cudaMemsetAsync(
           decoder_num_blocks_device.data(), 0, sizeof(int32_t), stream));
       split_q_block<<<1, 32, 0, stream>>>(
@@ -391,8 +388,6 @@ void GetBlockShapeAndSplitKVBlock(
 #endif
       decoder_num_blocks_cpu.copy_(
           decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false);
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
-          decoder_chunk_size_device.data(), 64, sizeof(int32_t), stream));
     }
   }
@@ -401,19 +396,17 @@ void GetBlockShapeAndSplitKVBlock(
     const uint32_t max_tile_size_per_bs_kv =
         div_up(max_enc_dec_len_this_time, block_size);
     const uint32_t kv_batch_shape = bsz * max_tile_size_per_bs_kv;
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
+    CUDA_CHECK(cudaMemsetAsync(
         kv_batch_ids.data(), 0, kv_batch_shape * sizeof(int32_t), stream));
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaMemsetAsync(kv_tile_ids_per_batch.data(),
-                        0,
-                        kv_batch_shape * sizeof(int32_t),
-                        stream));
+    CUDA_CHECK(cudaMemsetAsync(kv_tile_ids_per_batch.data(),
+                               0,
+                               kv_batch_shape * sizeof(int32_t),
+                               stream));
     auto kv_num_blocks_x =
         GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
     split_kv_block<<<1, 32, 0, seq_lens_encoder.stream()>>>(
         seq_lens_decoder.data(),
-        // sequence_lengths->data(),
         seq_lens_encoder.data(),
         kv_batch_ids.data(),
         kv_tile_ids_per_batch.data(),
@@ -428,16 +421,14 @@ void GetBlockShapeAndSplitKVBlock(
     const uint32_t encoder_max_tile_size_per_bs_q =
        div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
     const uint32_t encoder_batch_shape = bsz * encoder_max_tile_size_per_bs_q;
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaMemsetAsync(encoder_batch_ids.data(),
-                        0,
-                        encoder_batch_shape * sizeof(int32_t),
-                        stream));
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaMemsetAsync(encoder_tile_ids_per_batch.data(),
-                        0,
-                        encoder_batch_shape * sizeof(int32_t),
-                        stream));
+    CUDA_CHECK(cudaMemsetAsync(encoder_batch_ids.data(),
+                               0,
+                               encoder_batch_shape * sizeof(int32_t),
+                               stream));
+    CUDA_CHECK(cudaMemsetAsync(encoder_tile_ids_per_batch.data(),
+                               0,
+                               encoder_batch_shape * sizeof(int32_t),
+                               stream));
     auto encoder_num_blocks_x =
         GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
     split_q_block<<<1, 32, 0, stream>>>(seq_lens_encoder.data(),
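
A quick arithmetic note on the `constexpr int config_size = 12` kept by the first hunk above, and on how its chunk-size search space of [64, 128, 256, ... 131072] lines up with the `*1024` head-room mentioned in the Python sizing change below. The sketch only sanity-checks those two comments; the link between the 1024 factor and a 128-token chunk is an inference, not something this patch states:

```python
# 12 doublings starting at 64 end exactly at 128K tokens.
chunk_sizes = [64 * (1 << i) for i in range(12)]
assert len(chunk_sizes) == 12
assert chunk_sizes[-1] == 131072 == 128 * 1024  # 128K

# One reading of the "*1024 can cover 128K" note below: a 128K context split
# into 128-token chunks yields at most 1024 chunks per sequence.
assert 131072 // 128 == 1024
```
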
diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
index ac6231a5804..cae006e968d 100644
--- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
@@ -72,18 +72,21 @@ def allocate_launch_related_buffer(
     block_size,
 ):
     # Initialize AttentionBackend buffers
-    group_size = np.ceil(num_heads / kv_num_heads)
+    assert num_heads % kv_num_heads == 0
+    assert max_model_len % block_size == 0
+    assert max_model_len % encoder_block_shape_q == 0
+    group_size = num_heads // kv_num_heads
     # NOTE: (changwenbin) When using auto_chunk,
     # decode_max_tile_size must take into account the maximum case, where *1024 can cover 128K.
     decode_max_tile_size = (
-        1024 * max_batch_size * np.ceil((decoder_step_token_num * group_size) / decoder_block_shape_q)
+        1024 * max_batch_size * (int)(np.ceil(decoder_step_token_num * group_size / decoder_block_shape_q))
     )
-    encode_max_tile_size = max_batch_size * np.ceil((max_model_len * group_size) / encoder_block_shape_q)
-    kv_max_tile_size = max_batch_size * np.ceil(max_model_len / block_size)
+    encode_max_tile_size = max_batch_size * (max_model_len * group_size // encoder_block_shape_q)
+    kv_max_tile_size = max_batch_size * (max_model_len // block_size)
     res = {}
-    res["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
-    res["decoder_tile_ids_per_batch"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
+    res["decoder_batch_ids"] = paddle.full([decode_max_tile_size], 0, dtype="int32")
+    res["decoder_tile_ids_per_batch"] = paddle.full([decode_max_tile_size], 0, dtype="int32")
     res["decoder_num_blocks_cpu"] = paddle.full([1], 0, dtype="int32").pin_memory()
     # NOTE: (changwenbin) MLA kernel only needs decoder_num_blocks_device in place of GPU tensor,
     # adapted to cudagraph.
@@ -91,12 +94,12 @@ def allocate_launch_related_buffer(
     res["decoder_chunk_size_device"] = paddle.full([1], 64, dtype="int32")
     res["max_len_tensor_cpu"] = paddle.full([9], 0, dtype="int32").cpu()
 
-    res["encoder_batch_ids"] = paddle.full([int(encode_max_tile_size)], 0, dtype="int32")
-    res["encoder_tile_ids_per_batch"] = paddle.full([int(encode_max_tile_size)], 0, dtype="int32")
+    res["encoder_batch_ids"] = paddle.full([encode_max_tile_size], 0, dtype="int32")
+    res["encoder_tile_ids_per_batch"] = paddle.full([encode_max_tile_size], 0, dtype="int32")
     res["encoder_num_blocks_x_cpu"] = paddle.full([1], 0, dtype="int32").cpu()
 
-    res["kv_batch_ids"] = paddle.full([int(kv_max_tile_size)], 0, dtype="int32")
-    res["kv_tile_ids_per_batch"] = paddle.full([int(kv_max_tile_size)], 0, dtype="int32")
+    res["kv_batch_ids"] = paddle.full([kv_max_tile_size], 0, dtype="int32")
+    res["kv_tile_ids_per_batch"] = paddle.full([kv_max_tile_size], 0, dtype="int32")
     res["kv_num_blocks_x_cpu"] = paddle.full([1], 0, dtype="int32").cpu()
 
     return res
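
To make the sizing change above concrete, here is a small worked example of the new integer-only formulas. The head counts, batch size, block shapes, and lengths below are illustrative values chosen for the example, not FastDeploy defaults:

```python
import numpy as np

# Illustrative inputs (assumptions for this example only).
num_heads, kv_num_heads = 32, 4
max_batch_size = 64
max_model_len = 8192
encoder_block_shape_q = 64
decoder_block_shape_q = 16
decoder_step_token_num = 1
block_size = 64

# Preconditions added by the patch.
assert num_heads % kv_num_heads == 0
assert max_model_len % block_size == 0
assert max_model_len % encoder_block_shape_q == 0

group_size = num_heads // kv_num_heads  # 8, now an exact int
decode_max_tile_size = 1024 * max_batch_size * int(
    np.ceil(decoder_step_token_num * group_size / decoder_block_shape_q)
)
encode_max_tile_size = max_batch_size * (max_model_len * group_size // encoder_block_shape_q)
kv_max_tile_size = max_batch_size * (max_model_len // block_size)

# All three are plain Python ints, which is why the paddle.full([...]) calls
# above no longer need the int(...) casts that the old np.ceil-based sizes required.
print(decode_max_tile_size, encode_max_tile_size, kv_max_tile_size)  # 65536 65536 8192
```
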
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index fc6803452a6..4cd82f49165 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -172,7 +172,7 @@ async def test_initialize_engine_client(self, mock_engine_client):
         mock_args = Mock()
         mock_args.model = "test-model"
         mock_args.tokenizer = "test-tokenizer"
-        mock_args.max_model_len = 1000
+        mock_args.max_model_len = 1024
         mock_args.tensor_parallel_size = 1
         mock_args.engine_worker_queue_port = [8000]
         mock_args.local_data_parallel_id = 0
@@ -202,7 +202,7 @@ async def test_initialize_engine_client(self, mock_engine_client):
     def test_create_serving_handlers(self, mock_chat_handler, mock_model_handler):
         """测试创建服务处理器"""
         mock_args = Mock()
-        mock_args.max_model_len = 1000
+        mock_args.max_model_len = 1024
         mock_args.ips = "127.0.0.1"
         mock_args.max_waiting_time = 60
         mock_args.enable_mm_output = False
@@ -1286,7 +1286,7 @@ def run_fastdeploy_command(self, input_content, port=None):
             "--quantization",
             "wint4",
             "--max-model-len",
-            "4192",
+            "5120",
            "--max-num-seqs",
             "64",
             "--load-choices",
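
The updated test constants are consistent with the new divisibility asserts in `allocate_launch_related_buffer`. Assuming the commonly used `block_size` and `encoder_block_shape_q` of 64 (an assumption for this check, not something the patch pins down), the old values no longer satisfy the preconditions while the replacements do:

```python
# Assumed launch parameters for the check (not set by this patch).
block_size = encoder_block_shape_q = 64

for max_model_len in (1000, 1024, 4192, 5120):
    ok = max_model_len % block_size == 0 and max_model_len % encoder_block_shape_q == 0
    print(max_model_len, "ok" if ok else "would trip the new asserts")
# 1000 and 4192 would trip the asserts; the replacement values 1024 and 5120 pass.
```
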