From ec1f1eaa496d5bc216b37b3ac6930589e9cbd751 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Tue, 21 Oct 2025 19:46:31 +0800 Subject: [PATCH 1/2] fix custom_ops --- custom_ops/cpu_ops/stop_generation_multi_ends.cc | 6 ++++++ custom_ops/gpu_ops/flash_mask_attn/flash_mask_attn.cu | 6 +++++- custom_ops/gpu_ops/get_img_boundaries.cc | 6 +++++- custom_ops/gpu_ops/get_mm_split_fuse.cc | 6 +++++- custom_ops/gpu_ops/limit_thinking_content_length_v1.cu | 2 +- custom_ops/gpu_ops/limit_thinking_content_length_v2.cu | 2 +- custom_ops/gpu_ops/moba_attn/moba_attn.cu | 5 ++++- .../moba_attn/moba_decoder_attn/moba_qk_sort_decoder.cu | 5 ++++- .../moba_attn/moba_encoder_attn/moba_encoder_attn.cu | 6 +++++- .../moba_encoder_attn/moba_encoder_write_cache.cu | 5 ++++- .../moba_attn/moba_encoder_attn/moba_qk_sort_encoder.cu | 6 +++++- .../moba_attn/moba_process/moba_get_kv_from_cache.cu | 8 ++++++-- .../gpu_ops/moba_attn/moba_process/moba_mlp_einsum.cu | 5 ++++- custom_ops/gpu_ops/moba_attn/moba_process/moba_qk_gemm.cu | 6 +++++- .../gpu_ops/moba_attn/moba_process/split_qkv_and_rope.cu | 6 +++++- custom_ops/gpu_ops/stop_generation_multi_ends.cu | 2 ++ fastdeploy/engine/engine.py | 1 - 17 files changed, 67 insertions(+), 16 deletions(-) diff --git a/custom_ops/cpu_ops/stop_generation_multi_ends.cc b/custom_ops/cpu_ops/stop_generation_multi_ends.cc index 7669cfa51d0..37f1f40c250 100644 --- a/custom_ops/cpu_ops/stop_generation_multi_ends.cc +++ b/custom_ops/cpu_ops/stop_generation_multi_ends.cc @@ -18,6 +18,10 @@ #include #include "paddle/extension.h" +#ifndef PD_BUILD_STATIC_OP +#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) +#endif + bool is_in_end(const int64_t id, const int64_t *end_ids, int length) { bool flag = false; for (int i = 0; i < length; i++) { @@ -49,6 +53,8 @@ void set_value_by_flags(bool *stop_flags, } if (!beam_search && is_in_end(topk_ids[bi], end_ids, end_length)) { stop_flags[bi] = true; + topk_ids[bi] = end_ids[0]; + next_tokens[bi] = end_ids[0]; } } } diff --git a/custom_ops/gpu_ops/flash_mask_attn/flash_mask_attn.cu b/custom_ops/gpu_ops/flash_mask_attn/flash_mask_attn.cu index fc98742f819..3e40794f47d 100644 --- a/custom_ops/gpu_ops/flash_mask_attn/flash_mask_attn.cu +++ b/custom_ops/gpu_ops/flash_mask_attn/flash_mask_attn.cu @@ -20,6 +20,10 @@ #include "kernel_traits.h" #include "flash_mask_attn_kernel.hpp" +#ifndef PD_BUILD_STATIC_OP +#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) +#endif + template struct cuteType; @@ -142,7 +146,7 @@ std::vector FlashAttentionMask( } -PD_BUILD_OP(flash_attention_mask) +PD_BUILD_STATIC_OP(flash_attention_mask) .Inputs({ "q_input", "k_input", diff --git a/custom_ops/gpu_ops/get_img_boundaries.cc b/custom_ops/gpu_ops/get_img_boundaries.cc index 30ca6d2697b..2a5f5b5ff3f 100644 --- a/custom_ops/gpu_ops/get_img_boundaries.cc +++ b/custom_ops/gpu_ops/get_img_boundaries.cc @@ -14,6 +14,10 @@ #include "paddle/extension.h" +#ifndef PD_BUILD_STATIC_OP +#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) +#endif + std::vector GetImgBoundaries(const paddle::Tensor& task_input_ids, const paddle::Tensor& grid_thw, const int64_t image_patch_id) { @@ -53,7 +57,7 @@ std::vector GetImgBoundaries(const paddle::Tensor& task_input_id return {out}; } -PD_BUILD_OP(get_img_boundaries) +PD_BUILD_STATIC_OP(get_img_boundaries) .Inputs({"task_input_ids", "grid_thw"}) .Attrs({"image_patch_id: int64_t"}) .Outputs({"img_boundaries"}) diff --git a/custom_ops/gpu_ops/get_mm_split_fuse.cc b/custom_ops/gpu_ops/get_mm_split_fuse.cc index 3d70258d005..81531ba95e5 100644 --- a/custom_ops/gpu_ops/get_mm_split_fuse.cc +++ b/custom_ops/gpu_ops/get_mm_split_fuse.cc @@ -15,6 +15,10 @@ #include "paddle/extension.h" #include +#ifndef PD_BUILD_STATIC_OP +#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) +#endif + std::vector GetMmSplitFuse(const paddle::Tensor& task_input_ids, const paddle::Tensor& task_image_type_ids, const paddle::Tensor& task_input_ids_image_token_count, @@ -133,7 +137,7 @@ std::vector GetMmSplitFuse(const paddle::Tensor& task_input_ids, return {image_chunk_selections_out, split_fuse_cur_seq_lens_out}; } -PD_BUILD_OP(get_mm_split_fuse) +PD_BUILD_STATIC_OP(get_mm_split_fuse) .Inputs({"task_input_ids", "task_image_type_ids", "task_input_ids_image_token_count", "grid_thw"}) .Attrs({"image_token_id: int64_t", "img_total: int64_t", "batch_idx: int", "seq_lens_origin: int", "split_fuse_img_size: int", "split_fuse_text_size: int", "max_chunk_token_size: int"}) .Outputs({"image_chunk_selections", "split_fuse_cur_seq_lens"}) diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu index d4c494b53a4..3ca280662fd 100644 --- a/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu +++ b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu @@ -80,7 +80,7 @@ void LimitThinkingContentLengthV1(const paddle::Tensor &next_tokens, batch_size); } -PD_BUILD_OP(limit_thinking_content_length_v1) +PD_BUILD_STATIC_OP(limit_thinking_content_length_v1) .Inputs({"next_tokens", "max_think_lens", "step_idx", "limit_think_status"}) .Attrs({"think_end_id: int64_t"}) .Outputs({"next_tokens_out"}) diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu index a61dec8960d..f1da488faa3 100644 --- a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu +++ b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu @@ -103,7 +103,7 @@ void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens, batch_size); } -PD_BUILD_OP(limit_thinking_content_length_v2) +PD_BUILD_STATIC_OP(limit_thinking_content_length_v2) .Inputs({"next_tokens", "max_think_lens", "step_idx", "limit_think_status"}) .Attrs({"think_end_id: int64_t", "line_break_id: int64_t"}) .Outputs({"next_tokens_out"}) diff --git a/custom_ops/gpu_ops/moba_attn/moba_attn.cu b/custom_ops/gpu_ops/moba_attn/moba_attn.cu index 2def647eef1..bf0bd381508 100644 --- a/custom_ops/gpu_ops/moba_attn/moba_attn.cu +++ b/custom_ops/gpu_ops/moba_attn/moba_attn.cu @@ -15,6 +15,9 @@ #include "paddle/extension.h" #include "moba_attn.h" +#ifndef PD_BUILD_STATIC_OP +#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) +#endif std::vector MobaAttention( const paddle::Tensor& qkv, @@ -272,7 +275,7 @@ std::vector MobaAttention( } -PD_BUILD_OP(moba_attention) +PD_BUILD_STATIC_OP(moba_attention) .Inputs({ "qkv", "q_input", diff --git a/custom_ops/gpu_ops/moba_attn/moba_decoder_attn/moba_qk_sort_decoder.cu b/custom_ops/gpu_ops/moba_attn/moba_decoder_attn/moba_qk_sort_decoder.cu index 3575cf5b89d..8808533f88b 100644 --- a/custom_ops/gpu_ops/moba_attn/moba_decoder_attn/moba_qk_sort_decoder.cu +++ b/custom_ops/gpu_ops/moba_attn/moba_decoder_attn/moba_qk_sort_decoder.cu @@ -16,6 +16,9 @@ #include "moba_attn/moba_attn_utils.hpp" #include "moba_attn/moba_attn.h" +#ifndef PD_BUILD_STATIC_OP +#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) +#endif template __global__ void qk_gate_sort_decoder_kernel( @@ -221,7 +224,7 @@ std::vector QkSortDecoder( } } -PD_BUILD_OP(moba_qk_sort_decoder) +PD_BUILD_STATIC_OP(moba_qk_sort_decoder) .Inputs({ "qk_gate_weight", "seq_len_encoder", diff --git a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_attn.cu b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_attn.cu index 29d6564ff62..d4df8b27e2a 100644 --- a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_attn.cu +++ b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_attn.cu @@ -27,6 +27,10 @@ #include "softmax.hpp" #include "cutlass/arch/reg_reconfig.h" +#ifndef PD_BUILD_STATIC_OP +#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) +#endif + template auto get_gmem_layout(int token_num, int head_num) { return make_layout( @@ -360,7 +364,7 @@ void MobaEncoderAttn( } -PD_BUILD_OP(moba_encoder_attn) +PD_BUILD_STATIC_OP(moba_encoder_attn) .Inputs({ "q_input", "k_input", diff --git a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_write_cache.cu b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_write_cache.cu index 7ced46a49ff..89666c51f3b 100644 --- a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_write_cache.cu +++ b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_write_cache.cu @@ -15,6 +15,9 @@ #include "paddle/extension.h" #include "moba_attn/moba_attn.h" +#ifndef PD_BUILD_STATIC_OP +#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) +#endif template __global__ void write_encoder_cachekv_c16( @@ -135,7 +138,7 @@ void MobaEncoderAttnWriteCacheKv( } } -PD_BUILD_OP(moba_encoder_attn_write_cache_kv) +PD_BUILD_STATIC_OP(moba_encoder_attn_write_cache_kv) .Inputs({ "k_input", "v_input", diff --git a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_qk_sort_encoder.cu b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_qk_sort_encoder.cu index 2b190ede57e..dd044551d4c 100644 --- a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_qk_sort_encoder.cu +++ b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_qk_sort_encoder.cu @@ -15,6 +15,10 @@ #include "paddle/extension.h" #include "moba_attn/moba_attn_utils.hpp" +#ifndef PD_BUILD_STATIC_OP +#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) +#endif + template __global__ void qk_gate_sort_encoder_kernel( const T* qk_gate_weight, @@ -320,7 +324,7 @@ std::vector QkSortEncoder( } } -PD_BUILD_OP(moba_qk_sort_encoder) +PD_BUILD_STATIC_OP(moba_qk_sort_encoder) .Inputs({ "qk_gate_weight", "seq_len_encoder", diff --git a/custom_ops/gpu_ops/moba_attn/moba_process/moba_get_kv_from_cache.cu b/custom_ops/gpu_ops/moba_attn/moba_process/moba_get_kv_from_cache.cu index 9aff2a3d54a..66228342c70 100644 --- a/custom_ops/gpu_ops/moba_attn/moba_process/moba_get_kv_from_cache.cu +++ b/custom_ops/gpu_ops/moba_attn/moba_process/moba_get_kv_from_cache.cu @@ -16,6 +16,10 @@ #include "moba_attn/moba_attn_utils.hpp" #include "moba_attn/moba_attn.h" +#ifndef PD_BUILD_STATIC_OP +#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) +#endif + template __global__ void get_kv_from_cache_c16_kernel( T * k_input, @@ -251,7 +255,7 @@ std::vector GetCurCuSeqLenk( return {cu_seq_q_pack, cu_seqlens_k, q_pack_tokens_cpu}; } -PD_BUILD_OP(get_kv_from_cache) +PD_BUILD_STATIC_OP(get_kv_from_cache) .Inputs({ "k_input", "v_input", @@ -277,7 +281,7 @@ PD_BUILD_OP(get_kv_from_cache) {"v_input", "v_input_out"}}) .SetKernelFn(PD_KERNEL(GetKVFromCache)); -PD_BUILD_OP(get_cur_cu_seq_len_k) +PD_BUILD_STATIC_OP(get_cur_cu_seq_len_k) .Inputs({ "seq_lens_encoder", "seq_lens_decoder", diff --git a/custom_ops/gpu_ops/moba_attn/moba_process/moba_mlp_einsum.cu b/custom_ops/gpu_ops/moba_attn/moba_process/moba_mlp_einsum.cu index 2354f410638..4d740290a57 100644 --- a/custom_ops/gpu_ops/moba_attn/moba_process/moba_mlp_einsum.cu +++ b/custom_ops/gpu_ops/moba_attn/moba_process/moba_mlp_einsum.cu @@ -16,6 +16,9 @@ #include "moba_attn/moba_attn_utils.hpp" #include "moba_attn/moba_attn.h" +#ifndef PD_BUILD_STATIC_OP +#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) +#endif template __global__ void moba_mlp_einsum_kernel( @@ -207,7 +210,7 @@ std::vector MobaMlpEinsum( return {k_gate_weight}; } -PD_BUILD_OP(moba_mlp_einsum) +PD_BUILD_STATIC_OP(moba_mlp_einsum) .Inputs({ "k_input", "attn_gate_weight", diff --git a/custom_ops/gpu_ops/moba_attn/moba_process/moba_qk_gemm.cu b/custom_ops/gpu_ops/moba_attn/moba_process/moba_qk_gemm.cu index 96409592a34..2662fb26bac 100644 --- a/custom_ops/gpu_ops/moba_attn/moba_process/moba_qk_gemm.cu +++ b/custom_ops/gpu_ops/moba_attn/moba_process/moba_qk_gemm.cu @@ -25,6 +25,10 @@ #include "cutlass/cluster_launch.hpp" #include "cutlass/arch/reg_reconfig.h" +#ifndef PD_BUILD_STATIC_OP +#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) +#endif + template __global__ void qk_gemm_kernel( const input_type *q_input, @@ -446,7 +450,7 @@ std::vector MobaQKGemm( } } -PD_BUILD_OP(moba_qk_gemm) +PD_BUILD_STATIC_OP(moba_qk_gemm) .Inputs({ "q_input", "k_block_means", diff --git a/custom_ops/gpu_ops/moba_attn/moba_process/split_qkv_and_rope.cu b/custom_ops/gpu_ops/moba_attn/moba_process/split_qkv_and_rope.cu index 3957ba8e880..c1fe07b3983 100644 --- a/custom_ops/gpu_ops/moba_attn/moba_process/split_qkv_and_rope.cu +++ b/custom_ops/gpu_ops/moba_attn/moba_process/split_qkv_and_rope.cu @@ -16,6 +16,10 @@ #include "moba_attn/moba_attn_utils.hpp" #include "moba_attn/moba_attn.h" +#ifndef PD_BUILD_STATIC_OP +#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) +#endif + template __global__ void fused_block_mean_and_rope_kernel( const input_type *qkv_input, @@ -341,7 +345,7 @@ void FusedBlockMeanAndRope( -PD_BUILD_OP(fused_block_mean_and_rope) +PD_BUILD_STATIC_OP(fused_block_mean_and_rope) .Inputs({ "qkv_out", "k_block_means", diff --git a/custom_ops/gpu_ops/stop_generation_multi_ends.cu b/custom_ops/gpu_ops/stop_generation_multi_ends.cu index fe82be207ff..3165be10a79 100644 --- a/custom_ops/gpu_ops/stop_generation_multi_ends.cu +++ b/custom_ops/gpu_ops/stop_generation_multi_ends.cu @@ -64,6 +64,8 @@ __global__ void set_value_by_flags(bool *stop_flags, } if (!beam_search && is_in_end(topk_ids[bid], end_ids, end_length)) { stop_flags[bid] = true; + topk_ids[bid] = end_ids[0]; + next_tokens[bid] = end_ids[0]; } } // dealing stop_seqs diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index a5c317f80bb..ab15391a88e 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -426,7 +426,6 @@ def _setting_environ_variables(self): "ENABLE_FASTDEPLOY_LOAD_MODEL_CONCURRENCY": 0, "LOAD_STATE_DICT_THREAD_NUM": len(self.cfg.parallel_config.device_ids.split(",")), "PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python", - "FLAGS_use_append_attn": 1, "NCCL_ALGO": "Ring", "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)), } From b5ff8c9a8fc67163e85cda7213b405699584deae Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Tue, 21 Oct 2025 20:06:55 +0800 Subject: [PATCH 2/2] paddleformers>=0.3.1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 32acf860de3..dd96c0e2200 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ tqdm pynvml uvicorn==0.29.0 fastapi -paddleformers>=0.2 +paddleformers>=0.3.1 redis etcd3 httpx