From ec1f1eaa496d5bc216b37b3ac6930589e9cbd751 Mon Sep 17 00:00:00 2001
From: yuanlehome <yuanlehome@163.com>
Date: Tue, 21 Oct 2025 19:46:31 +0800
Subject: [PATCH 1/2] fix custom_ops

---
 custom_ops/cpu_ops/stop_generation_multi_ends.cc          | 6 ++++++
 custom_ops/gpu_ops/flash_mask_attn/flash_mask_attn.cu     | 6 +++++-
 custom_ops/gpu_ops/get_img_boundaries.cc                  | 6 +++++-
 custom_ops/gpu_ops/get_mm_split_fuse.cc                   | 6 +++++-
 custom_ops/gpu_ops/limit_thinking_content_length_v1.cu    | 2 +-
 custom_ops/gpu_ops/limit_thinking_content_length_v2.cu    | 2 +-
 custom_ops/gpu_ops/moba_attn/moba_attn.cu                 | 5 ++++-
 .../moba_attn/moba_decoder_attn/moba_qk_sort_decoder.cu   | 5 ++++-
 .../moba_attn/moba_encoder_attn/moba_encoder_attn.cu      | 6 +++++-
 .../moba_encoder_attn/moba_encoder_write_cache.cu         | 5 ++++-
 .../moba_attn/moba_encoder_attn/moba_qk_sort_encoder.cu   | 6 +++++-
 .../moba_attn/moba_process/moba_get_kv_from_cache.cu      | 8 ++++++--
 .../gpu_ops/moba_attn/moba_process/moba_mlp_einsum.cu     | 5 ++++-
 custom_ops/gpu_ops/moba_attn/moba_process/moba_qk_gemm.cu | 6 +++++-
 .../gpu_ops/moba_attn/moba_process/split_qkv_and_rope.cu  | 6 +++++-
 custom_ops/gpu_ops/stop_generation_multi_ends.cu          | 2 ++
 fastdeploy/engine/engine.py                               | 1 -
 17 files changed, 67 insertions(+), 16 deletions(-)
diff --git a/custom_ops/cpu_ops/stop_generation_multi_ends.cc b/custom_ops/cpu_ops/stop_generation_multi_ends.cc
index 7669cfa51d0..37f1f40c250 100644
--- a/custom_ops/cpu_ops/stop_generation_multi_ends.cc
+++ b/custom_ops/cpu_ops/stop_generation_multi_ends.cc
@@ -18,6 +18,10 @@
 #include <stdio.h>
 #include "paddle/extension.h"
 
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
+
 bool is_in_end(const int64_t id, const int64_t *end_ids, int length) {
     bool flag = false;
     for (int i = 0; i < length; i++) {
@@ -49,6 +53,8 @@ void set_value_by_flags(bool *stop_flags,
         }
         if (!beam_search && is_in_end(topk_ids[bi], end_ids, end_length)) {
             stop_flags[bi] = true;
+            topk_ids[bi] = end_ids[0];
+            next_tokens[bi] = end_ids[0];
         }
     }
 }
diff --git a/custom_ops/gpu_ops/flash_mask_attn/flash_mask_attn.cu b/custom_ops/gpu_ops/flash_mask_attn/flash_mask_attn.cu
index fc98742f819..3e40794f47d 100644
--- a/custom_ops/gpu_ops/flash_mask_attn/flash_mask_attn.cu
+++ b/custom_ops/gpu_ops/flash_mask_attn/flash_mask_attn.cu
@@ -20,6 +20,10 @@
 #include "kernel_traits.h"
 #include "flash_mask_attn_kernel.hpp"
 
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
+
 template <typename paddle_type>
 struct cuteType;
 
@@ -142,7 +146,7 @@ std::vector<paddle::Tensor> FlashAttentionMask(
 }
 
 
-PD_BUILD_OP(flash_attention_mask)
+PD_BUILD_STATIC_OP(flash_attention_mask)
     .Inputs({
         "q_input",
         "k_input",
diff --git a/custom_ops/gpu_ops/get_img_boundaries.cc b/custom_ops/gpu_ops/get_img_boundaries.cc
index 30ca6d2697b..2a5f5b5ff3f 100644
--- a/custom_ops/gpu_ops/get_img_boundaries.cc
+++ b/custom_ops/gpu_ops/get_img_boundaries.cc
@@ -14,6 +14,10 @@
 
 #include "paddle/extension.h"
 
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
+
 std::vector<paddle::Tensor> GetImgBoundaries(const paddle::Tensor& task_input_ids,
                                              const paddle::Tensor& grid_thw,
                                              const int64_t image_patch_id) {
@@ -53,7 +57,7 @@ std::vector<paddle::Tensor> GetImgBoundaries(const paddle::Tensor& task_input_id
     return {out};
 }
 
-PD_BUILD_OP(get_img_boundaries)
+PD_BUILD_STATIC_OP(get_img_boundaries)
     .Inputs({"task_input_ids", "grid_thw"})
     .Attrs({"image_patch_id: int64_t"})
     .Outputs({"img_boundaries"})
diff --git a/custom_ops/gpu_ops/get_mm_split_fuse.cc b/custom_ops/gpu_ops/get_mm_split_fuse.cc
index 3d70258d005..81531ba95e5 100644
--- a/custom_ops/gpu_ops/get_mm_split_fuse.cc
+++ b/custom_ops/gpu_ops/get_mm_split_fuse.cc
@@ -15,6 +15,10 @@
 #include "paddle/extension.h"
 #include <map>
 
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
+
 std::vector<paddle::Tensor> GetMmSplitFuse(const paddle::Tensor& task_input_ids,
                             const paddle::Tensor& task_image_type_ids,
                             const paddle::Tensor& task_input_ids_image_token_count,
@@ -133,7 +137,7 @@ std::vector<paddle::Tensor> GetMmSplitFuse(const paddle::Tensor& task_input_ids,
     return {image_chunk_selections_out, split_fuse_cur_seq_lens_out};
 }
 
-PD_BUILD_OP(get_mm_split_fuse)
+PD_BUILD_STATIC_OP(get_mm_split_fuse)
     .Inputs({"task_input_ids", "task_image_type_ids", "task_input_ids_image_token_count", "grid_thw"})
     .Attrs({"image_token_id: int64_t", "img_total: int64_t", "batch_idx: int", "seq_lens_origin: int", "split_fuse_img_size: int", "split_fuse_text_size: int", "max_chunk_token_size: int"})
     .Outputs({"image_chunk_selections", "split_fuse_cur_seq_lens"})
diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu
index d4c494b53a4..3ca280662fd 100644
--- a/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu
+++ b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu
@@ -80,7 +80,7 @@ void LimitThinkingContentLengthV1(const paddle::Tensor &next_tokens,
         batch_size);
 }
 
-PD_BUILD_OP(limit_thinking_content_length_v1)
+PD_BUILD_STATIC_OP(limit_thinking_content_length_v1)
     .Inputs({"next_tokens", "max_think_lens", "step_idx", "limit_think_status"})
     .Attrs({"think_end_id: int64_t"})
     .Outputs({"next_tokens_out"})
diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu
index a61dec8960d..f1da488faa3 100644
--- a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu
+++ b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu
@@ -103,7 +103,7 @@ void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens,
         batch_size);
 }
 
-PD_BUILD_OP(limit_thinking_content_length_v2)
+PD_BUILD_STATIC_OP(limit_thinking_content_length_v2)
     .Inputs({"next_tokens", "max_think_lens", "step_idx", "limit_think_status"})
     .Attrs({"think_end_id: int64_t", "line_break_id: int64_t"})
     .Outputs({"next_tokens_out"})
diff --git a/custom_ops/gpu_ops/moba_attn/moba_attn.cu b/custom_ops/gpu_ops/moba_attn/moba_attn.cu
index 2def647eef1..bf0bd381508 100644
--- a/custom_ops/gpu_ops/moba_attn/moba_attn.cu
+++ b/custom_ops/gpu_ops/moba_attn/moba_attn.cu
@@ -15,6 +15,9 @@
 #include "paddle/extension.h"
 #include "moba_attn.h"
 
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
 
 std::vector<paddle::Tensor> MobaAttention(
         const paddle::Tensor& qkv,
@@ -272,7 +275,7 @@ std::vector<paddle::Tensor> MobaAttention(
 }
 
 
-PD_BUILD_OP(moba_attention)
+PD_BUILD_STATIC_OP(moba_attention)
     .Inputs({
         "qkv",
         "q_input",
diff --git a/custom_ops/gpu_ops/moba_attn/moba_decoder_attn/moba_qk_sort_decoder.cu b/custom_ops/gpu_ops/moba_attn/moba_decoder_attn/moba_qk_sort_decoder.cu
index 3575cf5b89d..8808533f88b 100644
--- a/custom_ops/gpu_ops/moba_attn/moba_decoder_attn/moba_qk_sort_decoder.cu
+++ b/custom_ops/gpu_ops/moba_attn/moba_decoder_attn/moba_qk_sort_decoder.cu
@@ -16,6 +16,9 @@
 #include "moba_attn/moba_attn_utils.hpp"
 #include "moba_attn/moba_attn.h"
 
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
 
 template <typename T, int knthreads, int moba_block_size, int kBlockMaxN, int searchtimes>
 __global__ void qk_gate_sort_decoder_kernel(
@@ -221,7 +224,7 @@ std::vector<paddle::Tensor> QkSortDecoder(
     }
 }
 
-PD_BUILD_OP(moba_qk_sort_decoder)
+PD_BUILD_STATIC_OP(moba_qk_sort_decoder)
     .Inputs({
         "qk_gate_weight",
         "seq_len_encoder",
diff --git a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_attn.cu b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_attn.cu
index 29d6564ff62..d4df8b27e2a 100644
--- a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_attn.cu
+++ b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_attn.cu
@@ -27,6 +27,10 @@
 #include "softmax.hpp"
 #include "cutlass/arch/reg_reconfig.h"
 
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
+
 template <int kHeadDim>
 auto get_gmem_layout(int token_num, int head_num) {
     return make_layout(
@@ -360,7 +364,7 @@ void MobaEncoderAttn(
 }
 
 
-PD_BUILD_OP(moba_encoder_attn)
+PD_BUILD_STATIC_OP(moba_encoder_attn)
     .Inputs({
         "q_input",
         "k_input",
diff --git a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_write_cache.cu b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_write_cache.cu
index 7ced46a49ff..89666c51f3b 100644
--- a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_write_cache.cu
+++ b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_write_cache.cu
@@ -15,6 +15,9 @@
 #include "paddle/extension.h"
 #include "moba_attn/moba_attn.h"
 
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
 
 template <typename T, int kBlockSize, int kHeadDim>
 __global__ void write_encoder_cachekv_c16(
@@ -135,7 +138,7 @@ void MobaEncoderAttnWriteCacheKv(
     }
 }
 
-PD_BUILD_OP(moba_encoder_attn_write_cache_kv)
+PD_BUILD_STATIC_OP(moba_encoder_attn_write_cache_kv)
     .Inputs({
         "k_input",
         "v_input",
diff --git a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_qk_sort_encoder.cu b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_qk_sort_encoder.cu
index 2b190ede57e..dd044551d4c 100644
--- a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_qk_sort_encoder.cu
+++ b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_qk_sort_encoder.cu
@@ -15,6 +15,10 @@
 #include "paddle/extension.h"
 #include "moba_attn/moba_attn_utils.hpp"
 
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
+
 template <typename T, int knthreads, int moba_block_size, int kBlockM, int kBlockMaxN, int searchtimes>
 __global__ void qk_gate_sort_encoder_kernel(
         const T* qk_gate_weight,
@@ -320,7 +324,7 @@ std::vector<paddle::Tensor> QkSortEncoder(
     }
 }
 
-PD_BUILD_OP(moba_qk_sort_encoder)
+PD_BUILD_STATIC_OP(moba_qk_sort_encoder)
     .Inputs({
         "qk_gate_weight",
         "seq_len_encoder",
diff --git a/custom_ops/gpu_ops/moba_attn/moba_process/moba_get_kv_from_cache.cu b/custom_ops/gpu_ops/moba_attn/moba_process/moba_get_kv_from_cache.cu
index 9aff2a3d54a..66228342c70 100644
--- a/custom_ops/gpu_ops/moba_attn/moba_process/moba_get_kv_from_cache.cu
+++ b/custom_ops/gpu_ops/moba_attn/moba_process/moba_get_kv_from_cache.cu
@@ -16,6 +16,10 @@
 #include "moba_attn/moba_attn_utils.hpp"
 #include "moba_attn/moba_attn.h"
 
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
+
 template <typename T, int kBlockSize, int kHeadDim>
 __global__ void get_kv_from_cache_c16_kernel(
         T * k_input,
@@ -251,7 +255,7 @@ std::vector<paddle::Tensor> GetCurCuSeqLenk(
     return {cu_seq_q_pack, cu_seqlens_k, q_pack_tokens_cpu};
 }
 
-PD_BUILD_OP(get_kv_from_cache)
+PD_BUILD_STATIC_OP(get_kv_from_cache)
     .Inputs({
         "k_input",
         "v_input",
@@ -277,7 +281,7 @@ PD_BUILD_OP(get_kv_from_cache)
                     {"v_input", "v_input_out"}})
     .SetKernelFn(PD_KERNEL(GetKVFromCache));
 
-PD_BUILD_OP(get_cur_cu_seq_len_k)
+PD_BUILD_STATIC_OP(get_cur_cu_seq_len_k)
     .Inputs({
             "seq_lens_encoder",
             "seq_lens_decoder",
diff --git a/custom_ops/gpu_ops/moba_attn/moba_process/moba_mlp_einsum.cu b/custom_ops/gpu_ops/moba_attn/moba_process/moba_mlp_einsum.cu
index 2354f410638..4d740290a57 100644
--- a/custom_ops/gpu_ops/moba_attn/moba_process/moba_mlp_einsum.cu
+++ b/custom_ops/gpu_ops/moba_attn/moba_process/moba_mlp_einsum.cu
@@ -16,6 +16,9 @@
 #include "moba_attn/moba_attn_utils.hpp"
 #include "moba_attn/moba_attn.h"
 
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
 
 template <typename T, int moba_block_size, int kHeadDim, int kMaxN>
 __global__ void moba_mlp_einsum_kernel(
@@ -207,7 +210,7 @@ std::vector<paddle::Tensor> MobaMlpEinsum(
     return {k_gate_weight};
 }
 
-PD_BUILD_OP(moba_mlp_einsum)
+PD_BUILD_STATIC_OP(moba_mlp_einsum)
     .Inputs({
         "k_input",
         "attn_gate_weight",
diff --git a/custom_ops/gpu_ops/moba_attn/moba_process/moba_qk_gemm.cu b/custom_ops/gpu_ops/moba_attn/moba_process/moba_qk_gemm.cu
index 96409592a34..2662fb26bac 100644
--- a/custom_ops/gpu_ops/moba_attn/moba_process/moba_qk_gemm.cu
+++ b/custom_ops/gpu_ops/moba_attn/moba_process/moba_qk_gemm.cu
@@ -25,6 +25,10 @@
 #include "cutlass/cluster_launch.hpp"
 #include "cutlass/arch/reg_reconfig.h"
 
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
+
 template <typename input_type, int kBlockM, int kBlockN, int kMobaBlockSize, int kMaxN, int kHeadDim, bool is_split_kv>
 __global__ void qk_gemm_kernel(
         const input_type *q_input,
@@ -446,7 +450,7 @@ std::vector<paddle::Tensor> MobaQKGemm(
     }
 }
 
-PD_BUILD_OP(moba_qk_gemm)
+PD_BUILD_STATIC_OP(moba_qk_gemm)
     .Inputs({
         "q_input",
         "k_block_means",
diff --git a/custom_ops/gpu_ops/moba_attn/moba_process/split_qkv_and_rope.cu b/custom_ops/gpu_ops/moba_attn/moba_process/split_qkv_and_rope.cu
index 3957ba8e880..c1fe07b3983 100644
--- a/custom_ops/gpu_ops/moba_attn/moba_process/split_qkv_and_rope.cu
+++ b/custom_ops/gpu_ops/moba_attn/moba_process/split_qkv_and_rope.cu
@@ -16,6 +16,10 @@
 #include "moba_attn/moba_attn_utils.hpp"
 #include "moba_attn/moba_attn.h"
 
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
+
 template <typename input_type, int moba_block_size, int kBlockM, int kMaxN, int tokens_per_block, bool need_k_mean>
 __global__ void fused_block_mean_and_rope_kernel(
         const input_type *qkv_input,
@@ -341,7 +345,7 @@ void FusedBlockMeanAndRope(
 
 
 
-PD_BUILD_OP(fused_block_mean_and_rope)
+PD_BUILD_STATIC_OP(fused_block_mean_and_rope)
     .Inputs({
         "qkv_out",
         "k_block_means",
diff --git a/custom_ops/gpu_ops/stop_generation_multi_ends.cu b/custom_ops/gpu_ops/stop_generation_multi_ends.cu
index fe82be207ff..3165be10a79 100644
--- a/custom_ops/gpu_ops/stop_generation_multi_ends.cu
+++ b/custom_ops/gpu_ops/stop_generation_multi_ends.cu
@@ -64,6 +64,8 @@ __global__ void set_value_by_flags(bool *stop_flags,
             }
             if (!beam_search && is_in_end(topk_ids[bid], end_ids, end_length)) {
                 stop_flags[bid] = true;
+                topk_ids[bid] = end_ids[0];
+                next_tokens[bid] = end_ids[0];
             }
         }
         // dealing stop_seqs
diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py
index a5c317f80bb..ab15391a88e 100644
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -426,7 +426,6 @@ def _setting_environ_variables(self):
             "ENABLE_FASTDEPLOY_LOAD_MODEL_CONCURRENCY": 0,
             "LOAD_STATE_DICT_THREAD_NUM": len(self.cfg.parallel_config.device_ids.split(",")),
             "PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python",
-            "FLAGS_use_append_attn": 1,
             "NCCL_ALGO": "Ring",
             "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)),
         }

From b5ff8c9a8fc67163e85cda7213b405699584deae Mon Sep 17 00:00:00 2001
From: yuanlehome <yuanlehome@163.com>
Date: Tue, 21 Oct 2025 20:06:55 +0800
Subject: [PATCH 2/2] paddleformers>=0.3.1

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 32acf860de3..dd96c0e2200 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,7 @@ tqdm
 pynvml
 uvicorn==0.29.0
 fastapi
-paddleformers>=0.2
+paddleformers>=0.3.1
 redis
 etcd3
 httpx