Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions custom_ops/cpu_ops/stop_generation_multi_ends.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
#include <stdio.h>
#include "paddle/extension.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

bool is_in_end(const int64_t id, const int64_t *end_ids, int length) {
bool flag = false;
for (int i = 0; i < length; i++) {
Expand Down Expand Up @@ -49,6 +53,8 @@ void set_value_by_flags(bool *stop_flags,
}
if (!beam_search && is_in_end(topk_ids[bi], end_ids, end_length)) {
stop_flags[bi] = true;
topk_ids[bi] = end_ids[0];
next_tokens[bi] = end_ids[0];
}
}
}
Expand Down
6 changes: 5 additions & 1 deletion custom_ops/gpu_ops/flash_mask_attn/flash_mask_attn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
#include "kernel_traits.h"
#include "flash_mask_attn_kernel.hpp"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

template <typename paddle_type>
struct cuteType;

Expand Down Expand Up @@ -142,7 +146,7 @@ std::vector<paddle::Tensor> FlashAttentionMask(
}


PD_BUILD_OP(flash_attention_mask)
PD_BUILD_STATIC_OP(flash_attention_mask)
.Inputs({
"q_input",
"k_input",
Expand Down
6 changes: 5 additions & 1 deletion custom_ops/gpu_ops/get_img_boundaries.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@

#include "paddle/extension.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

std::vector<paddle::Tensor> GetImgBoundaries(const paddle::Tensor& task_input_ids,
const paddle::Tensor& grid_thw,
const int64_t image_patch_id) {
Expand Down Expand Up @@ -53,7 +57,7 @@ std::vector<paddle::Tensor> GetImgBoundaries(const paddle::Tensor& task_input_id
return {out};
}

PD_BUILD_OP(get_img_boundaries)
PD_BUILD_STATIC_OP(get_img_boundaries)
.Inputs({"task_input_ids", "grid_thw"})
.Attrs({"image_patch_id: int64_t"})
.Outputs({"img_boundaries"})
Expand Down
6 changes: 5 additions & 1 deletion custom_ops/gpu_ops/get_mm_split_fuse.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
#include "paddle/extension.h"
#include <map>

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

std::vector<paddle::Tensor> GetMmSplitFuse(const paddle::Tensor& task_input_ids,
const paddle::Tensor& task_image_type_ids,
const paddle::Tensor& task_input_ids_image_token_count,
Expand Down Expand Up @@ -133,7 +137,7 @@ std::vector<paddle::Tensor> GetMmSplitFuse(const paddle::Tensor& task_input_ids,
return {image_chunk_selections_out, split_fuse_cur_seq_lens_out};
}

PD_BUILD_OP(get_mm_split_fuse)
PD_BUILD_STATIC_OP(get_mm_split_fuse)
.Inputs({"task_input_ids", "task_image_type_ids", "task_input_ids_image_token_count", "grid_thw"})
.Attrs({"image_token_id: int64_t", "img_total: int64_t", "batch_idx: int", "seq_lens_origin: int", "split_fuse_img_size: int", "split_fuse_text_size: int", "max_chunk_token_size: int"})
.Outputs({"image_chunk_selections", "split_fuse_cur_seq_lens"})
Expand Down
2 changes: 1 addition & 1 deletion custom_ops/gpu_ops/limit_thinking_content_length_v1.cu
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ void LimitThinkingContentLengthV1(const paddle::Tensor &next_tokens,
batch_size);
}

PD_BUILD_OP(limit_thinking_content_length_v1)
PD_BUILD_STATIC_OP(limit_thinking_content_length_v1)
.Inputs({"next_tokens", "max_think_lens", "step_idx", "limit_think_status"})
.Attrs({"think_end_id: int64_t"})
.Outputs({"next_tokens_out"})
Expand Down
2 changes: 1 addition & 1 deletion custom_ops/gpu_ops/limit_thinking_content_length_v2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens,
batch_size);
}

PD_BUILD_OP(limit_thinking_content_length_v2)
PD_BUILD_STATIC_OP(limit_thinking_content_length_v2)
.Inputs({"next_tokens", "max_think_lens", "step_idx", "limit_think_status"})
.Attrs({"think_end_id: int64_t", "line_break_id: int64_t"})
.Outputs({"next_tokens_out"})
Expand Down
5 changes: 4 additions & 1 deletion custom_ops/gpu_ops/moba_attn/moba_attn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
#include "paddle/extension.h"
#include "moba_attn.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

std::vector<paddle::Tensor> MobaAttention(
const paddle::Tensor& qkv,
Expand Down Expand Up @@ -272,7 +275,7 @@ std::vector<paddle::Tensor> MobaAttention(
}


PD_BUILD_OP(moba_attention)
PD_BUILD_STATIC_OP(moba_attention)
.Inputs({
"qkv",
"q_input",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
#include "moba_attn/moba_attn_utils.hpp"
#include "moba_attn/moba_attn.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

template <typename T, int knthreads, int moba_block_size, int kBlockMaxN, int searchtimes>
__global__ void qk_gate_sort_decoder_kernel(
Expand Down Expand Up @@ -221,7 +224,7 @@ std::vector<paddle::Tensor> QkSortDecoder(
}
}

PD_BUILD_OP(moba_qk_sort_decoder)
PD_BUILD_STATIC_OP(moba_qk_sort_decoder)
.Inputs({
"qk_gate_weight",
"seq_len_encoder",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@
#include "softmax.hpp"
#include "cutlass/arch/reg_reconfig.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

template <int kHeadDim>
auto get_gmem_layout(int token_num, int head_num) {
return make_layout(
Expand Down Expand Up @@ -360,7 +364,7 @@ void MobaEncoderAttn(
}


PD_BUILD_OP(moba_encoder_attn)
PD_BUILD_STATIC_OP(moba_encoder_attn)
.Inputs({
"q_input",
"k_input",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
#include "paddle/extension.h"
#include "moba_attn/moba_attn.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

template <typename T, int kBlockSize, int kHeadDim>
__global__ void write_encoder_cachekv_c16(
Expand Down Expand Up @@ -135,7 +138,7 @@ void MobaEncoderAttnWriteCacheKv(
}
}

PD_BUILD_OP(moba_encoder_attn_write_cache_kv)
PD_BUILD_STATIC_OP(moba_encoder_attn_write_cache_kv)
.Inputs({
"k_input",
"v_input",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
#include "paddle/extension.h"
#include "moba_attn/moba_attn_utils.hpp"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

template <typename T, int knthreads, int moba_block_size, int kBlockM, int kBlockMaxN, int searchtimes>
__global__ void qk_gate_sort_encoder_kernel(
const T* qk_gate_weight,
Expand Down Expand Up @@ -320,7 +324,7 @@ std::vector<paddle::Tensor> QkSortEncoder(
}
}

PD_BUILD_OP(moba_qk_sort_encoder)
PD_BUILD_STATIC_OP(moba_qk_sort_encoder)
.Inputs({
"qk_gate_weight",
"seq_len_encoder",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
#include "moba_attn/moba_attn_utils.hpp"
#include "moba_attn/moba_attn.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

template <typename T, int kBlockSize, int kHeadDim>
__global__ void get_kv_from_cache_c16_kernel(
T * k_input,
Expand Down Expand Up @@ -251,7 +255,7 @@ std::vector<paddle::Tensor> GetCurCuSeqLenk(
return {cu_seq_q_pack, cu_seqlens_k, q_pack_tokens_cpu};
}

PD_BUILD_OP(get_kv_from_cache)
PD_BUILD_STATIC_OP(get_kv_from_cache)
.Inputs({
"k_input",
"v_input",
Expand All @@ -277,7 +281,7 @@ PD_BUILD_OP(get_kv_from_cache)
{"v_input", "v_input_out"}})
.SetKernelFn(PD_KERNEL(GetKVFromCache));

PD_BUILD_OP(get_cur_cu_seq_len_k)
PD_BUILD_STATIC_OP(get_cur_cu_seq_len_k)
.Inputs({
"seq_lens_encoder",
"seq_lens_decoder",
Expand Down
5 changes: 4 additions & 1 deletion custom_ops/gpu_ops/moba_attn/moba_process/moba_mlp_einsum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
#include "moba_attn/moba_attn_utils.hpp"
#include "moba_attn/moba_attn.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

template <typename T, int moba_block_size, int kHeadDim, int kMaxN>
__global__ void moba_mlp_einsum_kernel(
Expand Down Expand Up @@ -207,7 +210,7 @@ std::vector<paddle::Tensor> MobaMlpEinsum(
return {k_gate_weight};
}

PD_BUILD_OP(moba_mlp_einsum)
PD_BUILD_STATIC_OP(moba_mlp_einsum)
.Inputs({
"k_input",
"attn_gate_weight",
Expand Down
6 changes: 5 additions & 1 deletion custom_ops/gpu_ops/moba_attn/moba_process/moba_qk_gemm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@
#include "cutlass/cluster_launch.hpp"
#include "cutlass/arch/reg_reconfig.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

template <typename input_type, int kBlockM, int kBlockN, int kMobaBlockSize, int kMaxN, int kHeadDim, bool is_split_kv>
__global__ void qk_gemm_kernel(
const input_type *q_input,
Expand Down Expand Up @@ -446,7 +450,7 @@ std::vector<paddle::Tensor> MobaQKGemm(
}
}

PD_BUILD_OP(moba_qk_gemm)
PD_BUILD_STATIC_OP(moba_qk_gemm)
.Inputs({
"q_input",
"k_block_means",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
#include "moba_attn/moba_attn_utils.hpp"
#include "moba_attn/moba_attn.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

template <typename input_type, int moba_block_size, int kBlockM, int kMaxN, int tokens_per_block, bool need_k_mean>
__global__ void fused_block_mean_and_rope_kernel(
const input_type *qkv_input,
Expand Down Expand Up @@ -341,7 +345,7 @@ void FusedBlockMeanAndRope(



PD_BUILD_OP(fused_block_mean_and_rope)
PD_BUILD_STATIC_OP(fused_block_mean_and_rope)
.Inputs({
"qkv_out",
"k_block_means",
Expand Down
2 changes: 2 additions & 0 deletions custom_ops/gpu_ops/stop_generation_multi_ends.cu
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ __global__ void set_value_by_flags(bool *stop_flags,
}
if (!beam_search && is_in_end(topk_ids[bid], end_ids, end_length)) {
stop_flags[bid] = true;
topk_ids[bid] = end_ids[0];
next_tokens[bid] = end_ids[0];
}
}
// dealing stop_seqs
Expand Down
1 change: 0 additions & 1 deletion fastdeploy/engine/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,6 @@ def _setting_environ_variables(self):
"ENABLE_FASTDEPLOY_LOAD_MODEL_CONCURRENCY": 0,
"LOAD_STATE_DICT_THREAD_NUM": len(self.cfg.parallel_config.device_ids.split(",")),
"PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python",
"FLAGS_use_append_attn": 1,
"NCCL_ALGO": "Ring",
"FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)),
}
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ tqdm
pynvml
uvicorn==0.29.0
fastapi
paddleformers>=0.2
paddleformers>=0.3.1
redis
etcd3
httpx
Expand Down
Loading