Merged
4 changes: 2 additions & 2 deletions build.sh
@@ -143,9 +143,9 @@ function build_and_install_ops() {
TMP_DIR_REAL_PATH=`readlink -f ${OPS_TMP_DIR}`
is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
if [ "$is_xpu" = "True" ]; then
cd xpu_ops/src
cd xpu_ops
bash build.sh ${TMP_DIR_REAL_PATH}
cd ../..
cd ..
elif [ "$FD_CPU_USE_BF16" == "true" ]; then
if [ "$FD_BUILDING_ARCS" == "" ]; then
FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
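For context, the branch above keys off Paddle's XPU detection; a minimal Python sketch of the check that build.sh shells out to (illustrative, not part of the diff):

```python
import paddle

# build.sh runs this via `$python -c ...`; when it prints "True",
# the script now descends into xpu_ops/ (instead of xpu_ops/src/)
# and runs its build.sh with the resolved temp dir.
print(paddle.is_compiled_with_xpu())
```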
2 changes: 1 addition & 1 deletion custom_ops/setup_ops.py
@@ -542,7 +542,7 @@ def find_end_files(directory, end_str):
include_package_data=True,
)
elif paddle.is_compiled_with_xpu():
assert False, "In XPU, we should use setup_ops.py in xpu_ops/src, not this."
assert False, "For XPU, please use setup_ops.py in the xpu_ops directory to compile custom ops."
elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
setup(
name="fastdeploy_ops",
File renamed without changes.
@@ -27,7 +27,7 @@
from paddle.utils.cpp_extension import CppExtension, setup

current_file = Path(__file__).resolve()
base_dir = current_file.parent
base_dir = os.path.join(current_file.parent, "src")


def build_plugin(CLANG_PATH, XRE_INC_DIR, XRE_LIB_DIR, XDNN_INC_DIR, XDNN_LIB_DIR):
@@ -136,33 +136,8 @@ def xpu_setup_ops():
# build plugin
build_plugin(CLANG_PATH, XRE_INC_PATH, XRE_LIB_DIR, XDNN_INC_PATH, XDNN_LIB_DIR)

ops = [
# custom ops
"./ops/save_with_output_msg.cc",
"./ops/stop_generation_multi_ends.cc",
"./ops/set_value_by_flags_and_idx.cc",
"./ops/get_token_penalty_multi_scores.cc",
"./ops/get_padding_offset.cc",
"./ops/update_inputs.cc",
"./ops/recover_decode_task.cc",
"./ops/update_inputs_v1.cc",
"./ops/get_output.cc",
"./ops/step.cc",
"./ops/get_infer_param.cc",
"./ops/adjust_batch.cc",
"./ops/gather_next_token.cc",
"./ops/block_attn.cc",
"./ops/moe_layer.cc",
"./ops/weight_quantize_xpu.cc",
# device manage ops
"./ops/device/get_context_gm_max_mem_demand.cc",
"./ops/device/get_free_global_memory.cc",
"./ops/device/get_total_global_memory.cc",
"./ops/device/get_used_global_memory.cc",
]
ops = [os.path.join(base_dir, op) for op in ops]

for root, dirs, files in os.walk(base_dir / "ops/mtp_ops"):
ops = []
for root, dirs, files in os.walk(os.path.join(base_dir, "ops")):
for file in files:
if file.endswith(".cc"):
ops.append(os.path.join(root, file))
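For reference, a self-contained sketch of the new collection logic above, which walks every .cc file under src/ops rather than maintaining a hard-coded list (paths are illustrative):

```python
import os
from pathlib import Path

# Mirror of the walk-based collection in xpu_ops/setup_ops.py:
# base_dir now points at <xpu_ops>/src, and all op sources are discovered.
base_dir = os.path.join(Path(__file__).resolve().parent, "src")

ops = []
for root, _dirs, files in os.walk(os.path.join(base_dir, "ops")):
    for file in files:
        if file.endswith(".cc"):
            ops.append(os.path.join(root, file))

print(f"collected {len(ops)} XPU op sources")
```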
225 changes: 225 additions & 0 deletions custom_ops/xpu_ops/src/ops/fused_rms_norm.cc
@@ -0,0 +1,225 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <infer_ops.h>
#include <functional>
#include "paddle/extension.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "utility/debug.h"
#include "utility/env.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

XPU_DECLARE_BOOL(ENABLE_XVLLM_SDNN_INFER, false);
namespace api = baidu::xpu::api;

template <typename T>
std::vector<paddle::Tensor> RmsNormKernel(
const paddle::Tensor& x,
const paddle::optional<paddle::Tensor>& bias,
const paddle::optional<paddle::Tensor>& residual,
const paddle::Tensor& norm_weight,
const paddle::optional<paddle::Tensor>& norm_bias,
const float epsilon,
const int begin_norm_axis,
const float quant_scale,
const int quant_round_type,
const float quant_max_bound,
const float quant_min_bound) {
using XPU_T = typename XPUTypeTrait<T>::Type;
phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);

int ret = -1;
auto x_shape = x.shape();
PD_CHECK(quant_scale <= 0, "Quantization is not supported");
PD_CHECK(begin_norm_axis > 0 && begin_norm_axis <= x_shape.size(),
"begin_norm_axis check fail");
PD_CHECK(norm_bias.get_ptr() == nullptr,
"rms norm kernel don't support norm_bias");

int64_t m = std::accumulate(x_shape.begin(),
x_shape.begin() + begin_norm_axis,
static_cast<int64_t>(1),
std::multiplies<int64_t>());
int64_t n = std::accumulate(x_shape.begin() + begin_norm_axis,
x_shape.end(),
static_cast<int64_t>(1),
std::multiplies<int64_t>());

PD_CHECK(n == norm_weight.shape()[0],
"The product from begin_norm_axis to the last axis of x must be "
"equal to the norm_weight's shape[0]");
if (bias.get_ptr()) {
PD_CHECK(n == bias.get_ptr()->shape()[0],
"The product from begin_norm_axis to the last axis of x must be "
"equal to the bias's shape[0]");
}

paddle::Tensor out = paddle::empty(x_shape, x.dtype(), x.place());
paddle::Tensor residual_out = paddle::empty(x_shape, x.dtype(), x.place());
const XPU_T* x_data = reinterpret_cast<const XPU_T*>(x.data<T>());
const XPU_T* norm_weight_data =
reinterpret_cast<const XPU_T*>(norm_weight.data<T>());
const XPU_T* bias_data =
bias.get_ptr() ? reinterpret_cast<const XPU_T*>(bias.get_ptr()->data<T>())
: nullptr;
const XPU_T* residual_data =
residual.get_ptr()
? reinterpret_cast<const XPU_T*>(residual.get_ptr()->data<T>())
: nullptr;
XPU_T* out_data = reinterpret_cast<XPU_T*>(const_cast<T*>(out.data<T>()));
XPU_T* residual_out_data = nullptr;
if (residual_data) {
residual_out_data =
reinterpret_cast<XPU_T*>(const_cast<T*>(residual_out.data<T>()));
}

XPU_T* add_out_data = const_cast<XPU_T*>(x_data);
if (bias_data) {
ret = api::broadcast_add(
xpu_ctx->x_context(), x_data, bias_data, out_data, {m, n}, {n});
PD_CHECK(ret == 0, "broadcast_add");
add_out_data = out_data;
}

bool use_sdnn = FLAGS_ENABLE_XVLLM_SDNN_INFER;
if (residual_data) {
ret = infer_ops::add_rms_layer_norm<XPU_T, XPU_T>(xpu_ctx->x_context(),
add_out_data,
residual_data,
out_data,
m,
n,
epsilon,
norm_weight_data,
nullptr,
nullptr,
residual_out_data,
nullptr,
use_sdnn);
PD_CHECK(ret == 0, "add_rms_layer_norm");
} else {
ret = api::rms_layer_norm<XPU_T, XPU_T>(xpu_ctx->x_context(),
add_out_data,
out_data,
m,
n,
epsilon,
norm_weight_data,
nullptr,
nullptr,
false);
PD_CHECK(ret == 0, "rms_layer_norm");
}

return {out, residual_out};
}

std::vector<paddle::Tensor> RmsNorm(
const paddle::Tensor& x,
const paddle::optional<paddle::Tensor>& bias,
const paddle::optional<paddle::Tensor>& residual,
const paddle::Tensor& norm_weight,
const paddle::optional<paddle::Tensor>& norm_bias,
const float epsilon,
const int begin_norm_axis,
const float quant_scale,
const int quant_round_type,
const float quant_max_bound,
const float quant_min_bound) {
const auto x_type = x.dtype();

#define APPLY_RMS_NORM_KERNEL(TX) \
return RmsNormKernel<TX>(x, \
bias, \
residual, \
norm_weight, \
norm_bias, \
epsilon, \
begin_norm_axis, \
quant_scale, \
quant_round_type, \
quant_max_bound, \
quant_min_bound);

if (x_type == paddle::DataType::BFLOAT16) {
APPLY_RMS_NORM_KERNEL(paddle::bfloat16);
} else if (x_type == paddle::DataType::FLOAT16) {
APPLY_RMS_NORM_KERNEL(paddle::float16);
} else if (x_type == paddle::DataType::FLOAT32) {
APPLY_RMS_NORM_KERNEL(float);
} else {
PD_THROW("RmsNorm not support x_type=", static_cast<int>(x_type));
return {};
}
#undef APPLY_RMS_NORM_KERNEL
}

std::vector<std::vector<int64_t>> RmsNormInferShape(
const std::vector<int64_t>& x_shape,
const paddle::optional<std::vector<int64_t>>& bias_shape,
const paddle::optional<std::vector<int64_t>>& residual_shape,
const std::vector<int64_t>& norm_weight_shape,
const paddle::optional<std::vector<int64_t>>& norm_bias_shape,
const float epsilon,
const int begin_norm_axis,
const float quant_scale,
const int quant_round_type,
const float quant_max_bound,
const float quant_min_bound) {
PD_CHECK(begin_norm_axis > 0 && begin_norm_axis <= x_shape.size(),
"begin_norm_axis check fail");
int64_t m = std::accumulate(x_shape.begin(),
x_shape.begin() + begin_norm_axis,
static_cast<int64_t>(1),
std::multiplies<int64_t>());
return {x_shape, x_shape, {m}};
}

std::vector<paddle::DataType> RmsNormInferDtype(
const paddle::DataType& x_dtype,
const paddle::optional<paddle::DataType>& bias_dtype,
const paddle::optional<paddle::DataType>& residual_dtype,
const paddle::DataType& norm_weight_dtype,
const paddle::optional<paddle::DataType>& norm_bias_dtype,
const float epsilon,
const int begin_norm_axis,
const float quant_scale,
const int quant_round_type,
const float quant_max_bound,
const float quant_min_bound) {
// out, residual_out
return {x_dtype, x_dtype};
}

PD_BUILD_STATIC_OP(fused_rms_norm_xpu)

Reviewer comment: Why does XPU need its own fused_rms_norm operator, and how does it differ from the GPU version? The kernel implementation may be hardware-specific, but the operator definition should be hardware-agnostic.

.Inputs({"x",
paddle::Optional("bias"),
paddle::Optional("residual"),
"norm_weight",
paddle::Optional("norm_bias")})
.Outputs({"out", "residul_out"})
.Attrs({"epsilon:float",
"begin_norm_axis:int",
"quant_scale:float",
"quant_round_type:int",
"quant_max_bound:float",
"quant_min_bound:float"})
.SetKernelFn(PD_KERNEL(RmsNorm))
.SetInferShapeFn(PD_INFER_SHAPE(RmsNormInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(RmsNormInferDtype));
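As a numerical reference only (not the XPU kernel), the op above performs an optional bias add, an optional residual add, and an RMS normalization over the axes from begin_norm_axis onward. A hedged NumPy sketch follows, assuming quantization is disabled (quant_scale <= 0) and that residual_out holds the pre-normalization sum, which is the usual contract for fused add-RMSNorm kernels:

```python
import numpy as np

def fused_rms_norm_reference(x, norm_weight, bias=None, residual=None,
                             epsilon=1e-6, begin_norm_axis=1):
    """Illustrative NumPy reference for fused_rms_norm_xpu (no quantization)."""
    shape = x.shape
    m = int(np.prod(shape[:begin_norm_axis]))   # rows to normalize
    n = int(np.prod(shape[begin_norm_axis:]))   # normalized size per row
    h = x.reshape(m, n).astype(np.float32)

    if bias is not None:
        h = h + bias.reshape(1, n)              # broadcast_add path
    residual_out = None
    if residual is not None:
        h = h + residual.reshape(m, n)          # add_rms_layer_norm path
        residual_out = h.reshape(shape)         # pre-normalization sum (assumed)

    rms = np.sqrt(np.mean(h * h, axis=-1, keepdims=True) + epsilon)
    out = (h / rms) * norm_weight.reshape(1, n)
    return out.reshape(shape), residual_out
```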
34 changes: 28 additions & 6 deletions custom_ops/xpu_ops/src/ops/get_output.cc
@@ -18,13 +18,35 @@
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/types.h>
#include "msg_utils.h"

#define MAX_BSZ 256
// #define GET_OUTPUT_DEBUG
struct msgdata {
long mtype;
int mtext[MAX_BSZ + 2]; // stop_flag, bsz, tokens
};
void GetOutputKVSignal(const paddle::Tensor& x,
int64_t rank_id,
bool wait_flag) {
int msg_queue_id = 1024 + rank_id;
static struct msgdatakv msg_rcv;
static key_t key = ftok("/opt/", msg_queue_id);
static int msgid = msgget(key, IPC_CREAT | 0666);

int* out_data = const_cast<int*>(x.data<int>());
int ret = -1;
if (!wait_flag) {
ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ * 3 + 2) * 4, 0, IPC_NOWAIT);
} else {
ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ * 3 + 2) * 4, 0, 0);
}
if (ret == -1) {
out_data[0] = -1;
out_data[1] = -1;
return;
}
int encoder_count = msg_rcv.mtext[0];

for (int i = 0; i < encoder_count * 3 + 2; i++) {
out_data[i] = msg_rcv.mtext[i];
}
return;
}

void GetOutput(const paddle::Tensor &x, int64_t rank_id, bool wait_flag,
int msg_queue_id) {
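To make the layout GetOutputKVSignal consumes easier to follow, here is a hedged Python sketch that unpacks the same int buffer; the names and the per-entry triple interpretation are assumptions read off the copy loop above, not a documented protocol:

```python
def decode_kv_signal(mtext):
    """Illustrative decode of the ints copied out by GetOutputKVSignal.

    mtext[0] is treated as encoder_count; the kernel copies
    encoder_count * 3 + 2 ints into the output tensor.
    """
    encoder_count = mtext[0]
    header = mtext[:2]                               # two leading control ints
    records = [tuple(mtext[2 + i * 3 : 5 + i * 3])   # one triple per entry
               for i in range(encoder_count)]
    return header, records
```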