Merged
4 changes: 2 additions & 2 deletions build.sh
@@ -143,9 +143,9 @@ function build_and_install_ops() {
TMP_DIR_REAL_PATH=`readlink -f ${OPS_TMP_DIR}`
is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
if [ "$is_xpu" = "True" ]; then
cd xpu_ops/src
cd xpu_ops
bash build.sh ${TMP_DIR_REAL_PATH}
cd ../..
cd ..
elif [ "$FD_CPU_USE_BF16" == "true" ]; then
if [ "$FD_BUILDING_ARCS" == "" ]; then
FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
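For context, the branch above keys off Paddle's XPU detection; a minimal Python sketch of the check that build.sh shells out to (illustrative, not part of the diff):

```python
import paddle

# build.sh runs this via `$python -c ...`; when it prints "True",
# the script now descends into xpu_ops/ (instead of xpu_ops/src/)
# and runs its build.sh with the resolved temp dir.
print(paddle.is_compiled_with_xpu())
```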
2 changes: 1 addition & 1 deletion custom_ops/setup_ops.py
@@ -542,7 +542,7 @@ def find_end_files(directory, end_str):
include_package_data=True,
)
elif paddle.is_compiled_with_xpu():
assert False, "In XPU, we should use setup_ops.py in xpu_ops/src, not this."
assert False, "For XPU, please use setup_ops.py in the xpu_ops directory to compile custom ops."
elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
setup(
name="fastdeploy_ops",
File renamed without changes.
@@ -27,7 +27,7 @@
from paddle.utils.cpp_extension import CppExtension, setup

current_file = Path(__file__).resolve()
base_dir = current_file.parent
base_dir = os.path.join(current_file.parent, "src")


def build_plugin(CLANG_PATH, XRE_INC_DIR, XRE_LIB_DIR, XDNN_INC_DIR, XDNN_LIB_DIR):
@@ -136,33 +136,8 @@ def xpu_setup_ops():
# build plugin
build_plugin(CLANG_PATH, XRE_INC_PATH, XRE_LIB_DIR, XDNN_INC_PATH, XDNN_LIB_DIR)

ops = [
# custom ops
"./ops/save_with_output_msg.cc",
"./ops/stop_generation_multi_ends.cc",
"./ops/set_value_by_flags_and_idx.cc",
"./ops/get_token_penalty_multi_scores.cc",
"./ops/get_padding_offset.cc",
"./ops/update_inputs.cc",
"./ops/recover_decode_task.cc",
"./ops/update_inputs_v1.cc",
"./ops/get_output.cc",
"./ops/step.cc",
"./ops/get_infer_param.cc",
"./ops/adjust_batch.cc",
"./ops/gather_next_token.cc",
"./ops/block_attn.cc",
"./ops/moe_layer.cc",
"./ops/weight_quantize_xpu.cc",
# device manage ops
"./ops/device/get_context_gm_max_mem_demand.cc",
"./ops/device/get_free_global_memory.cc",
"./ops/device/get_total_global_memory.cc",
"./ops/device/get_used_global_memory.cc",
]
ops = [os.path.join(base_dir, op) for op in ops]

for root, dirs, files in os.walk(base_dir / "ops/mtp_ops"):
ops = []
for root, dirs, files in os.walk(os.path.join(base_dir, "ops")):
for file in files:
if file.endswith(".cc"):
ops.append(os.path.join(root, file))
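For reference, a self-contained sketch of the new collection logic above, which walks every .cc file under src/ops rather than maintaining a hard-coded list (paths are illustrative):

```python
import os
from pathlib import Path

# Mirror of the walk-based collection in xpu_ops/setup_ops.py:
# base_dir now points at <xpu_ops>/src, and all op sources are discovered.
base_dir = os.path.join(Path(__file__).resolve().parent, "src")

ops = []
for root, _dirs, files in os.walk(os.path.join(base_dir, "ops")):
    for file in files:
        if file.endswith(".cc"):
            ops.append(os.path.join(root, file))

print(f"collected {len(ops)} XPU op sources")
```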
225 changes: 225 additions & 0 deletions custom_ops/xpu_ops/src/ops/fused_rms_norm.cc
@@ -0,0 +1,225 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <infer_ops.h>
#include <functional>
#include "paddle/extension.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "utility/debug.h"
#include "utility/env.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

XPU_DECLARE_BOOL(ENABLE_XVLLM_SDNN_INFER, false);
namespace api = baidu::xpu::api;

template <typename T>
std::vector<paddle::Tensor> RmsNormKernel(
const paddle::Tensor& x,
const paddle::optional<paddle::Tensor>& bias,
const paddle::optional<paddle::Tensor>& residual,
const paddle::Tensor& norm_weight,
const paddle::optional<paddle::Tensor>& norm_bias,
const float epsilon,
const int begin_norm_axis,
const float quant_scale,
const int quant_round_type,
const float quant_max_bound,
const float quant_min_bound) {
using XPU_T = typename XPUTypeTrait<T>::Type;
phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);

int ret = -1;
auto x_shape = x.shape();
PD_CHECK(quant_scale <= 0, "Quantization is not supported");
PD_CHECK(begin_norm_axis > 0 && begin_norm_axis <= x_shape.size(),
"begin_norm_axis check fail");
PD_CHECK(norm_bias.get_ptr() == nullptr,
"rms norm kernel don't support norm_bias");

int64_t m = std::accumulate(x_shape.begin(),
x_shape.begin() + begin_norm_axis,
static_cast<int64_t>(1),
std::multiplies<int64_t>());
int64_t n = std::accumulate(x_shape.begin() + begin_norm_axis,
x_shape.end(),
static_cast<int64_t>(1),
std::multiplies<int64_t>());

PD_CHECK(n == norm_weight.shape()[0],
"The product from begin_norm_axis to the last axis of x must be "
"equal to the norm_weight's shape[0]");
if (bias.get_ptr()) {
PD_CHECK(n == bias.get_ptr()->shape()[0],
"The product from begin_norm_axis to the last axis of x must be "
"equal to the bias's shape[0]");
}

paddle::Tensor out = paddle::empty(x_shape, x.dtype(), x.place());
paddle::Tensor residual_out = paddle::empty(x_shape, x.dtype(), x.place());
const XPU_T* x_data = reinterpret_cast<const XPU_T*>(x.data<T>());
const XPU_T* norm_weight_data =
reinterpret_cast<const XPU_T*>(norm_weight.data<T>());
const XPU_T* bias_data =
bias.get_ptr() ? reinterpret_cast<const XPU_T*>(bias.get_ptr()->data<T>())
: nullptr;
const XPU_T* residual_data =
residual.get_ptr()
? reinterpret_cast<const XPU_T*>(residual.get_ptr()->data<T>())
: nullptr;
XPU_T* out_data = reinterpret_cast<XPU_T*>(const_cast<T*>(out.data<T>()));
XPU_T* residual_out_data = nullptr;
if (residual_data) {
residual_out_data =
reinterpret_cast<XPU_T*>(const_cast<T*>(residual_out.data<T>()));
}

XPU_T* add_out_data = const_cast<XPU_T*>(x_data);
if (bias_data) {
ret = api::broadcast_add(
xpu_ctx->x_context(), x_data, bias_data, out_data, {m, n}, {n});
PD_CHECK(ret == 0, "broadcast_add");
add_out_data = out_data;
}

bool use_sdnn = FLAGS_ENABLE_XVLLM_SDNN_INFER;
if (residual_data) {
ret = infer_ops::add_rms_layer_norm<XPU_T, XPU_T>(xpu_ctx->x_context(),
add_out_data,
residual_data,
out_data,
m,
n,
epsilon,
norm_weight_data,
nullptr,
nullptr,
residual_out_data,
nullptr,
use_sdnn);
PD_CHECK(ret == 0, "add_rms_layer_norm");
} else {
ret = api::rms_layer_norm<XPU_T, XPU_T>(xpu_ctx->x_context(),
add_out_data,
out_data,
m,
n,
epsilon,
norm_weight_data,
nullptr,
nullptr,
false);
PD_CHECK(ret == 0, "rms_layer_norm");
}

return {out, residual_out};
}

std::vector<paddle::Tensor> RmsNorm(
const paddle::Tensor& x,
const paddle::optional<paddle::Tensor>& bias,
const paddle::optional<paddle::Tensor>& residual,
const paddle::Tensor& norm_weight,
const paddle::optional<paddle::Tensor>& norm_bias,
const float epsilon,
const int begin_norm_axis,
const float quant_scale,
const int quant_round_type,
const float quant_max_bound,
const float quant_min_bound) {
const auto x_type = x.dtype();

#define APPLY_RMS_NORM_KERNEL(TX) \
return RmsNormKernel<TX>(x, \
bias, \
residual, \
norm_weight, \
norm_bias, \
epsilon, \
begin_norm_axis, \
quant_scale, \
quant_round_type, \
quant_max_bound, \
quant_min_bound);

if (x_type == paddle::DataType::BFLOAT16) {
APPLY_RMS_NORM_KERNEL(paddle::bfloat16);
} else if (x_type == paddle::DataType::FLOAT16) {
APPLY_RMS_NORM_KERNEL(paddle::float16);
} else if (x_type == paddle::DataType::FLOAT32) {
APPLY_RMS_NORM_KERNEL(float);
} else {
PD_THROW("RmsNorm not support x_type=", static_cast<int>(x_type));
return {};
}
#undef APPLY_RMS_NORM_KERNEL
}

std::vector<std::vector<int64_t>> RmsNormInferShape(
const std::vector<int64_t>& x_shape,
const paddle::optional<std::vector<int64_t>>& bias_shape,
const paddle::optional<std::vector<int64_t>>& residual_shape,
const std::vector<int64_t>& norm_weight_shape,
const paddle::optional<std::vector<int64_t>>& norm_bias_shape,
const float epsilon,
const int begin_norm_axis,
const float quant_scale,
const int quant_round_type,
const float quant_max_bound,
const float quant_min_bound) {
PD_CHECK(begin_norm_axis > 0 && begin_norm_axis <= x_shape.size(),
"begin_norm_axis check fail");
int64_t m = std::accumulate(x_shape.begin(),
x_shape.begin() + begin_norm_axis,
static_cast<int64_t>(1),
std::multiplies<int64_t>());
return {x_shape, x_shape, {m}};
}

std::vector<paddle::DataType> RmsNormInferDtype(
const paddle::DataType& x_dtype,
const paddle::optional<paddle::DataType>& bias_dtype,
const paddle::optional<paddle::DataType>& residual_dtype,
const paddle::DataType& norm_weight_dtype,
const paddle::optional<paddle::DataType>& norm_bias_dtype,
const float epsilon,
const int begin_norm_axis,
const float quant_scale,
const int quant_round_type,
const float quant_max_bound,
const float quant_min_bound) {
// out, residual_out
return {x_dtype, x_dtype};
}

PD_BUILD_STATIC_OP(fused_rms_norm_xpu)

Reviewer comment: Why does XPU need its own fused_rms_norm operator, and how does it differ from the GPU version? The kernel implementation may be hardware-specific, but the operator definition should be hardware-agnostic.

.Inputs({"x",
paddle::Optional("bias"),
paddle::Optional("residual"),
"norm_weight",
paddle::Optional("norm_bias")})
.Outputs({"out", "residul_out"})
.Attrs({"epsilon:float",
"begin_norm_axis:int",
"quant_scale:float",
"quant_round_type:int",
"quant_max_bound:float",
"quant_min_bound:float"})
.SetKernelFn(PD_KERNEL(RmsNorm))
.SetInferShapeFn(PD_INFER_SHAPE(RmsNormInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(RmsNormInferDtype));
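As a numerical reference only (not the XPU kernel), the op above performs an optional bias add, an optional residual add, and an RMS normalization over the axes from begin_norm_axis onward. A hedged NumPy sketch follows, assuming quantization is disabled (quant_scale <= 0) and that residual_out holds the pre-normalization sum, which is the usual contract for fused add-RMSNorm kernels:

```python
import numpy as np

def fused_rms_norm_reference(x, norm_weight, bias=None, residual=None,
                             epsilon=1e-6, begin_norm_axis=1):
    """Illustrative NumPy reference for fused_rms_norm_xpu (no quantization)."""
    shape = x.shape
    m = int(np.prod(shape[:begin_norm_axis]))   # rows to normalize
    n = int(np.prod(shape[begin_norm_axis:]))   # normalized size per row
    h = x.reshape(m, n).astype(np.float32)

    if bias is not None:
        h = h + bias.reshape(1, n)              # broadcast_add path
    residual_out = None
    if residual is not None:
        h = h + residual.reshape(m, n)          # add_rms_layer_norm path
        residual_out = h.reshape(shape)         # pre-normalization sum (assumed)

    rms = np.sqrt(np.mean(h * h, axis=-1, keepdims=True) + epsilon)
    out = (h / rms) * norm_weight.reshape(1, n)
    return out.reshape(shape), residual_out
```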
34 changes: 28 additions & 6 deletions custom_ops/xpu_ops/src/ops/get_output.cc
@@ -18,13 +18,35 @@
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/types.h>
#include "msg_utils.h"

#define MAX_BSZ 256
// #define GET_OUTPUT_DEBUG
struct msgdata {
long mtype;
int mtext[MAX_BSZ + 2]; // stop_flag, bsz, tokens
};
void GetOutputKVSignal(const paddle::Tensor& x,
int64_t rank_id,
bool wait_flag) {
int msg_queue_id = 1024 + rank_id;
static struct msgdatakv msg_rcv;
static key_t key = ftok("/opt/", msg_queue_id);
static int msgid = msgget(key, IPC_CREAT | 0666);

int* out_data = const_cast<int*>(x.data<int>());
int ret = -1;
if (!wait_flag) {
ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ * 3 + 2) * 4, 0, IPC_NOWAIT);
} else {
ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ * 3 + 2) * 4, 0, 0);
}
if (ret == -1) {
out_data[0] = -1;
out_data[1] = -1;
return;
}
int encoder_count = msg_rcv.mtext[0];

for (int i = 0; i < encoder_count * 3 + 2; i++) {
out_data[i] = msg_rcv.mtext[i];
}
return;
}

void GetOutput(const paddle::Tensor &x, int64_t rank_id, bool wait_flag,
int msg_queue_id) {
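To make the layout GetOutputKVSignal consumes easier to follow, here is a hedged Python sketch that unpacks the same int buffer; the names and the per-entry triple interpretation are assumptions read off the copy loop above, not a documented protocol:

```python
def decode_kv_signal(mtext):
    """Illustrative decode of the ints copied out by GetOutputKVSignal.

    mtext[0] is treated as encoder_count; the kernel copies
    encoder_count * 3 + 2 ints into the output tensor.
    """
    encoder_count = mtext[0]
    header = mtext[:2]                               # two leading control ints
    records = [tuple(mtext[2 + i * 3 : 5 + i * 3])   # one triple per entry
               for i in range(encoder_count)]
    return header, records
```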