[cherry-pick][XPU] 2.12 (#9726)
* [Cherry-Pick][XPU] fixed inplace op mem reuse issue when the previous op is an invalid op (#9562) (#9564)

* [XPU] support roformer relative embedding (#9536)

* fix sampling_id, fix xpu python whl, fix quant_dequant pass (#9636)

* [XPU] support ffn intermediate size M!=4 (#9646)

* [xpu] fix scope new tensor, max weight is unchanged (#9641)

* [XPU] Fixed the bug in op calib. (#9700)

* [XPU] support skip ffn quant in K200 (#9704)
newway committed Nov 24, 2022
1 parent 193bbd5 commit 8c84d4a
Showing 16 changed files with 673 additions and 56 deletions.
2 changes: 1 addition & 1 deletion cmake/backends/xpu.cmake
@@ -23,7 +23,7 @@ set (XPU_DOWNLOAD_DIR ${XPU_SOURCE_DIR}/download)
set (XPU_INSTALL_DIR ${THIRD_PARTY_PATH}/install)

if (NOT XPU_SDK_URL)
-  set (XPU_SDK_URL "https://baidu-kunlun-product.su.bcebos.com/KL-SDK/klsdk-dev/20220923")
+  set (XPU_SDK_URL "https://baidu-kunlun-product.su.bcebos.com/klx-sdk/search/20221122")
endif ()

if (NOT XPU_SDK_ENV)
1 change: 1 addition & 0 deletions lite/api/paddle_use_passes.h
@@ -87,6 +87,7 @@ USE_MIR_PASS(__xpu__conv2d_affine_channel_fuse_pass);
USE_MIR_PASS(__xpu__conv2d_fuse_pass);
USE_MIR_PASS(__xpu__softmax_topk_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_adaptive_seqlen_fuse_pass);
+USE_MIR_PASS(__xpu__roformer_relative_pos_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_slice_link_fuse_pass);
USE_MIR_PASS(__xpu__generate_sequence_fuse_pass);
USE_MIR_PASS(__xpu__logit_fuse_pass);
208 changes: 174 additions & 34 deletions lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc

Large diffs are not rendered by default.

195 changes: 195 additions & 0 deletions lite/core/optimizer/mir/fusion/__xpu__roformer_relative_pos_fuse_pass.cc
@@ -0,0 +1,195 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <memory>
#include <string>
#include "lite/backends/xpu/math.h"
#include "lite/core/optimizer/mir/pass_registry.h"
#include "lite/core/optimizer/mir/pattern_matcher_high_api.h"

namespace paddle {
namespace lite {
namespace mir {
namespace fusion {

/* support xpu roformer relative pos */
/* in_Input --------------- */
/* | \ | */
/* | \ | */
/* split shape | */
/* / | \ | */
/* / | \ | */
/* | scale slice | */
/* \ | / \ | */
/* \ | / \ | */
/* concat slice slice | */
/* | / \ | */
/* | / \ | */
/* elementwise_mul elementwise_mul */
/* | / */
/* | / */
/* elementwise_add */
/* | */
/* | */
/* out_Output */
/*-------------------------------------------*/
/* After the pass apply: */
/* in_Input */
/* cos_emb | sin_emb */
/* \ | / */
/* xpu_roformer_relative */
/* | */
/* | */
/* out_Output */
/*-------------------------------------------*/

class XPURoformerRelativePosFuser : public FuseBase {
public:
void BuildPattern() override {
auto* input = VarNode("input")
->assert_is_op_input("split", "X")
->assert_is_op_input("elementwise_mul", "X")
->assert_is_op_input("shape", "Input")
->AsInput();
auto* split =
OpNode("split", "split")
->assert_op_attr<int32_t>("axis", 3)
->assert_op_attr<int32_t>("num", 2) // do we really need it
->AsIntermediate();
auto* split_out0 = VarNode("split_out0")
->assert_is_op_nth_input("concat", "X", 1)
->assert_is_op_nth_output("split", "Out", 0)
->AsIntermediate();
auto* split_out1 = VarNode("split_out1")
->assert_is_op_input("scale", "X")
->assert_is_op_nth_output("split", "Out", 1)
->AsIntermediate();
auto* scale =
OpNode("scale", "scale")
->assert_op_attr_satisfied<float>(
"scale",
[](float attr) { return (std::fabs(attr + 1.0) < 1e-5); })
->AsIntermediate();
auto* scale_out = VarNode("scale_out")
->assert_is_op_input("concat", "X")
->assert_is_op_output("scale", "Out")
->AsIntermediate();
auto* concat = OpNode("concat", "concat")->AsIntermediate();
auto* concat_out = VarNode("concat_out")
->assert_is_op_input("elementwise_mul", "X")
->assert_is_op_output("concat", "Out")
->AsIntermediate();
auto* shape = OpNode("shape", "shape")->AsIntermediate();
auto* shape_out = VarNode("shape_out")
->assert_is_op_input("slice", "Input")
->assert_is_op_output("shape", "Out")
->AsIntermediate();
auto* slice1 = OpNode("slice1", "slice")->AsIntermediate();
auto* slice1_out = VarNode("slice1_out")
->assert_is_op_input("slice", "EndsTensorList")
->assert_is_op_output("slice", "Out")
->AsIntermediate();
auto* sin_emb =
VarNode("sin_emb")->assert_is_op_input("slice", "Input")->AsInput();
auto* cos_emb =
VarNode("cos_emb")->assert_is_op_input("slice", "Input")->AsInput();
auto* slice_sin = OpNode("slice_sin", "slice")->AsIntermediate();
auto* slice_sin_out = VarNode("slice_sin_out")
->assert_is_op_input("elementwise_mul", "Y")
->assert_is_op_output("slice", "Out")
->AsIntermediate();
auto* ew_mul_sin =
OpNode("ew_mul_sin", "elementwise_mul")->AsIntermediate();
auto* ew_mul_sin_out = VarNode("ew_mul_sin_out")
->assert_is_op_input("elementwise_add", "Y")
->assert_is_op_output("elementwise_mul", "Out")
->AsIntermediate();
auto* ew_add = OpNode("ew_add", "elementwise_add")->AsIntermediate();
auto* ew_add_out = VarNode("ew_add_out")
->assert_is_op_output("elementwise_add", "Out")
->AsOutput();
auto* slice_cos = OpNode("slice_cos", "slice")->AsIntermediate();
auto* slice_cos_out = VarNode("slice_cos_out")
->assert_is_op_input("elementwise_mul", "Y")
->assert_is_op_output("slice", "Out")
->AsIntermediate();
auto* ew_mul_cos =
OpNode("ew_mul_cos", "elementwise_mul")->AsIntermediate();
auto* ew_mul_cos_out = VarNode("ew_mul_cos_out")
->assert_is_op_input("elementwise_add", "X")
->assert_is_op_output("elementwise_mul", "Out")
->AsIntermediate();
*input >> *split >> *split_out1 >> *scale >> *scale_out >> *concat >>
*concat_out >> *ew_mul_sin >> *ew_mul_sin_out >> *ew_add >> *ew_add_out;
*input >> *ew_mul_cos >> *ew_mul_cos_out >> *ew_add;
*input >> *shape >> *shape_out >> *slice1 >> *slice1_out >> *slice_sin >>
*slice_sin_out >> *ew_mul_sin;
*slice1_out >> *slice_cos >> *slice_cos_out >> *ew_mul_cos;
*sin_emb >> *slice_sin;
*cos_emb >> *slice_cos;
*split >> *split_out0 >> *concat;
}

void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
cpp::OpDesc op_desc;
op_desc.SetType("__xpu__roformer_relative_embedding");
// use "X", be consistent with target_op_type_ in multiencoder pass
op_desc.SetInput("X", {matched.at("input")->arg()->name});
op_desc.SetInput("CosEmbbeding", {matched.at("cos_emb")->arg()->name});
op_desc.SetInput("SinEmbbeding", {matched.at("sin_emb")->arg()->name});
op_desc.SetOutput("Out", {matched.at("ew_add_out")->arg()->name});
auto* scope = matched.at("split")->stmt()->op()->scope();

auto cos_emb_name = matched.at("cos_emb")->arg()->name;
auto cos_emb_shape = scope->FindMutableTensor(cos_emb_name)->dims();
auto sin_emb_name = matched.at("sin_emb")->arg()->name;
auto sin_emb_shape = scope->FindMutableTensor(sin_emb_name)->dims();
CHECK_EQ(cos_emb_shape.size(), 4) << cos_emb_shape.size();
CHECK_GT(cos_emb_shape[2], 0) << cos_emb_shape[2];
CHECK_EQ(sin_emb_shape.size(), 4) << sin_emb_shape.size();
for (int i = 0; i < sin_emb_shape.size(); ++i) {
CHECK_EQ(sin_emb_shape[i], cos_emb_shape[i])
<< i << " th dim: " << sin_emb_shape[i] << ", " << cos_emb_shape[i];
}
op_desc.SetAttr<int>("max_pos_len", cos_emb_shape[2]);

auto& valid_places = matched.at("split")->stmt()->op()->valid_places();
auto new_op = LiteOpRegistry::Global().Create(op_desc.Type());
new_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(new_op, valid_places);

DirectedLink(matched.at("input"), new_op_node);
DirectedLink(matched.at("cos_emb"), new_op_node);
DirectedLink(matched.at("sin_emb"), new_op_node);
DirectedLink(new_op_node, matched.at("ew_add_out"));
}
};

} // namespace fusion

class XPURoformerRelativePosFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
fusion::XPURoformerRelativePosFuser fuser;
fuser(graph.get());
}
};

} // namespace mir
} // namespace lite
} // namespace paddle

REGISTER_MIR_PASS(__xpu__roformer_relative_pos_fuse_pass,
paddle::lite::mir::XPURoformerRelativePosFusePass)
.BindTargets({TARGET(kXPU)});
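
The subgraph matched above is the standard rotary ("RoFormer") position transform: out = x * cos + rotate_half(x) * sin, where rotate_half splits the last axis in half and maps [a, b] to [-b, a] (the split -> scale(-1) -> concat chain in the diagram). Below is a minimal CPU sketch of that semantics, assuming a contiguous [batch, head_num, seqlen, head_dim] float layout (as in the kernel later in this commit) and cos/sin tables already sliced to seqlen rows; the function name and signature are illustrative, not Lite or XDNN API.

#include <cassert>
#include <cstddef>

// Reference semantics of the fused op (illustrative sketch only):
// out = x * cos + rotate_half(x) * sin, applied per (batch, head, position),
// with rotate_half([x0, x1]) = [-x1, x0] along the last axis.
void roformer_reference(const float* x, const float* cos_emb,
                        const float* sin_emb, float* out, int batch,
                        int head_num, int seqlen, int head_dim) {
  assert(head_dim % 2 == 0);
  const int half = head_dim / 2;
  for (int b = 0; b < batch; ++b) {
    for (int h = 0; h < head_num; ++h) {
      for (int s = 0; s < seqlen; ++s) {
        const size_t row =
            ((static_cast<size_t>(b) * head_num + h) * seqlen + s) * head_dim;
        const size_t emb_row = static_cast<size_t>(s) * head_dim;
        for (int d = 0; d < head_dim; ++d) {
          // rotate_half: first half takes -x[second half], second half x[first half]
          const float rot = d < half ? -x[row + d + half] : x[row + d - half];
          out[row + d] =
              x[row + d] * cos_emb[emb_row + d] + rot * sin_emb[emb_row + d];
        }
      }
    }
  }
}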
1 change: 1 addition & 0 deletions lite/core/optimizer/optimizer.cc
@@ -199,6 +199,7 @@ std::unique_ptr<RuntimeProgram> RunDefaultOptimizer(
"__xpu__squeeze_excitation_fuse_pass",
"__xpu__mmdnn_fuse_pass",
"__xpu__bigru_fuse_pass",
"__xpu__roformer_relative_pos_fuse_pass",
"__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
"__xpu__fc_fuse_pass",
1 change: 1 addition & 0 deletions lite/kernels/xpu/CMakeLists.txt
@@ -117,6 +117,7 @@ add_kernel(__xpu__conv2d_compute_xpu XPU extra SRCS __xpu__conv2d_compute.cc)
add_kernel(__xpu__softmax_topk_compute_xpu XPU extra SRCS __xpu__softmax_topk_compute.cc)
add_kernel(__xpu__generate_sequence_compute_xpu XPU extra SRCS __xpu__generate_sequence_compute.cc)
add_kernel(__xpu__logit_compute_xpu XPU extra SRCS __xpu__logit_compute.cc)
+add_kernel(__xpu__roformer_relative_embedding_compute_xpu XPU extra SRCS __xpu__roformer_relative_embedding_compute.cc)
add_kernel(__xpu__squeeze_excitation_compute_xpu XPU extra SRCS __xpu__squeeze_excitation_compute.cc)
add_kernel(__xpu__bigru_compute_xpu XPU extra SRCS __xpu__bigru_compute.cc)
add_kernel(__xpu__dynamic_lstm_compute_xpu XPU extra SRCS __xpu__dynamic_lstm_compute.cc)
28 changes: 25 additions & 3 deletions lite/kernels/xpu/__xpu__multi_encoder_compute.cc
@@ -155,6 +155,13 @@ void XPUMultiEncoderCompute::PrepareForRun() {
for (auto* ln_bias : param.ln_bias) {
arg_ln_bias_.push_back(ln_bias->data<float>());
}
+  relative_type_ = param.relative_type;
+  // prepare roformer embedding
+  if (relative_type_ == 1) {
+    for (auto* emb : param.roformer_embedding) {
+      roformer_embedding_.push_back(emb->data<float>());
+    }
+  }
// prepare weights
local_quant_ =
GetBoolFromEnv("XPU_LOCAL_QUANT") || lite::TargetWrapperXPU::local_quant;
@@ -226,7 +233,13 @@ void XPUMultiEncoderCompute::run_encoder(const T* in, T* out) {
param.norm_before, /*is_pre_norm*/
param.per_channel);
qkv_attn_param.quant_type_.assign(quant_types_.begin(), quant_types_.end());

+  if (relative_type_ == 1) {
+    qkv_attn_param.relative_type = relative_type_;
+    qkv_attn_param.max_pos_len = param.max_pos_len;
+    qkv_attn_param.relative_pos.assign(roformer_embedding_.begin(),
+                                       roformer_embedding_.end());
+  }
+  qkv_attn_param.scale_of_hidden_units = param.ffn_hidden_dim_scale;
if (std::is_same<TGEMM, int8_t>::value) {
CHECK_GT(fc_input_max_.size(), 0);
}
@@ -249,7 +262,8 @@ void XPUMultiEncoderCompute::run_encoder(const T* in, T* out) {
std::vector<int64_t> mask_shape = param.mask->dims().Vectorize();
std::vector<int> encoder_mask_shape =
std::vector<int>(mask_shape.begin(), mask_shape.end());

+    CHECK_EQ(param.ffn_hidden_dim_scale, 4)
+        << "xpu don't support ffn_hidden_dim_scale!=4 when no vsl";
xdnn::QKVAttnParam qkv_attn_param(batch,
max_seqlen,
param.head_num,
@@ -259,8 +273,15 @@ void XPUMultiEncoderCompute::run_encoder(const T* in, T* out) {
slice_idx,
true,
param.hidden_dim,
-                                      param.norm_before);
+                                      param.norm_before,
+                                      param.per_channel);
qkv_attn_param.quant_type_.assign(quant_types_.begin(), quant_types_.end());
+  if (relative_type_ == 1) {
+    qkv_attn_param.relative_type = relative_type_;
+    qkv_attn_param.max_pos_len = param.max_pos_len;
+    qkv_attn_param.relative_pos.assign(roformer_embedding_.begin(),
+                                       roformer_embedding_.end());
+  }
int r = xdnn::transformer_encoder<T, TW, TGEMM>(
ctx.GetRawContext(),
in,
@@ -367,6 +388,7 @@ REGISTER_LITE_KERNEL(__xpu__multi_encoder,
.BindInput("FCBias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("LNScale", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("LNBias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("RoformerEmbedding", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
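
Both run_encoder branches above now set the same three fields on qkv_attn_param. A hypothetical helper (not part of this commit) sketching how that shared wiring could be factored out, assuming only the QKVAttnParam members visible in the diff:

#include <vector>

// Hypothetical helper, not in the commit: both the VSL and no-VSL branches
// set the same three fields when the fused roformer embedding is present.
template <typename AttnParam>
void set_roformer_fields(AttnParam* p, int relative_type, int max_pos_len,
                         const std::vector<const float*>& embeddings) {
  if (relative_type == 1) {
    p->relative_type = relative_type;
    p->max_pos_len = max_pos_len;
    p->relative_pos.assign(embeddings.begin(), embeddings.end());
  }
}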
2 changes: 2 additions & 0 deletions lite/kernels/xpu/__xpu__multi_encoder_compute.h
@@ -43,13 +43,15 @@ class XPUMultiEncoderCompute
std::vector<const float *> arg_ln_bias_;
std::vector<const float *> fc_weight_max_;
std::vector<const float *> fc_input_max_;
+  std::vector<const float *> roformer_embedding_;
std::vector<xdnn::QuantType> quant_types_;
XPUScratchPadGuard weight_max_guard_;
XPUScratchPadGuard input_max_guard_;
XPUScratchPadGuard cast_in_guard_;
XPUScratchPadGuard cast_out_guard_;
xdnn::Activation_t qkv_act = xdnn::Activation_t::RELU;
int slice_idx = -1;
+  int relative_type_ = 0;
bool local_quant_ = false;

template <typename T>
73 changes: 73 additions & 0 deletions lite/kernels/xpu/__xpu__roformer_relative_embedding_compute.cc
@@ -0,0 +1,73 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/xpu/__xpu__roformer_relative_embedding_compute.h"
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

void RoformerRelativeEmbeddingCompute::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();
auto input_dim = param.input->dims();
CHECK_EQ(input_dim.size(), 4);
int batch = input_dim[0];
int head_num = param.input->dims()[1];
int seqlen = param.input->dims()[2];
int head_dim = param.input->dims()[3];
CHECK_LE(seqlen, param.max_pos_len);
std::vector<int> lod;
lod.resize(batch + 1);
for (int i = 0; i < batch + 1; i++) {
lod[i] = i * seqlen;
}
int r =
xdnn::rope<float>(ctx.GetRawContext(),
param.input->data<float>(),
param.output->mutable_data<float>(TARGET(kXPU)),
param.cos_embedding->data<float>(),
param.sin_embedding->data<float>(),
batch,
head_num,
head_dim,
head_num * head_dim,
lod,
param.max_pos_len,
false, // no vsl
true); // transpose to [n, seql, head_num, head_dim]
CHECK_EQ(r, 0) << "call RoformerRelativeEmbeddingCompute failed";
}

} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle

REGISTER_LITE_KERNEL(
__xpu__roformer_relative_embedding,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::RoformerRelativeEmbeddingCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("CosEmbbeding", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("SinEmbbeding", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
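
Since this kernel runs without variable sequence lengths (the false argument to xdnn::rope), the lod vector built in Run() is just uniform offsets into the padded batch: sequence i occupies rows [lod[i], lod[i+1]) of the flattened input. A standalone illustration with made-up sizes:

#include <cstdio>
#include <vector>

int main() {
  const int batch = 3, seqlen = 128;  // illustrative values only
  std::vector<int> lod(batch + 1);
  // Same construction as Run() above: lod[i] = i * seqlen.
  for (int i = 0; i <= batch; ++i) lod[i] = i * seqlen;
  for (int v : lod) std::printf("%d ", v);  // prints: 0 128 256 384
  std::printf("\n");
  return 0;
}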
(Diffs for the remaining 7 changed files are not rendered.)