[cherry-pick][XPU] 2.12 (#9726)
* [Cherry-Pick][XPU] fixed inplace op mem reuse issue when the previous op is an invalid op (#9562) (#9564)

* [XPU] support roformer relative embedding (#9536)

* fix sampling_id, fix xpu python whl, fix quant_dequant pass (#9636)

* [XPU] support ffn intermediate size M!=4 (#9646)

* [xpu] fix scope new tensor, max weight is unchanged (#9641)

* [XPU] Fixed the bug in op calib. (#9700)

* [XPU] support skip ffn quant in K200 (#9704)
newway committed Nov 24, 2022
1 parent 193bbd5 commit 8c84d4a
Showing 16 changed files with 673 additions and 56 deletions.
2 changes: 1 addition & 1 deletion cmake/backends/xpu.cmake
@@ -23,7 +23,7 @@ set (XPU_DOWNLOAD_DIR ${XPU_SOURCE_DIR}/download)
set (XPU_INSTALL_DIR ${THIRD_PARTY_PATH}/install)

if (NOT XPU_SDK_URL)
-  set (XPU_SDK_URL "https://baidu-kunlun-product.su.bcebos.com/KL-SDK/klsdk-dev/20220923")
+  set (XPU_SDK_URL "https://baidu-kunlun-product.su.bcebos.com/klx-sdk/search/20221122")
endif ()

if (NOT XPU_SDK_ENV)
1 change: 1 addition & 0 deletions lite/api/paddle_use_passes.h
@@ -87,6 +87,7 @@ USE_MIR_PASS(__xpu__conv2d_affine_channel_fuse_pass);
USE_MIR_PASS(__xpu__conv2d_fuse_pass);
USE_MIR_PASS(__xpu__softmax_topk_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_adaptive_seqlen_fuse_pass);
+USE_MIR_PASS(__xpu__roformer_relative_pos_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_slice_link_fuse_pass);
USE_MIR_PASS(__xpu__generate_sequence_fuse_pass);
USE_MIR_PASS(__xpu__logit_fuse_pass);
208 changes: 174 additions & 34 deletions lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc

Large diffs are not rendered by default.

195 changes: 195 additions & 0 deletions lite/core/optimizer/mir/fusion/__xpu__roformer_relative_pos_fuse_pass.cc
@@ -0,0 +1,195 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <memory>
#include <string>
#include "lite/backends/xpu/math.h"
#include "lite/core/optimizer/mir/pass_registry.h"
#include "lite/core/optimizer/mir/pattern_matcher_high_api.h"

namespace paddle {
namespace lite {
namespace mir {
namespace fusion {

/* support xpu roformer relative pos */
/* in_Input --------------- */
/* | \ | */
/* | \ | */
/* split shape | */
/* / | \ | */
/* / | \ | */
/* | scale slice | */
/* \ | / \ | */
/* \ | / \ | */
/* concat slice slice | */
/* | / \ | */
/* | / \ | */
/* elementwise_mul elementwise_mul */
/* | / */
/* | / */
/* elementwise_add */
/* | */
/* | */
/* out_Output */
/*-------------------------------------------*/
/* After the pass apply: */
/* in_Input */
/* cos_emb | sin_emb */
/* \ | / */
/* xpu_roformer_relative */
/* | */
/* | */
/* out_Output */
/*-------------------------------------------*/

class XPURoformerRelativePosFuser : public FuseBase {
public:
void BuildPattern() override {
auto* input = VarNode("input")
->assert_is_op_input("split", "X")
->assert_is_op_input("elementwise_mul", "X")
->assert_is_op_input("shape", "Input")
->AsInput();
auto* split =
OpNode("split", "split")
->assert_op_attr<int32_t>("axis", 3)
->assert_op_attr<int32_t>("num", 2) // do we really need it
->AsIntermediate();
auto* split_out0 = VarNode("split_out0")
->assert_is_op_nth_input("concat", "X", 1)
->assert_is_op_nth_output("split", "Out", 0)
->AsIntermediate();
auto* split_out1 = VarNode("split_out1")
->assert_is_op_input("scale", "X")
->assert_is_op_nth_output("split", "Out", 1)
->AsIntermediate();
auto* scale =
OpNode("scale", "scale")
->assert_op_attr_satisfied<float>(
"scale",
[](float attr) { return (std::fabs(attr + 1.0) < 1e-5); })
->AsIntermediate();
auto* scale_out = VarNode("scale_out")
->assert_is_op_input("concat", "X")
->assert_is_op_output("scale", "Out")
->AsIntermediate();
auto* concat = OpNode("concat", "concat")->AsIntermediate();
auto* concat_out = VarNode("concat_out")
->assert_is_op_input("elementwise_mul", "X")
->assert_is_op_output("concat", "Out")
->AsIntermediate();
auto* shape = OpNode("shape", "shape")->AsIntermediate();
auto* shape_out = VarNode("shape_out")
->assert_is_op_input("slice", "Input")
->assert_is_op_output("shape", "Out")
->AsIntermediate();
auto* slice1 = OpNode("slice1", "slice")->AsIntermediate();
auto* slice1_out = VarNode("slice1_out")
->assert_is_op_input("slice", "EndsTensorList")
->assert_is_op_output("slice", "Out")
->AsIntermediate();
auto* sin_emb =
VarNode("sin_emb")->assert_is_op_input("slice", "Input")->AsInput();
auto* cos_emb =
VarNode("cos_emb")->assert_is_op_input("slice", "Input")->AsInput();
auto* slice_sin = OpNode("slice_sin", "slice")->AsIntermediate();
auto* slice_sin_out = VarNode("slice_sin_out")
->assert_is_op_input("elementwise_mul", "Y")
->assert_is_op_output("slice", "Out")
->AsIntermediate();
auto* ew_mul_sin =
OpNode("ew_mul_sin", "elementwise_mul")->AsIntermediate();
auto* ew_mul_sin_out = VarNode("ew_mul_sin_out")
->assert_is_op_input("elementwise_add", "Y")
->assert_is_op_output("elementwise_mul", "Out")
->AsIntermediate();
auto* ew_add = OpNode("ew_add", "elementwise_add")->AsIntermediate();
auto* ew_add_out = VarNode("ew_add_out")
->assert_is_op_output("elementwise_add", "Out")
->AsOutput();
auto* slice_cos = OpNode("slice_cos", "slice")->AsIntermediate();
auto* slice_cos_out = VarNode("slice_cos_out")
->assert_is_op_input("elementwise_mul", "Y")
->assert_is_op_output("slice", "Out")
->AsIntermediate();
auto* ew_mul_cos =
OpNode("ew_mul_cos", "elementwise_mul")->AsIntermediate();
auto* ew_mul_cos_out = VarNode("ew_mul_cos_out")
->assert_is_op_input("elementwise_add", "X")
->assert_is_op_output("elementwise_mul", "Out")
->AsIntermediate();
*input >> *split >> *split_out1 >> *scale >> *scale_out >> *concat >>
*concat_out >> *ew_mul_sin >> *ew_mul_sin_out >> *ew_add >> *ew_add_out;
*input >> *ew_mul_cos >> *ew_mul_cos_out >> *ew_add;
*input >> *shape >> *shape_out >> *slice1 >> *slice1_out >> *slice_sin >>
*slice_sin_out >> *ew_mul_sin;
*slice1_out >> *slice_cos >> *slice_cos_out >> *ew_mul_cos;
*sin_emb >> *slice_sin;
*cos_emb >> *slice_cos;
*split >> *split_out0 >> *concat;
}

void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
cpp::OpDesc op_desc;
op_desc.SetType("__xpu__roformer_relative_embedding");
// use "X", be consistent with target_op_type_ in multiencoder pass
op_desc.SetInput("X", {matched.at("input")->arg()->name});
op_desc.SetInput("CosEmbbeding", {matched.at("cos_emb")->arg()->name});
op_desc.SetInput("SinEmbbeding", {matched.at("sin_emb")->arg()->name});
op_desc.SetOutput("Out", {matched.at("ew_add_out")->arg()->name});
auto* scope = matched.at("split")->stmt()->op()->scope();

auto cos_emb_name = matched.at("cos_emb")->arg()->name;
auto cos_emb_shape = scope->FindMutableTensor(cos_emb_name)->dims();
auto sin_emb_name = matched.at("sin_emb")->arg()->name;
auto sin_emb_shape = scope->FindMutableTensor(sin_emb_name)->dims();
CHECK_EQ(cos_emb_shape.size(), 4) << cos_emb_shape.size();
CHECK_GT(cos_emb_shape[2], 0) << cos_emb_shape[2];
CHECK_EQ(sin_emb_shape.size(), 4) << sin_emb_shape.size();
for (int i = 0; i < sin_emb_shape.size(); ++i) {
CHECK_EQ(sin_emb_shape[i], cos_emb_shape[i])
<< i << " th dim: " << sin_emb_shape[i] << ", " << cos_emb_shape[i];
}
op_desc.SetAttr<int>("max_pos_len", cos_emb_shape[2]);

auto& valid_places = matched.at("split")->stmt()->op()->valid_places();
auto new_op = LiteOpRegistry::Global().Create(op_desc.Type());
new_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(new_op, valid_places);

DirectedLink(matched.at("input"), new_op_node);
DirectedLink(matched.at("cos_emb"), new_op_node);
DirectedLink(matched.at("sin_emb"), new_op_node);
DirectedLink(new_op_node, matched.at("ew_add_out"));
}
};

} // namespace fusion

class XPURoformerRelativePosFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
fusion::XPURoformerRelativePosFuser fuser;
fuser(graph.get());
}
};

} // namespace mir
} // namespace lite
} // namespace paddle

REGISTER_MIR_PASS(__xpu__roformer_relative_pos_fuse_pass,
paddle::lite::mir::XPURoformerRelativePosFusePass)
.BindTargets({TARGET(kXPU)});
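
The subgraph matched above is the standard rotary ("RoFormer") position transform: out = x * cos + rotate_half(x) * sin, where rotate_half splits the last axis in half and maps [a, b] to [-b, a] (the split -> scale(-1) -> concat chain in the diagram). Below is a minimal CPU sketch of that semantics, assuming a contiguous [batch, head_num, seqlen, head_dim] float layout (as in the kernel later in this commit) and cos/sin tables already sliced to seqlen rows; the function name and signature are illustrative, not Lite or XDNN API.

#include <cassert>
#include <cstddef>

// Reference semantics of the fused op (illustrative sketch only):
// out = x * cos + rotate_half(x) * sin, applied per (batch, head, position),
// with rotate_half([x0, x1]) = [-x1, x0] along the last axis.
void roformer_reference(const float* x, const float* cos_emb,
                        const float* sin_emb, float* out, int batch,
                        int head_num, int seqlen, int head_dim) {
  assert(head_dim % 2 == 0);
  const int half = head_dim / 2;
  for (int b = 0; b < batch; ++b) {
    for (int h = 0; h < head_num; ++h) {
      for (int s = 0; s < seqlen; ++s) {
        const size_t row =
            ((static_cast<size_t>(b) * head_num + h) * seqlen + s) * head_dim;
        const size_t emb_row = static_cast<size_t>(s) * head_dim;
        for (int d = 0; d < head_dim; ++d) {
          // rotate_half: first half takes -x[second half], second half x[first half]
          const float rot = d < half ? -x[row + d + half] : x[row + d - half];
          out[row + d] =
              x[row + d] * cos_emb[emb_row + d] + rot * sin_emb[emb_row + d];
        }
      }
    }
  }
}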
1 change: 1 addition & 0 deletions lite/core/optimizer/optimizer.cc
@@ -199,6 +199,7 @@ std::unique_ptr<RuntimeProgram> RunDefaultOptimizer(
"__xpu__squeeze_excitation_fuse_pass",
"__xpu__mmdnn_fuse_pass",
"__xpu__bigru_fuse_pass",
"__xpu__roformer_relative_pos_fuse_pass",
"__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
"__xpu__fc_fuse_pass",
1 change: 1 addition & 0 deletions lite/kernels/xpu/CMakeLists.txt
@@ -117,6 +117,7 @@ add_kernel(__xpu__conv2d_compute_xpu XPU extra SRCS __xpu__conv2d_compute.cc)
add_kernel(__xpu__softmax_topk_compute_xpu XPU extra SRCS __xpu__softmax_topk_compute.cc)
add_kernel(__xpu__generate_sequence_compute_xpu XPU extra SRCS __xpu__generate_sequence_compute.cc)
add_kernel(__xpu__logit_compute_xpu XPU extra SRCS __xpu__logit_compute.cc)
+add_kernel(__xpu__roformer_relative_embedding_compute_xpu XPU extra SRCS __xpu__roformer_relative_embedding_compute.cc)
add_kernel(__xpu__squeeze_excitation_compute_xpu XPU extra SRCS __xpu__squeeze_excitation_compute.cc)
add_kernel(__xpu__bigru_compute_xpu XPU extra SRCS __xpu__bigru_compute.cc)
add_kernel(__xpu__dynamic_lstm_compute_xpu XPU extra SRCS __xpu__dynamic_lstm_compute.cc)
28 changes: 25 additions & 3 deletions lite/kernels/xpu/__xpu__multi_encoder_compute.cc
@@ -155,6 +155,13 @@ void XPUMultiEncoderCompute::PrepareForRun() {
for (auto* ln_bias : param.ln_bias) {
arg_ln_bias_.push_back(ln_bias->data<float>());
}
+  relative_type_ = param.relative_type;
+  // prepare roformer embedding
+  if (relative_type_ == 1) {
+    for (auto* emb : param.roformer_embedding) {
+      roformer_embedding_.push_back(emb->data<float>());
+    }
+  }
// prepare weights
local_quant_ =
GetBoolFromEnv("XPU_LOCAL_QUANT") || lite::TargetWrapperXPU::local_quant;
@@ -226,7 +233,13 @@ void XPUMultiEncoderCompute::run_encoder(const T* in, T* out) {
param.norm_before, /*is_pre_norm*/
param.per_channel);
qkv_attn_param.quant_type_.assign(quant_types_.begin(), quant_types_.end());

+  if (relative_type_ == 1) {
+    qkv_attn_param.relative_type = relative_type_;
+    qkv_attn_param.max_pos_len = param.max_pos_len;
+    qkv_attn_param.relative_pos.assign(roformer_embedding_.begin(),
+                                       roformer_embedding_.end());
+  }
+  qkv_attn_param.scale_of_hidden_units = param.ffn_hidden_dim_scale;
if (std::is_same<TGEMM, int8_t>::value) {
CHECK_GT(fc_input_max_.size(), 0);
}
@@ -249,7 +262,8 @@ void XPUMultiEncoderCompute::run_encoder(const T* in, T* out) {
std::vector<int64_t> mask_shape = param.mask->dims().Vectorize();
std::vector<int> encoder_mask_shape =
std::vector<int>(mask_shape.begin(), mask_shape.end());

+    CHECK_EQ(param.ffn_hidden_dim_scale, 4)
+        << "xpu don't support ffn_hidden_dim_scale!=4 when no vsl";
xdnn::QKVAttnParam qkv_attn_param(batch,
max_seqlen,
param.head_num,
@@ -259,8 +273,15 @@ void XPUMultiEncoderCompute::run_encoder(const T* in, T* out) {
slice_idx,
true,
param.hidden_dim,
-                                      param.norm_before);
+                                      param.norm_before,
+                                      param.per_channel);
qkv_attn_param.quant_type_.assign(quant_types_.begin(), quant_types_.end());
+  if (relative_type_ == 1) {
+    qkv_attn_param.relative_type = relative_type_;
+    qkv_attn_param.max_pos_len = param.max_pos_len;
+    qkv_attn_param.relative_pos.assign(roformer_embedding_.begin(),
+                                       roformer_embedding_.end());
+  }
int r = xdnn::transformer_encoder<T, TW, TGEMM>(
ctx.GetRawContext(),
in,
@@ -367,6 +388,7 @@ REGISTER_LITE_KERNEL(__xpu__multi_encoder,
.BindInput("FCBias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("LNScale", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("LNBias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("RoformerEmbedding", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
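
Both run_encoder branches above now set the same three fields on qkv_attn_param. A hypothetical helper (not part of this commit) sketching how that shared wiring could be factored out, assuming only the QKVAttnParam members visible in the diff:

#include <vector>

// Hypothetical helper, not in the commit: both the VSL and no-VSL branches
// set the same three fields when the fused roformer embedding is present.
template <typename AttnParam>
void set_roformer_fields(AttnParam* p, int relative_type, int max_pos_len,
                         const std::vector<const float*>& embeddings) {
  if (relative_type == 1) {
    p->relative_type = relative_type;
    p->max_pos_len = max_pos_len;
    p->relative_pos.assign(embeddings.begin(), embeddings.end());
  }
}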
2 changes: 2 additions & 0 deletions lite/kernels/xpu/__xpu__multi_encoder_compute.h
@@ -43,13 +43,15 @@ class XPUMultiEncoderCompute
std::vector<const float *> arg_ln_bias_;
std::vector<const float *> fc_weight_max_;
std::vector<const float *> fc_input_max_;
+  std::vector<const float *> roformer_embedding_;
std::vector<xdnn::QuantType> quant_types_;
XPUScratchPadGuard weight_max_guard_;
XPUScratchPadGuard input_max_guard_;
XPUScratchPadGuard cast_in_guard_;
XPUScratchPadGuard cast_out_guard_;
xdnn::Activation_t qkv_act = xdnn::Activation_t::RELU;
int slice_idx = -1;
+  int relative_type_ = 0;
bool local_quant_ = false;

template <typename T>
73 changes: 73 additions & 0 deletions lite/kernels/xpu/__xpu__roformer_relative_embedding_compute.cc
@@ -0,0 +1,73 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/xpu/__xpu__roformer_relative_embedding_compute.h"
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

void RoformerRelativeEmbeddingCompute::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();
auto input_dim = param.input->dims();
CHECK_EQ(input_dim.size(), 4);
int batch = input_dim[0];
int head_num = param.input->dims()[1];
int seqlen = param.input->dims()[2];
int head_dim = param.input->dims()[3];
CHECK_LE(seqlen, param.max_pos_len);
std::vector<int> lod;
lod.resize(batch + 1);
for (int i = 0; i < batch + 1; i++) {
lod[i] = i * seqlen;
}
int r =
xdnn::rope<float>(ctx.GetRawContext(),
param.input->data<float>(),
param.output->mutable_data<float>(TARGET(kXPU)),
param.cos_embedding->data<float>(),
param.sin_embedding->data<float>(),
batch,
head_num,
head_dim,
head_num * head_dim,
lod,
param.max_pos_len,
false, // no vsl
true); // transpose to [n, seql, head_num, head_dim]
CHECK_EQ(r, 0) << "call RoformerRelativeEmbeddingCompute failed";
}

} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle

REGISTER_LITE_KERNEL(
__xpu__roformer_relative_embedding,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::RoformerRelativeEmbeddingCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("CosEmbbeding", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("SinEmbbeding", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
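
Since this kernel runs without variable sequence lengths (the false argument to xdnn::rope), the lod vector built in Run() is just uniform offsets into the padded batch: sequence i occupies rows [lod[i], lod[i+1]) of the flattened input. A standalone illustration with made-up sizes:

#include <cstdio>
#include <vector>

int main() {
  const int batch = 3, seqlen = 128;  // illustrative values only
  std::vector<int> lod(batch + 1);
  // Same construction as Run() above: lod[i] = i * seqlen.
  for (int i = 0; i <= batch; ++i) lod[i] = i * seqlen;
  for (int v : lod) std::printf("%d ", v);  // prints: 0 128 256 384
  std::printf("\n");
  return 0;
}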
(Diffs for the remaining 7 changed files are not rendered.)