
[Paddle Inference] General optimization for no_varlen embedding layernorm #48580

198 changes: 80 additions & 118 deletions paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -13,7 +13,7 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/utils.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.h"
#include "paddle/phi/core/ddim.h"

@@ -36,7 +36,6 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
const framework::Scope& scope,
bool test_mode) override {
VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer";

// get the persistable var's data
auto GetWeight = [&](const std::string& var_name,
framework::DDim* dim) -> TensorRTEngine::Weight {
@@ -47,32 +46,13 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
return weight;
};

auto GetFp16Weight = [&](const std::string& var_name,
framework::DDim* dim) -> TensorRTEngine::Weight {
auto* temp_var = scope.FindVar(var_name);
auto* temp_tensor = temp_var->GetMutable<phi::DenseTensor>();
*dim = temp_tensor->dims();
auto weight = engine_->GetFp16TrtWeight(var_name, *temp_tensor);
return weight;
};

auto GetFp32Weight = [&](const std::string& var_name,
framework::DDim* dim) -> TensorRTEngine::Weight {
auto* temp_var = scope.FindVar(var_name);
auto* temp_tensor = temp_var->GetMutable<phi::DenseTensor>();
*dim = temp_tensor->dims();
auto weight = engine_->GetFp32TrtWeight(var_name, *temp_tensor);
return weight;
};

framework::OpDesc op_desc(op, nullptr);
auto pos_id_name = engine_->tensorrt_transformer_posid();
auto mask_id_name = engine_->tensorrt_transformer_maskid();
bool flag_varseqlen =
engine_->use_varseqlen() && pos_id_name != "" && mask_id_name != "";
bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
int hidden = 0;
// Declare inputs
// bool with_fp16 = engine_->WithFp16() &&
Contributor: delete?

Contributor Author: done

// !engine_->disable_trt_plugin_fp16(); int hidden = 0;
// Declare inputs
std::vector<nvinfer1::ITensor*> input_ids;

// Declare inputs_weight
@@ -95,55 +75,6 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
if (flag_varseqlen) {
engine_->SetITensor("pos_id", engine_->GetITensor(pos_id_name));
engine_->SetITensor("mask_id", engine_->GetITensor(mask_id_name));

auto mask_id_tensor = engine_->GetITensor("mask_id");
auto mask_dims = mask_id_tensor->getDimensions();
auto slice_start_dims = mask_dims;
auto slice_stride_dims = mask_dims;

for (int i = 0; i < mask_dims.nbDims; i++) {
slice_start_dims.d[i] = 0;
slice_stride_dims.d[i] = 1;
}

auto* shape_tensor = Shape(mask_id_tensor);
std::vector<nvinfer1::ITensor*> size_vec_tensor;
std::vector<nvinfer1::ITensor*> start_vec_tensor;
for (int i = 0; i < mask_dims.nbDims; i++) {
size_vec_tensor.push_back(Add1DConstantLayer(1));
start_vec_tensor.push_back(Add1DConstantLayer(0));
}
size_vec_tensor[1] = GetEleTensorOfShape(shape_tensor, 1);
auto size_tensor = Concat(size_vec_tensor);
auto start_tensor = Concat(start_vec_tensor);

auto slice_layer =
TRT_ENGINE_ADD_LAYER(engine_,
Slice,
*mask_id_tensor,
slice_start_dims,
slice_start_dims,
slice_stride_dims); // unuseful slice_start_dims
slice_layer->setInput(1, *start_tensor);
slice_layer->setInput(2, *size_tensor);
slice_layer->setName(
("Embeltwise_slice_layer (Output: slice_max_seqlen " +
op_desc.Output("Out")[0] + ")")
.c_str());
engine_->SetTensorDynamicRange(slice_layer->getOutput(0), 1.0f);

auto* reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *slice_layer->getOutput(0));
nvinfer1::Dims shape_dim;
shape_dim.nbDims = 1;
shape_dim.d[0] = -1;
reshape_layer->setReshapeDimensions(shape_dim);
reshape_layer->setName(("Embeltwise_reshape_layer (Output: max_seqlen " +
op_desc.Output("Out")[0] + ")")
.c_str());
engine_->SetTensorDynamicRange(reshape_layer->getOutput(0), 1.0f);
engine_->SetITensor("max_seqlen_tensor", reshape_layer->getOutput(0));

for (int i = 0; i < input_num; i++) {
auto input_tensor = engine_->GetITensor(id_names[i]);
weight = GetWeight(emb_names[i], &emb_dims);
@@ -156,7 +87,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
input_embs.push_back(weight.get());
emb_sizes.push_back(weight.get().count);
}
hidden = emb_dims[1];
// hidden = emb_dims[1];
Contributor: delete?

Contributor Author: done

}
bias_weight = GetWeight(op_desc.Input("Bias").front(), &bias_dims);
scale_weight = GetWeight(op_desc.Input("Scale").front(), &scale_dims);
@@ -206,26 +137,29 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
plugin_ptr->fields = fields.data();

std::vector<nvinfer1::ITensor*> plugin_inputs = input_ids;
plugin_inputs.emplace_back(engine_->GetITensor(
"max_seqlen_tensor")); // max_seqlen, eval_placeholder_3

plugin_inputs.emplace_back(
engine_->GetITensor("mask_id")); // input mask_id
auto creator = GetPluginRegistry()->getPluginCreator(
"ManyEmbLayerNormPluginDynamic", "1");
auto plugin_obj =
creator->createPlugin("ManyEmbLayerNormPluginDynamic", plugin_ptr);
"ManyEmbLayerNormVarlenPluginDynamic", "1");
auto plugin_obj = creator->createPlugin(
"ManyEmbLayerNormVarlenPluginDynamic", plugin_ptr);

auto plugin_layer = engine_->network()->addPluginV2(
plugin_inputs.data(), plugin_inputs.size(), *plugin_obj);

plugin_layer->setName(("ManyEmbLayerNormPluginDynamic_V1(Output: " +
plugin_layer->setName(("ManyEmbLayerNormVarlenPluginDynamicV1(Output: " +
op_desc.Output("Out")[0] + ")")
.c_str());
free(plugin_ptr);
if (enable_int8) {
float out_scale =
PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold"));
engine_->SetTensorDynamicRange(plugin_layer->getOutput(0), out_scale);
engine_->SetTensorDynamicRange(plugin_layer->getOutput(1), out_scale);
engine_->SetTensorDynamicRange(plugin_layer->getOutput(0),
out_scale); // output
engine_->SetTensorDynamicRange(plugin_layer->getOutput(1),
out_scale); // mask
engine_->SetTensorDynamicRange(plugin_layer->getOutput(2),
out_scale); // max seqlen
Comment on lines +158 to +161

Contributor: the scale of mask and max seqlen is different from out_scale

Contributor Author: the mask and max seqlen tensors are only used to provide shape information; no actual computation is performed on them, so any scale can be used
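For context, a minimal sketch of the pattern under discussion (illustrative only, not part of this diff; the helper name is hypothetical): TensorRT's INT8 mode expects a dynamic range on every network tensor, so the converter assigns one to all three plugin outputs, but only the first output carries activations whose quantization accuracy depends on the chosen value.

// Illustrative sketch, assuming the plugin layer was created via
// engine_->network()->addPluginV2(...) as in the diff above; the helper name
// SetEmbLayerNormOutputRanges is hypothetical.
static void SetEmbLayerNormOutputRanges(TensorRTEngine* engine,
                                        nvinfer1::IPluginV2Layer* plugin_layer,
                                        float out_scale) {
  // Output 0 holds the layernorm result; its range should match calibration.
  engine->SetTensorDynamicRange(plugin_layer->getOutput(0), out_scale);
  // Outputs 1 (mask) and 2 (max_seqlen) are consumed only for their shape,
  // so any positive range satisfies TensorRT's INT8 requirement without
  // affecting numerical results.
  engine->SetTensorDynamicRange(plugin_layer->getOutput(1), out_scale);
  engine->SetTensorDynamicRange(plugin_layer->getOutput(2), out_scale);
}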

}
if (engine_->with_interleaved()) {
VLOG(4) << "fused emb_eltwise_layernorm op: use_varseqlen and "
@@ -249,54 +183,82 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer,
"ManyEmbLayerNormPluginDynamic_V1",
{output_name, std::string("qkv_plugin_mask")},
{output_name,
std::string("qkv_plugin_mask"),
std::string("max_seqlen_tensor")},
test_mode);
}
} else {
for (int i = 0; i < input_num; i++) {
if (with_fp16) {
weight = GetFp16Weight(emb_names[i], &emb_dims);
} else {
weight = GetFp32Weight(emb_names[i], &emb_dims);
}
input_ids.push_back(engine_->GetITensor(id_names[i]));
auto input_tensor = engine_->GetITensor(id_names[i]);
weight = GetWeight(emb_names[i], &emb_dims);
input_ids.push_back(input_tensor);
input_embs.push_back(weight.get());
emb_sizes.push_back(weight.get().count);
hidden = emb_dims[1];
}
if (with_fp16) {
bias_weight = GetFp16Weight(op_desc.Input("Bias").front(), &bias_dims);
scale_weight =
GetFp16Weight(op_desc.Input("Scale").front(), &scale_dims);
} else {
bias_weight = GetFp32Weight(op_desc.Input("Bias").front(), &bias_dims);
scale_weight =
GetFp32Weight(op_desc.Input("Scale").front(), &scale_dims);
// hidden = emb_dims[1];
}
bias_weight = GetWeight(op_desc.Input("Bias").front(), &bias_dims);
scale_weight = GetWeight(op_desc.Input("Scale").front(), &scale_dims);
bias_size = phi::product(bias_dims);
scale_size = phi::product(scale_dims);
float eps = PADDLE_GET_CONST(float, op_desc.GetAttr("epsilon"));
plugin::DynamicPluginTensorRT* plugin = nullptr;
std::vector<void*> input_embs_data;
for (size_t i = 0; i < input_embs.size(); ++i) {
input_embs_data.push_back(const_cast<void*>(
reinterpret_cast<const void*>(input_embs[i].values)));

int output_fp16 = static_cast<int>((engine_->WithFp16() == 1) ? 1 : 0);
if (enable_int8) {
output_fp16 = 1;
}

std::vector<nvinfer1::PluginField> fields;
std::vector<std::string> temp_fields_keys;
fields.emplace_back("bert_embeddings_layernorm_beta",
bias_weight.get().values,
GetPluginFieldType(bias_weight.get().type),
static_cast<int32_t>(bias_size));
fields.emplace_back("bert_embeddings_layernorm_gamma",
scale_weight.get().values,
GetPluginFieldType(scale_weight.get().type),
static_cast<int32_t>(scale_size));
fields.emplace_back(
"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1);
for (int i = 0; i < input_num; ++i) {
temp_fields_keys.push_back("bert_embeddings_word_embeddings_" +
std::to_string(i));
fields.emplace_back(temp_fields_keys.rbegin()->c_str(),
input_embs[i].values,
GetPluginFieldType(input_embs[i].type),
static_cast<int32_t>(emb_sizes[i]));
}

nvinfer1::PluginFieldCollection* plugin_ptr =
static_cast<nvinfer1::PluginFieldCollection*>(
malloc(sizeof(*plugin_ptr) +
fields.size() * sizeof(nvinfer1::PluginField)));
plugin_ptr->nbFields = static_cast<int>(fields.size());
plugin_ptr->fields = fields.data();

std::vector<nvinfer1::ITensor*> plugin_inputs = input_ids;

auto creator = GetPluginRegistry()->getPluginCreator(
"ManyEmbLayerNormPluginDynamic", "1");
auto plugin_obj =
creator->createPlugin("ManyEmbLayerNormPluginDynamic", plugin_ptr);

auto plugin_layer = engine_->network()->addPluginV2(
plugin_inputs.data(), plugin_inputs.size(), *plugin_obj);

plugin_layer->setName(("ManyEmbLayerNormPluginDynamicV1(Output: " +
op_desc.Output("Out")[0] + ")")
.c_str());
free(plugin_ptr);
if (enable_int8) {
float out_scale =
PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold"));
engine_->SetTensorDynamicRange(plugin_layer->getOutput(0),
out_scale); // output
}
plugin = new plugin::EmbEltwiseLayernormPluginDynamic(
input_embs_data,
const_cast<void*>(static_cast<const void*>(bias_weight.get().values)),
const_cast<void*>(
static_cast<const void*>(scale_weight.get().values)),
emb_sizes,
bias_size,
scale_size,
hidden,
eps,
with_fp16);
layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin);
layer = plugin_layer;
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(
layer, "emb_eltwise_layernorm", {output_name}, test_mode);
layer, "ManyEmbLayerNormPluginDynamicV1", {output_name}, test_mode);
}
}
};
@@ -194,10 +194,10 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter {
"max_seqlen_tensor")); // max_seqlen, eval_placeholder_3

auto creator = GetPluginRegistry()->getPluginCreator(
"ManyEmbLayerNormPluginDynamic", "2");
"ManyEmbLayerNormVarlenPluginDynamic", "2");

auto plugin_obj =
creator->createPlugin("ManyEmbLayerNormPluginDynamic", plugin_ptr);
auto plugin_obj = creator->createPlugin(
"ManyEmbLayerNormVarlenPluginDynamic", plugin_ptr);

auto plugin_layer = engine_->network()->addPluginV2(
plugin_inputs.data(), plugin_inputs.size(), *plugin_obj);
5 changes: 3 additions & 2 deletions paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -11,7 +11,6 @@ list(
group_norm_op_plugin.cu
layer_norm_op_plugin.cu
instance_norm_op_plugin.cu
emb_eltwise_layernorm_plugin.cu
qkv_to_context_plugin.cu
skip_layernorm_op_plugin.cu
hard_swish_op_plugin.cu
@@ -37,7 +36,9 @@ list(
merge_layernorm_op_plugin.cu
skip_merge_layernorm_op_plugin.cu
generic_plugin.cu
lookup_table.cu)
lookup_table.cu
many_emb_layernorm_plugin.cu
many_emb_Layernorm_kernel.cu)

if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7 AND NOT WIN32)
list(APPEND TRT_FILES many_emb_layernorm_varseqlen_plugin.cu