[Paddle Inference] General optimization for no_varlen embedding layernorm #48580
Changes from 2 commits
@@ -13,7 +13,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/utils.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
-#include "paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_plugin.h"
 #include "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.h"
 #include "paddle/phi/core/ddim.h"
@@ -36,7 +36,6 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
                   const framework::Scope& scope,
                   bool test_mode) override {
     VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer";
-
     // get the presistable var's data
     auto GetWeight = [&](const std::string& var_name,
                          framework::DDim* dim) -> TensorRTEngine::Weight {
@@ -47,32 +46,13 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { | |
return weight; | ||
}; | ||
|
||
auto GetFp16Weight = [&](const std::string& var_name, | ||
framework::DDim* dim) -> TensorRTEngine::Weight { | ||
auto* temp_var = scope.FindVar(var_name); | ||
auto* temp_tensor = temp_var->GetMutable<phi::DenseTensor>(); | ||
*dim = temp_tensor->dims(); | ||
auto weight = engine_->GetFp16TrtWeight(var_name, *temp_tensor); | ||
return weight; | ||
}; | ||
|
||
auto GetFp32Weight = [&](const std::string& var_name, | ||
framework::DDim* dim) -> TensorRTEngine::Weight { | ||
auto* temp_var = scope.FindVar(var_name); | ||
auto* temp_tensor = temp_var->GetMutable<phi::DenseTensor>(); | ||
*dim = temp_tensor->dims(); | ||
auto weight = engine_->GetFp32TrtWeight(var_name, *temp_tensor); | ||
return weight; | ||
}; | ||
|
||
framework::OpDesc op_desc(op, nullptr); | ||
auto pos_id_name = engine_->tensorrt_transformer_posid(); | ||
auto mask_id_name = engine_->tensorrt_transformer_maskid(); | ||
bool flag_varseqlen = | ||
engine_->use_varseqlen() && pos_id_name != "" && mask_id_name != ""; | ||
bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); | ||
int hidden = 0; | ||
// Declare inputs | ||
// bool with_fp16 = engine_->WithFp16() && | ||
// !engine_->disable_trt_plugin_fp16(); int hidden = 0; Declare inputs | ||
std::vector<nvinfer1::ITensor*> input_ids; | ||
|
||
// Declare inputs_weight | ||
|
@@ -95,55 +75,6 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
     if (flag_varseqlen) {
       engine_->SetITensor("pos_id", engine_->GetITensor(pos_id_name));
       engine_->SetITensor("mask_id", engine_->GetITensor(mask_id_name));
-
-      auto mask_id_tensor = engine_->GetITensor("mask_id");
-      auto mask_dims = mask_id_tensor->getDimensions();
-      auto slice_start_dims = mask_dims;
-      auto slice_stride_dims = mask_dims;
-
-      for (int i = 0; i < mask_dims.nbDims; i++) {
-        slice_start_dims.d[i] = 0;
-        slice_stride_dims.d[i] = 1;
-      }
-
-      auto* shape_tensor = Shape(mask_id_tensor);
-      std::vector<nvinfer1::ITensor*> size_vec_tensor;
-      std::vector<nvinfer1::ITensor*> start_vec_tensor;
-      for (int i = 0; i < mask_dims.nbDims; i++) {
-        size_vec_tensor.push_back(Add1DConstantLayer(1));
-        start_vec_tensor.push_back(Add1DConstantLayer(0));
-      }
-      size_vec_tensor[1] = GetEleTensorOfShape(shape_tensor, 1);
-      auto size_tensor = Concat(size_vec_tensor);
-      auto start_tensor = Concat(start_vec_tensor);
-
-      auto slice_layer =
-          TRT_ENGINE_ADD_LAYER(engine_,
-                               Slice,
-                               *mask_id_tensor,
-                               slice_start_dims,
-                               slice_start_dims,
-                               slice_stride_dims);  // unuseful slice_start_dims
-      slice_layer->setInput(1, *start_tensor);
-      slice_layer->setInput(2, *size_tensor);
-      slice_layer->setName(
-          ("Embeltwise_slice_layer (Output: slice_max_seqlen " +
-           op_desc.Output("Out")[0] + ")")
-              .c_str());
-      engine_->SetTensorDynamicRange(slice_layer->getOutput(0), 1.0f);
-
-      auto* reshape_layer =
-          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *slice_layer->getOutput(0));
-      nvinfer1::Dims shape_dim;
-      shape_dim.nbDims = 1;
-      shape_dim.d[0] = -1;
-      reshape_layer->setReshapeDimensions(shape_dim);
-      reshape_layer->setName(("Embeltwise_reshape_layer (Output: max_seqlen " +
-                              op_desc.Output("Out")[0] + ")")
-                                 .c_str());
-      engine_->SetTensorDynamicRange(reshape_layer->getOutput(0), 1.0f);
-      engine_->SetITensor("max_seqlen_tensor", reshape_layer->getOutput(0));
-
       for (int i = 0; i < input_num; i++) {
         auto input_tensor = engine_->GetITensor(id_names[i]);
         weight = GetWeight(emb_names[i], &emb_dims);
@@ -156,7 +87,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
           input_embs.push_back(weight.get());
           emb_sizes.push_back(weight.get().count);
         }
-        hidden = emb_dims[1];
+        // hidden = emb_dims[1];
       }
       bias_weight = GetWeight(op_desc.Input("Bias").front(), &bias_dims);
       scale_weight = GetWeight(op_desc.Input("Scale").front(), &scale_dims);

Review comment: delete?
Author reply: done
@@ -206,26 +137,29 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { | |
plugin_ptr->fields = fields.data(); | ||
|
||
std::vector<nvinfer1::ITensor*> plugin_inputs = input_ids; | ||
plugin_inputs.emplace_back(engine_->GetITensor( | ||
"max_seqlen_tensor")); // max_seqlen, eval_placeholder_3 | ||
|
||
plugin_inputs.emplace_back( | ||
engine_->GetITensor("mask_id")); // input mask_id | ||
auto creator = GetPluginRegistry()->getPluginCreator( | ||
"ManyEmbLayerNormPluginDynamic", "1"); | ||
auto plugin_obj = | ||
creator->createPlugin("ManyEmbLayerNormPluginDynamic", plugin_ptr); | ||
"ManyEmbLayerNormVarlenPluginDynamic", "1"); | ||
auto plugin_obj = creator->createPlugin( | ||
"ManyEmbLayerNormVarlenPluginDynamic", plugin_ptr); | ||
|
||
auto plugin_layer = engine_->network()->addPluginV2( | ||
plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); | ||
|
||
plugin_layer->setName(("ManyEmbLayerNormPluginDynamic_V1(Output: " + | ||
plugin_layer->setName(("ManyEmbLayerNormVarlenPluginDynamicV1(Output: " + | ||
op_desc.Output("Out")[0] + ")") | ||
.c_str()); | ||
free(plugin_ptr); | ||
if (enable_int8) { | ||
float out_scale = | ||
PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold")); | ||
engine_->SetTensorDynamicRange(plugin_layer->getOutput(0), out_scale); | ||
engine_->SetTensorDynamicRange(plugin_layer->getOutput(1), out_scale); | ||
engine_->SetTensorDynamicRange(plugin_layer->getOutput(0), | ||
out_scale); // output | ||
engine_->SetTensorDynamicRange(plugin_layer->getOutput(1), | ||
out_scale); // mask | ||
engine_->SetTensorDynamicRange(plugin_layer->getOutput(2), | ||
out_scale); // max seqlen | ||
Comment on lines
+158
to
+161
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the scale of mask and max seqlen is different from out_scale There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. mask and max seqlen 这两个tensor只用来提供shape信息,没有实际计算,可以用任意scale |
||
} | ||
if (engine_->with_interleaved()) { | ||
VLOG(4) << "fused emb_eltwise_layernorm op: use_varseqlen and " | ||
|
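The exchange above is easier to see in raw TensorRT terms. A minimal sketch (not part of this PR; the helper name is hypothetical) of the point in the reply: only output 0 carries real activations and needs the calibrated scale, while the mask and max-seqlen outputs are consumed for their shape alone, so any non-zero dynamic range satisfies the INT8 builder.

#include "NvInfer.h"

void SetEmbLayerNormOutputRanges(nvinfer1::IPluginV2Layer* plugin_layer,
                                 float out_scale) {
  // Output 0 is real data: use the scale calibrated from out_threshold.
  plugin_layer->getOutput(0)->setDynamicRange(-out_scale, out_scale);
  // Outputs 1 and 2 (mask, max seqlen) only feed shape information
  // downstream; they never enter INT8 arithmetic, so an arbitrary
  // placeholder range such as [-1, 1] is sufficient.
  plugin_layer->getOutput(1)->setDynamicRange(-1.0f, 1.0f);
  plugin_layer->getOutput(2)->setDynamicRange(-1.0f, 1.0f);
}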
@@ -249,54 +183,82 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
       auto output_name = op_desc.Output("Out")[0];
       RreplenishLayerAndOutput(layer,
                                "ManyEmbLayerNormPluginDynamic_V1",
-                               {output_name, std::string("qkv_plugin_mask")},
+                               {output_name,
+                                std::string("qkv_plugin_mask"),
+                                std::string("max_seqlen_tensor")},
                                test_mode);
       }
     } else {
       for (int i = 0; i < input_num; i++) {
-        if (with_fp16) {
-          weight = GetFp16Weight(emb_names[i], &emb_dims);
-        } else {
-          weight = GetFp32Weight(emb_names[i], &emb_dims);
-        }
-        input_ids.push_back(engine_->GetITensor(id_names[i]));
+        auto input_tensor = engine_->GetITensor(id_names[i]);
+        weight = GetWeight(emb_names[i], &emb_dims);
+        input_ids.push_back(input_tensor);
         input_embs.push_back(weight.get());
         emb_sizes.push_back(weight.get().count);
-        hidden = emb_dims[1];
-      }
-      if (with_fp16) {
-        bias_weight = GetFp16Weight(op_desc.Input("Bias").front(), &bias_dims);
-        scale_weight =
-            GetFp16Weight(op_desc.Input("Scale").front(), &scale_dims);
-      } else {
-        bias_weight = GetFp32Weight(op_desc.Input("Bias").front(), &bias_dims);
-        scale_weight =
-            GetFp32Weight(op_desc.Input("Scale").front(), &scale_dims);
+        // hidden = emb_dims[1];
       }
+      bias_weight = GetWeight(op_desc.Input("Bias").front(), &bias_dims);
+      scale_weight = GetWeight(op_desc.Input("Scale").front(), &scale_dims);
       bias_size = phi::product(bias_dims);
       scale_size = phi::product(scale_dims);
-      float eps = PADDLE_GET_CONST(float, op_desc.GetAttr("epsilon"));
-      plugin::DynamicPluginTensorRT* plugin = nullptr;
-      std::vector<void*> input_embs_data;
-      for (size_t i = 0; i < input_embs.size(); ++i) {
-        input_embs_data.push_back(const_cast<void*>(
-            reinterpret_cast<const void*>(input_embs[i].values)));
+      int output_fp16 = static_cast<int>((engine_->WithFp16() == 1) ? 1 : 0);
+      if (enable_int8) {
+        output_fp16 = 1;
       }

+      std::vector<nvinfer1::PluginField> fields;
+      std::vector<std::string> temp_fields_keys;
+      fields.emplace_back("bert_embeddings_layernorm_beta",
+                          bias_weight.get().values,
+                          GetPluginFieldType(bias_weight.get().type),
+                          static_cast<int32_t>(bias_size));
+      fields.emplace_back("bert_embeddings_layernorm_gamma",
+                          scale_weight.get().values,
+                          GetPluginFieldType(scale_weight.get().type),
+                          static_cast<int32_t>(scale_size));
+      fields.emplace_back(
+          "output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1);
+      for (int i = 0; i < input_num; ++i) {
+        temp_fields_keys.push_back("bert_embeddings_word_embeddings_" +
+                                   std::to_string(i));
+        fields.emplace_back(temp_fields_keys.rbegin()->c_str(),
+                            input_embs[i].values,
+                            GetPluginFieldType(input_embs[i].type),
+                            static_cast<int32_t>(emb_sizes[i]));
+      }
+
+      nvinfer1::PluginFieldCollection* plugin_ptr =
+          static_cast<nvinfer1::PluginFieldCollection*>(
+              malloc(sizeof(*plugin_ptr) +
+                     fields.size() * sizeof(nvinfer1::PluginField)));
+      plugin_ptr->nbFields = static_cast<int>(fields.size());
+      plugin_ptr->fields = fields.data();
+
+      std::vector<nvinfer1::ITensor*> plugin_inputs = input_ids;
+
+      auto creator = GetPluginRegistry()->getPluginCreator(
+          "ManyEmbLayerNormPluginDynamic", "1");
+      auto plugin_obj =
+          creator->createPlugin("ManyEmbLayerNormPluginDynamic", plugin_ptr);
+
+      auto plugin_layer = engine_->network()->addPluginV2(
+          plugin_inputs.data(), plugin_inputs.size(), *plugin_obj);
+
+      plugin_layer->setName(("ManyEmbLayerNormPluginDynamicV1(Output: " +
+                             op_desc.Output("Out")[0] + ")")
+                                .c_str());
+      free(plugin_ptr);
+      if (enable_int8) {
+        float out_scale =
+            PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+        engine_->SetTensorDynamicRange(plugin_layer->getOutput(0),
+                                       out_scale);  // output
+      }
-      plugin = new plugin::EmbEltwiseLayernormPluginDynamic(
-          input_embs_data,
-          const_cast<void*>(static_cast<const void*>(bias_weight.get().values)),
-          const_cast<void*>(
-              static_cast<const void*>(scale_weight.get().values)),
-          emb_sizes,
-          bias_size,
-          scale_size,
-          hidden,
-          eps,
-          with_fp16);
-      layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin);
+      layer = plugin_layer;
       auto output_name = op_desc.Output("Out")[0];
       RreplenishLayerAndOutput(
-          layer, "emb_eltwise_layernorm", {output_name}, test_mode);
+          layer, "ManyEmbLayerNormPluginDynamicV1", {output_name}, test_mode);
     }
   }
 };
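For reference, the plugin-registry pattern that the new non-varlen branch relies on, reduced to a standalone sketch against the raw TensorRT API (the helper name is hypothetical; the converter itself goes through Paddle's engine_ wrappers):

#include <cstdlib>
#include <vector>

#include "NvInfer.h"

nvinfer1::ILayer* AddEmbLayerNormPlugin(
    nvinfer1::INetworkDefinition* network,
    std::vector<nvinfer1::ITensor*>& inputs,
    std::vector<nvinfer1::PluginField>& fields) {
  // Heap-allocate the field collection, mirroring the converter's malloc;
  // the field storage itself stays owned by the caller's vector.
  auto* fc = static_cast<nvinfer1::PluginFieldCollection*>(
      malloc(sizeof(nvinfer1::PluginFieldCollection) +
             fields.size() * sizeof(nvinfer1::PluginField)));
  fc->nbFields = static_cast<int>(fields.size());
  fc->fields = fields.data();

  // Version "1" must match the registered creator for this plugin name.
  auto* creator = getPluginRegistry()->getPluginCreator(
      "ManyEmbLayerNormPluginDynamic", "1");
  nvinfer1::IPluginV2* plugin =
      creator->createPlugin("ManyEmbLayerNormPluginDynamic", fc);

  auto* layer = network->addPluginV2(
      inputs.data(), static_cast<int>(inputs.size()), *plugin);
  free(fc);  // safe once createPlugin has consumed the field data
  return layer;
}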