From 94c63ded4b7b061d260c309b962c15180783afb0 Mon Sep 17 00:00:00 2001
From: lizexu123 <39205361+lizexu123@users.noreply.github.com>
Date: Mon, 22 Jan 2024 11:05:07 +0800
Subject: [PATCH] [Paddle-TRT] upgrade EnqueueV2 to EnqueueV3 (#60807)

---
 paddle/fluid/inference/tensorrt/engine.cc     | 40 ++++++++++----
 .../inference/tensorrt/test_dynamic_engine.cc | 22 +++++++-
 .../operators/tensorrt/tensorrt_engine_op.h   | 55 ++++++++++++++++++-
 .../inference/test_trt_convert_bitwise_and.py |  2 -
 .../inference/test_trt_convert_bitwise_not.py | 18 +-----
 .../inference/test_trt_convert_bitwise_or.py  |  2 -
 test/ir/inference/test_trt_convert_solve.py   |  2 -
 7 files changed, 105 insertions(+), 36 deletions(-)
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index c91bb59aee823..c934b6175489f 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -13,10 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/engine.h"
-
 #include <NvInfer.h>
 #include <glog/logging.h>
-
 #include <string>
 
 #include "NvInferRuntimeCommon.h"
@@ -174,11 +172,27 @@ bool TensorRTEngine::Enqueue(nvinfer1::IExecutionContext *context,
     return cuda_graph_.Launch(stream);
   }
 
+#if IS_TRT_VERSION_GE(8500)
+  for (size_t j = 0; j < buffers->size(); ++j) {
+    auto name = context->getEngine().getBindingName(j);
+    if (context->getEngine().isShapeBinding(j) &&
+        context->getEngine().bindingIsInput(j)) {
+      continue;
+    } else {
+      context->setTensorAddress(name, (*buffers)[j]);
+    }
+  }
+#endif
+
   bool ret;
   if (!with_dynamic_shape()) {
     ret = context->enqueue(batch_size, buffers->data(), stream, nullptr);
   } else {
+#if IS_TRT_VERSION_GE(8500)
+    ret = context->enqueueV3(stream);
+#else
     ret = context->enqueueV2(buffers->data(), stream, nullptr);
+#endif
   }
   return ret;
 }
@@ -469,12 +483,12 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer,
                         "of the network at the same time.",
                         name));
   network()->markOutput(*output);
-  PADDLE_ENFORCE_EQ(
-      output->isNetworkOutput(),
-      true,
-      platform::errors::InvalidArgument(
-          "The output %s of TRT engine should be the output of the network.",
-          name));
+  PADDLE_ENFORCE_EQ(output->isNetworkOutput(),
+                    true,
+                    platform::errors::InvalidArgument(
+                        "The output %s of TRT engine should be the output "
+                        "of the network.",
+                        name));
 }
 
 void TensorRTEngine::DeclareOutput(const std::string &name) {
@@ -567,8 +581,8 @@ nvinfer1::ITensor *TensorRTEngine::ConvertWeight2ITensor(
     trt_in_shape.nbDims = 1;
     trt_in_shape.d[0] = 1;
   }
-  // In fact , this is not always right, because we can't determine if the 0th
-  // dimension is batch. Just for run chenqu's model
+  // In fact, this is not always right, because we can't determine if the
+  // 0th dimension is batch. It is only done to run chenqu's model.
   if (!with_dynamic_shape()) {
     trt_in_shape.nbDims--;
     for (int i = 0; i < trt_in_shape.nbDims; i++) {
@@ -626,8 +640,10 @@ void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) {
       infer_engine_,
       platform::errors::Fatal(
           "Building TRT cuda engine failed when deserializing engine info. "
-          "Please check:\n1. Your TRT serialization is generated and loaded "
-          "on the same GPU architecture;\n2. The Paddle Inference version of "
+          "Please check:\n1. Your TRT serialization is generated and "
+          "loaded "
+          "on the same GPU architecture;\n2. The Paddle Inference version "
+          "of "
           "generating serialization file and doing inference are "
           "consistent."));
 
diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc
index b565df0ec3d8c..5e44b2c7c5b4f 100644
--- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc
@@ -131,10 +131,13 @@ TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) {
   std::vector<int> shape_v = {8, 8, 4};
   PrepareInputOutput(x_v, {8, 8, 4});
   PrepareShapeInput(shape_v);
+#if IS_TRT_VERSION_GE(8500)
+  engine_->context()->setInputShape("input", nvinfer1::Dims2{8, 32});
+#else
   engine_->context()->setBindingDimensions(0, nvinfer1::Dims2{8, 32});
   engine_->context()->setBindingDimensions(1, shape_dim);
   engine_->context()->setInputShapeBinding(1, shape_v.data());
-
+#endif
   auto *x_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
   auto *shape_gpu_data = shape_.mutable_data<int>(ctx_->GetPlace());
   auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
@@ -142,14 +145,31 @@ TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) {
   buffers[0] = reinterpret_cast<void *>(x_gpu_data);
   buffers[1] = reinterpret_cast<void *>(shape_gpu_data);
   buffers[2] = reinterpret_cast<void *>(y_gpu_data);
+#if IS_TRT_VERSION_GE(8500)
+  for (size_t i = 0; i < buffers.size(); i++) {
+    auto name = engine_->engine()->getBindingName(i);
+    if (engine_->engine()->isShapeBinding(i) &&
+        engine_->engine()->bindingIsInput(i)) {
+      engine_->context()->setTensorAddress(name, shape_v.data());
+    } else {
+      engine_->context()->setTensorAddress(name, buffers[i]);
+    }
+  }
+#endif
 
   engine_->Execute(-1, &buffers, ctx_->stream());
   cudaStreamSynchronize(ctx_->stream());
+
   std::vector<float> y_cpu;
   GetOutput(&y_cpu);
   ASSERT_EQ(y_cpu[0], 0);
   ASSERT_EQ(y_cpu[1], 1);
+#if IS_TRT_VERSION_GE(8500)
+  const char *name1 = engine_->engine()->getBindingName(2);
+  auto dims = engine_->context()->getTensorShape(name1);
+#else
   auto dims = engine_->context()->getBindingDimensions(2);
+#endif
   ASSERT_EQ(dims.nbDims, 3);
   ASSERT_EQ(dims.d[0], 8);
   ASSERT_EQ(dims.d[1], 8);
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 8c75a7bc00f1c..8f67c25e4641f 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -611,6 +611,41 @@ class TensorRTEngineOp : public framework::OperatorBase {
         }
       } else {
 #if IS_TRT_VERSION_GE(6000)
+#if IS_TRT_VERSION_GE(8500)
+        if (engine->engine()->isShapeBinding(bind_index) &&
+            engine->engine()->bindingIsInput(bind_index)) {
+          std::vector<int> shape_v(t.numel());
+          if (t.dtype() == phi::DataType::INT32) {
+            paddle::memory::Copy(platform::CPUPlace(),
+                                 shape_v.data(),
+                                 t.place(),
+                                 t.data<int32_t>(),
+                                 t.numel() * sizeof(int),
+                                 nullptr);
+          } else if (t.dtype() == phi::DataType::INT64) {
+            std::string x_t = x + "_cast_to_INT32";
+            if (scope.FindVar(x_t) == nullptr) {
+              const_cast<framework::Scope *>(&scope)->Var(x_t);
+            }
+            auto int32_tensor =
+                scope.FindVar(x_t)->GetMutable<phi::DenseTensor>();
+            *int32_tensor = phi::Cast<int64_t>(
+                reinterpret_cast<const phi::GPUContext &>(dev_ctx),
+                t,
+                phi::DataType::INT32);
+            paddle::memory::Copy(platform::CPUPlace(),
+                                 shape_v.data(),
+                                 int32_tensor->place(),
+                                 int32_tensor->data<int32_t>(),
+                                 int32_tensor->numel() * sizeof(int),
+                                 nullptr);
+          }
+          trt_context->setTensorAddress(x.c_str(), shape_v.data());
+        } else {
+          trt_context->setInputShape(
+              x.c_str(), inference::tensorrt::Vec2TRT_Dims(t_shape, x, true));
+        }
+#else
         trt_context->setBindingDimensions(
             bind_index, inference::tensorrt::Vec2TRT_Dims(t_shape, x, true));
         // If this x is a shape tensor, we need call setInputShapeBinding
@@ -644,6 +679,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
           }
           trt_context->setInputShapeBinding(bind_index, shape_v.data());
         }
+#endif
 #endif
       }
       runtime_batch = t_shape[0];
@@ -718,7 +754,20 @@ class TensorRTEngineOp : public framework::OperatorBase {
           ddim.push_back(dims.d[i]);
         }
       } else {
-#if IS_TRT_VERSION_GE(6000)
+#if IS_TRT_VERSION_GE(8500)
+        auto x_name = engine->engine()->getBindingName(bind_index);
+        auto dims = trt_context->getTensorShape(x_name);
+        int nb_dims = dims.nbDims;
+        for (; nb_dims > 0; nb_dims--) {
+          // some 'x 1' of shape is normal, no need to remove it
+          if (dims.d[nb_dims - 1] != 1 ||
+              nb_dims == origin_output_rank[output_index])
+            break;
+        }
+        for (int i = 0; i < nb_dims; i++) {
+          ddim.push_back(dims.d[i]);
+        }
+#else
         auto dims = trt_context->getBindingDimensions(bind_index);
         int nb_dims = dims.nbDims;
         for (; nb_dims > 0; nb_dims--) {
@@ -727,7 +776,9 @@ class TensorRTEngineOp : public framework::OperatorBase {
               nb_dims == origin_output_rank[output_index])
             break;
         }
-        for (int i = 0; i < nb_dims; i++) ddim.push_back(dims.d[i]);
+        for (int i = 0; i < nb_dims; i++) {
+          ddim.push_back(dims.d[i]);
+        }
 #endif
       }
       auto *fluid_v = scope.FindVar(y);
diff --git a/test/ir/inference/test_trt_convert_bitwise_and.py b/test/ir/inference/test_trt_convert_bitwise_and.py
index 0bfa21b5a36de..015977b438765 100644
--- a/test/ir/inference/test_trt_convert_bitwise_and.py
+++ b/test/ir/inference/test_trt_convert_bitwise_and.py
@@ -135,12 +135,10 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        program_config.set_input_type(np.float32)
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, True
         ), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        program_config.set_input_type(np.float16)
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, True
         ), 1e-3
diff --git a/test/ir/inference/test_trt_convert_bitwise_not.py b/test/ir/inference/test_trt_convert_bitwise_not.py
index 8d19425011ed4..56971f3d36dfc 100644
--- a/test/ir/inference/test_trt_convert_bitwise_not.py
+++ b/test/ir/inference/test_trt_convert_bitwise_not.py
@@ -33,17 +33,13 @@ def sample_program_configs(self):
         def generate_input1(dims, batch, attrs: List[Dict[str, Any]]):
             if dims == 0:
                 return np.random.random([]).astype(np.bool8)
-            elif dims == 1:
+            if dims == 1:
                 return np.random.random([32]).astype(np.bool8)
-            elif dims == 2:
-                return np.random.random([3, 32]).astype(np.int8)
-            elif dims == 3:
-                return np.random.random([3, 32, 32]).astype(np.int32)
             else:
                 return np.random.random([batch, 3, 32, 32]).astype(np.int64)
 
-        for dims in [0, 1, 2, 3, 4]:
-            for batch in [1, 4]:
+        for dims in [0, 1, 4]:
+            for batch in [1]:
                 self.dims = dims
                 dics = [{}]
 
@@ -84,14 +80,6 @@ def generate_dynamic_shape(attrs):
                 self.dynamic_shape.min_input_shape = {"input_data": [1]}
                 self.dynamic_shape.max_input_shape = {"input_data": [64]}
                 self.dynamic_shape.opt_input_shape = {"input_data": [32]}
-            elif self.dims == 2:
-                self.dynamic_shape.min_input_shape = {"input_data": [1, 16]}
-                self.dynamic_shape.max_input_shape = {"input_data": [4, 32]}
-                self.dynamic_shape.opt_input_shape = {"input_data": [3, 32]}
-            elif self.dims == 3:
-                self.dynamic_shape.min_input_shape = {"input_data": [1, 16, 16]}
-                self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32]}
-                self.dynamic_shape.opt_input_shape = {"input_data": [3, 32, 32]}
             else:
                 self.dynamic_shape.min_input_shape = {
                     "input_data": [1, 3, 16, 16]
diff --git a/test/ir/inference/test_trt_convert_bitwise_or.py b/test/ir/inference/test_trt_convert_bitwise_or.py
index fae933c0cb185..84cef306b4b55 100644
--- a/test/ir/inference/test_trt_convert_bitwise_or.py
+++ b/test/ir/inference/test_trt_convert_bitwise_or.py
@@ -135,12 +135,10 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        program_config.set_input_type(np.float32)
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, True
         ), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        program_config.set_input_type(np.float16)
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, True
         ), 1e-3
diff --git a/test/ir/inference/test_trt_convert_solve.py b/test/ir/inference/test_trt_convert_solve.py
index c3f9b51d0d05c..c3117ee335740 100644
--- a/test/ir/inference/test_trt_convert_solve.py
+++ b/test/ir/inference/test_trt_convert_solve.py
@@ -87,10 +87,8 @@ def clear_dynamic_shape():
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        program_config.set_input_type(np.float32)
         yield self.create_inference_config(), (1, 3), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        program_config.set_input_type(np.float16)
         yield self.create_inference_config(), (1, 3), 1e-3
 
     def test(self):