Add the first implementation of fusion_group op (#19621)

* Add the dynamic load of nvrtc, and support runtime compiling of CUDA kernel using nvrtc. test=develop * Call CUDA driver api to launch the kernel compiled by nvrtc. test=develop * Disable for mac and windows. test=develop * Refine the codes to support manually specified num_threads and workload_per_thread. test=develop * Refine the CUDA kernel to support large dims. test=develop * Add DeviceCodePool to manage all device codes. * Add the first implementation of fusion_group op. * Add unit-test for fusion_group op. * Add the check of result. * Add the check of nvrtc in unit-test. test=develop * Add comment to explain the inputs, outputs and features of fusion_group op. test=develop * Disable fusion_group op for mac and windows. test=develop * Make the compiling of device code return status instead of hanging up. test=develop * Add the check of whether there is CUDA driver library, and do not core dump when failing to call the CUDA driver API. * Unify fusion_group_op's input and output names. test=develop * Add the check of CUDA driver library in unittest. test=develop * Refine the calling of PADDLE_ENFORCE. test=develop
PaddlePaddle · Jan 3, 2020 · d483207 · d483207
1 parent 6192108
commit d483207
Show file tree

Hide file tree

Showing 14 changed files with 658 additions and 45 deletions.
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
@@ -116,7 +116,9 @@ function(op_library TARGET)
     # Define operators that don't need pybind here.
     foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
-"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op" "multihead_matmul_op")
+"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
+"sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
+"multihead_matmul_op" "fusion_group_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()

diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
@@ -82,7 +82,7 @@ void FusionGroupPass::InsertFusionGroupOp(
     input_names.push_back(n->Name());
     external_nodes.insert(n);
   }
-  op_desc.SetInput("Xs", input_names);
+  op_desc.SetInput("Inputs", input_names);
 
   std::vector<std::string> output_names;
   for (auto* n : output_vars_of_subgraph) {

diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -4,7 +4,8 @@ register_operators(EXCLUDES
     fusion_transpose_flatten_concat_op
     fusion_conv_inception_op
     fused_fc_elementwise_layernorm_op
-    multihead_matmul_op)
+    multihead_matmul_op
+    fusion_group_op)
 
 if (WITH_GPU)
     # conv_fusion_op needs cudnn 7 above
@@ -26,4 +27,10 @@ if (WITH_GPU)
     # multihead_matmul_op
     op_library(multihead_matmul_op)
     file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(multihead_matmul);\n")
+    # fusion_group
+    if(NOT APPLE AND NOT WIN32)
+        op_library(fusion_group_op DEPS device_code)
+        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_group);\n")
+        cc_test(test_fusion_group_op SRCS fusion_group_op_test.cc DEPS fusion_group_op)
+    endif()
 endif()
diff --git a/paddle/fluid/operators/fused/fusion_group_op.cc b/paddle/fluid/operators/fused/fusion_group_op.cc
@@ -0,0 +1,90 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused/fusion_group_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator definition of fusion_group. It validates the "Inputs"/"Outs"
+// slots and the "type" attribute, and infers the dims and LoD of the outputs.
+class FusionGroupOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Checks that there is at least one input and one output, that the fusion
+  // type is 0 (elementwise) and that all inputs have identical dims. Every
+  // output then receives the dims of Inputs[0] and shares the LoD of
+  // Inputs[0].
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    const size_t num_ins = ctx->Inputs("Inputs").size();
+    const size_t num_outs = ctx->Outputs("Outs").size();
+
+    PADDLE_ENFORCE_GE(
+        num_ins, 1UL,
+        platform::errors::InvalidArgument(
+            "Expected the number of inputs >= 1. Received %d.", num_ins));
+    PADDLE_ENFORCE_GE(
+        num_outs, 1UL,
+        platform::errors::InvalidArgument(
+            "Expected the number of outputs >= 1. Received %d.", num_outs));
+
+    // "type" is a signed int attribute, so it is compared with a signed 0
+    // rather than an unsigned literal.
+    int type = ctx->Attrs().Get<int>("type");
+    PADDLE_ENFORCE_EQ(type, 0,
+                      platform::errors::InvalidArgument(
+                          "Only support fusion of elementwise operations."));
+
+    std::vector<framework::DDim> x_dims = ctx->GetInputsDim("Inputs");
+    if (type == 0) {
+      // Elementwise fusion: every input must have the dims of Inputs[0].
+      for (size_t i = 1; i < num_ins; ++i) {
+        PADDLE_ENFORCE_EQ(x_dims[0], x_dims[i],
+                          platform::errors::InvalidArgument(
+                              "All the inputs' dims should be the same."));
+      }
+      // Every output gets the dims of Inputs[0].
+      std::vector<framework::DDim> out_dims(num_outs, x_dims[0]);
+      ctx->SetOutputsDim("Outs", out_dims);
+    }
+
+    // Only lod of Inputs[0] would be shared with Outs.
+    for (size_t j = 0; j < num_outs; ++j) {
+      ctx->ShareLoD("Inputs", /*->*/ "Outs", 0, j);
+    }
+  }
+};
+
+// Declares the proto of fusion_group: the duplicable "Inputs" and "Outs"
+// slots, the "type" and "func_name" attributes, and the operator comment.
+class FusionGroupOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Inputs",
+             "(std::vector<LoDTensor>) The inputs of fusion_group op.")
+        .AsDuplicable();
+    AddOutput("Outs",
+              "(std::vector<LoDTensor>) The outputs of fusion_group op.")
+        .AsDuplicable();
+    // The default type 0 is the elementwise fusion described in the comment
+    // below.
+    AddAttr<int>("type", "Fusion type.").SetDefault(0);
+    AddAttr<std::string>("func_name", "Name of the generated functions.")
+        .SetDefault("");
+    AddComment(R"DOC(
+fusion_group Operator.
+
+It is used to execute a generated CUDA kernel which fuses the computation of
+multiple operators into one. It supports several types:
+0, fused computation of elementwise operations in which all the dims of inputs
+    and outputs should be exactly the same.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Register the fusion_group operator with its definition class and its proto
+// maker.
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fusion_group, ops::FusionGroupOp, ops::FusionGroupOpMaker);
diff --git a/paddle/fluid/operators/fused/fusion_group_op.cu.cc b/paddle/fluid/operators/fused/fusion_group_op.cu.cc
@@ -0,0 +1,22 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused/fusion_group_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+// Register the double and float CUDA kernels of fusion_group.
+REGISTER_OP_CUDA_KERNEL(
+    fusion_group, ops::FusionGroupKernel<plat::CUDADeviceContext, double>,
+    ops::FusionGroupKernel<plat::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/fused/fusion_group_op.h b/paddle/fluid/operators/fused/fusion_group_op.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_code.h"
+
+namespace paddle {
+namespace operators {
+
+// Kernel of fusion_group. It fetches the device code stored in the
+// DeviceCodePool under the "func_name" attribute and launches it with the
+// element count and the data pointers of all inputs and outputs.
+template <typename DeviceContext, typename T>
+class FusionGroupKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto inputs = ctx.MultiInput<framework::LoDTensor>("Inputs");
+    auto outputs = ctx.MultiOutput<framework::LoDTensor>("Outs");
+    const int fusion_type = ctx.Attr<int>("type");
+
+    const size_t input_count = inputs.size();
+    const size_t output_count = outputs.size();
+
+    // Call mutable_data<T>() on every output for the current place
+    // (presumably this allocates the output buffers -- confirm in LoDTensor).
+    auto place = ctx.GetPlace();
+    for (auto* out : outputs) {
+      out->mutable_data<T>(place);
+    }
+
+    std::string func_name = ctx.Attr<std::string>("func_name");
+    platform::DeviceCode* dev_code =
+        platform::DeviceCodePool::Instance().Get(place, func_name);
+    VLOG(3) << "func_name: " << func_name;
+
+    // Only fusion type 0 launches a kernel.
+    if (fusion_type != 0) {
+      return;
+    }
+
+    size_t numel = inputs[0]->numel();
+
+    // Data pointers of the inputs first, followed by those of the outputs.
+    std::vector<const T*> data_ptrs(input_count + output_count);
+    for (size_t idx = 0; idx < input_count; ++idx) {
+      data_ptrs[idx] = inputs[idx]->data<T>();
+    }
+    for (size_t idx = 0; idx < output_count; ++idx) {
+      data_ptrs[input_count + idx] = outputs[idx]->data<T>();
+    }
+
+    // The argument list holds the address of the element count followed by
+    // the address of each data pointer, in the same order as data_ptrs.
+    std::vector<void*> kernel_args;
+    kernel_args.push_back(&numel);
+    for (auto& ptr : data_ptrs) {
+      kernel_args.push_back(&ptr);
+    }
+    dev_code->Launch(numel, &kernel_args);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle