merge develop

PaddlePaddle · Dec 27, 2021 · 212d98a · 212d98a
2 parents 25f03ac + 9cfdae9
commit 212d98a
Show file tree

Hide file tree

Showing 82 changed files with 1,291 additions and 882 deletions.
diff --git a/cmake/pten_kernel.cmake b/cmake/pten_kernel.cmake
@@ -18,7 +18,7 @@ function(kernel_declare TARGET_LIST)
         file(READ ${kernel_path} kernel_impl)
         # TODO(chenweihang): rename PT_REGISTER_CTX_KERNEL to PT_REGISTER_KERNEL
         # NOTE(chenweihang): for now we don't recommend using digits in kernel names
-        string(REGEX MATCH "(PT_REGISTER_CTX_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z_]*," first_registry "${kernel_impl}")
+        string(REGEX MATCH "(PT_REGISTER_CTX_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}")
         if (NOT first_registry STREQUAL "")
             # parse the first kernel name
             string(REPLACE "PT_REGISTER_CTX_KERNEL(" "" kernel_name "${first_registry}")
@@ -33,8 +33,6 @@ function(kernel_declare TARGET_LIST)
                 file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n")
             elseif (${kernel_path} MATCHES "./xpu\/")
                 file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
-            elseif (${kernel_path} MATCHES "./npu\/*")
-                file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, NPU, ALL_LAYOUT);\n")
             else ()
                 # deal with device independent kernel, now we use CPU temporarily
                 file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
@@ -48,7 +46,9 @@ function(kernel_library TARGET)
     set(cpu_srcs)
     set(gpu_srcs)
     set(xpu_srcs)
-    set(npu_srcs)
+    # parse and save the deps kernel targets
+    set(all_srcs)
+    set(kernel_deps)
 
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
@@ -57,7 +57,6 @@ function(kernel_library TARGET)
 
     list(LENGTH kernel_library_SRCS kernel_library_SRCS_len)
     # one kernel only match one impl file in each backend
-    # TODO(chenweihang): parse compile deps by include headers
     if (${kernel_library_SRCS_len} EQUAL 0)
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
             list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
@@ -75,57 +74,68 @@ function(kernel_library TARGET)
                 list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
             endif()
         endif()
-        if (WITH_ASCEND_CL)
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/npu/${TARGET}.cc)
-                list(APPEND npu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/npu/${TARGET}.cc)
-            endif()
-        endif()
     else()
         # TODO(chenweihang): impl compile by source later
     endif()
 
+    list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.h)
+    list(APPEND all_srcs ${common_srcs})
+    list(APPEND all_srcs ${cpu_srcs})
+    list(APPEND all_srcs ${gpu_srcs})
+    list(APPEND all_srcs ${xpu_srcs})
+    foreach(src ${all_srcs})
+        file(READ ${src} target_content)
+        string(REGEX MATCHALL "#include \"paddle\/pten\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
+        foreach(include_kernel ${include_kernels})
+            string(REGEX REPLACE "#include \"paddle\/pten\/kernels\/" "" kernel_name ${include_kernel})
+            string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
+            list(APPEND kernel_deps ${kernel_name})
+        endforeach()
+    endforeach()
+    list(REMOVE_DUPLICATES kernel_deps)
+    list(REMOVE_ITEM kernel_deps ${TARGET})
+
     list(LENGTH common_srcs common_srcs_len)
     list(LENGTH cpu_srcs cpu_srcs_len)
     list(LENGTH gpu_srcs gpu_srcs_len)
     list(LENGTH xpu_srcs xpu_srcs_len)
-    list(LENGTH npu_srcs npu_srcs_len)
 
     if (${common_srcs_len} GREATER 0)
         # If the kernel has a device independent public implementation,
         # we will use this implementation and will not adopt the implementation
         # under specific devices
         if (WITH_GPU)
-            nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
+            nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         elseif (WITH_ROCM)
-            hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
+            hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         else()
-            cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
+            cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         endif()
     else()
         # If the kernel has a header file declaration, but no corresponding
         # implementation can be found, this is not allowed
         if (${cpu_srcs_len} EQUAL 0 AND ${gpu_srcs_len} EQUAL 0 AND
-            ${xpu_srcs_len} EQUAL 0 AND ${npu_srcs_len} EQUAL 0)
+            ${xpu_srcs_len} EQUAL 0)
             message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
         else()
             if (WITH_GPU)
                 if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
-                    nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS})
+                    nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
                 endif()
             elseif (WITH_ROCM)
                 if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
-                    hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS})
+                    hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
                 endif()
             else()
-                if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${npu_srcs_len} GREATER 0)
-                    cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${npu_srcs} DEPS ${kernel_library_DEPS})
+                if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
+                    cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
                 endif()
             endif()
         endif()
     endif()
 
-    if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
-        ${xpu_srcs_len} GREATER 0 OR ${npu_srcs_len} GREATER 0)
+    if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
+        ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
         # append target into PTEN_KERNELS property
         get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
         set(pten_kernels ${pten_kernels} ${TARGET})
@@ -147,9 +157,6 @@ function(kernel_library TARGET)
     if (${xpu_srcs_len} GREATER 0)
         kernel_declare(${xpu_srcs})
     endif()
-    if (${npu_srcs_len} GREATER 0)
-        kernel_declare(${npu_srcs})
-    endif()
 endfunction()
 
 function(register_kernels)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
@@ -91,15 +91,16 @@ endif()
 cc_test(copy_same_tensor_test SRCS copy_same_tensor_test.cc DEPS tensor)
 
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
+cc_library(mixed_vector SRCS mixed_vector.cc DEPS device_context)
 
 if(WITH_GPU)
-  nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
+  nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor)
 elseif(WITH_ROCM)
-  hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
+  hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor)
 else()
-  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
+  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS mixed_vector place memory device_context tensor)
 endif()
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim mixed_vector place tensor framework_proto version)
 
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 

diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc
@@ -67,12 +67,6 @@ FuseBatchNormActOneDNNPass::FuseBatchNormActOneDNNPass() {
       .AddAttr("epsilon")
       .IsNumGE(0.0f)
       .IsNumLE(0.001f)
-      .End()
-      .AddAttr("trainable_statistics")
-      .IsBoolEQ(false)
-      .End()
-      .AddAttr("is_test")
-      .IsBoolEQ(true)
       .End();
 
   AddOpCompat(OpCompat("relu"))
@@ -114,21 +108,19 @@ void FuseBatchNormActOneDNNPass::FuseBatchNormAct(
     GET_IR_NODE_FROM_SUBGRAPH(act, act, bn_act_pattern);
 
     auto *bn_op = batch_norm->Op();
-    if (bn_op->HasAttr("use_mkldnn")) {
+    if (bn_op->HasAttr("trainable_statistics")) {
       PADDLE_ENFORCE(
-          BOOST_GET_CONST(bool, bn_op->GetAttr("use_mkldnn")),
+          !BOOST_GET_CONST(bool, bn_op->GetAttr("trainable_statistics")),
           platform::errors::PreconditionNotMet(
-              "The BatchNorm+Act fusion may happen only when oneDNN library "
-              "is used."));
+              "The BatchNorm+Act fusion may happen only when mean and variance "
+              "are not calculated by current batch statistics."));
     }
 
-    auto *act_op = act->Op();
-    if (act_op->HasAttr("use_mkldnn")) {
+    if (bn_op->HasAttr("is_test")) {
       PADDLE_ENFORCE(
-          BOOST_GET_CONST(bool, bn_op->GetAttr("use_mkldnn")),
+          BOOST_GET_CONST(bool, bn_op->GetAttr("is_test")),
           platform::errors::PreconditionNotMet(
-              "The BatchNorm+Act fusion may happen only when oneDNN library "
-              "is used."));
+              "The BatchNorm+Act fusion may happen only during inference."));
     }
 
     bn_op->SetAttr("use_mkldnn", true);

diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc
@@ -65,9 +65,9 @@ TEST(FuseBatchNormActOneDNNPass, ThrowIsTestTrainableStats) {
   // No fusion in this attribute configuration
   constexpr int removed_nodes_count = 0;
 
-  EXPECT_TRUE(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
-                                     "act_y", removed_nodes_count));
-  EXPECT_TRUE(test::AssertOpsCount(graph, {{"batch_norm", 1}, {"relu", 1}}));
+  EXPECT_THROW(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
+                                      "act_y", removed_nodes_count),
+               paddle::platform::EnforceNotMet);
 }
 
 TEST(FuseBatchNormActOneDNNPass, FuseIsTest) {
@@ -123,9 +123,9 @@ TEST(FuseBatchNormActOneDNNPass, ThrowTrainableStats) {
   // No fusion in this attribute configuration
   constexpr int removed_nodes_count = 0;
 
-  EXPECT_TRUE(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
-                                     "act_y", removed_nodes_count));
-  EXPECT_TRUE(test::AssertOpsCount(graph, {{"batch_norm", 1}, {"relu", 1}}));
+  EXPECT_THROW(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
+                                      "act_y", removed_nodes_count),
+               paddle::platform::EnforceNotMet);
 }
 
 TEST(FuseBatchNormActOneDNNPass, AllAttrsFalse) {
@@ -149,9 +149,9 @@ TEST(FuseBatchNormActOneDNNPass, AllAttrsFalse) {
   // No fusion in this attribute configuration
   constexpr int removed_nodes_count = 0;
 
-  EXPECT_TRUE(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
-                                     "act_y", removed_nodes_count));
-  EXPECT_TRUE(test::AssertOpsCount(graph, {{"batch_norm", 1}, {"relu", 1}}));
+  EXPECT_THROW(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
+                                      "act_y", removed_nodes_count),
+               paddle::platform::EnforceNotMet);
 }
 
 TEST(FuseBatchNormActOneDNNPass, ThrowUseMkldnn) {
@@ -176,9 +176,9 @@ TEST(FuseBatchNormActOneDNNPass, ThrowUseMkldnn) {
   // No fusion in this attribute configuration
   constexpr int removed_nodes_count = 0;
 
-  EXPECT_TRUE(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
-                                     "act_y", removed_nodes_count));
-  EXPECT_TRUE(test::AssertOpsCount(graph, {{"batch_norm", 1}, {"relu", 1}}));
+  EXPECT_THROW(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
+                                      "act_y", removed_nodes_count),
+               paddle::platform::EnforceNotMet);
 }
 
 TEST(FuseBatchNormActOneDNNPass, pass_op_version_check) {

diff --git a/paddle/fluid/framework/mixed_vector.cc b/paddle/fluid/framework/mixed_vector.cc
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/mixed_vector.h"
+
+#include <algorithm>
+#include <initializer_list>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
+#include <vector>
+
+#include "glog/logging.h"
+#include "paddle/fluid/framework/details/cow_ptr.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/utils/none.h"
+#include "paddle/utils/optional.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+void CopyToCPUHelper(std::vector<T> *cpu_, paddle::memory::AllocationPtr *gpu_,
+                     size_t *gpu_memory_size_) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  // COPY GPU Data To CPU
+  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get((*gpu_)->place()));
+  auto stream = dev_ctx->stream();
+  void *src = (*gpu_)->ptr();
+  void *dst = cpu_->data();
+  paddle::memory::Copy(platform::CPUPlace(), dst,
+                       OptionalCUDAPlace(*gpu_).get(), src, *gpu_memory_size_,
+                       stream);
+  dev_ctx->Wait();
+#endif
+}
+
+template <typename T>
+void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
+                             paddle::memory::AllocationPtr *gpu_,
+                             size_t *gpu_memory_size_,
+                             const platform::Place &place) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void *src = cpu_->data();
+  *gpu_memory_size_ = cpu_->size() * sizeof(T);  // sizeof(T)
+  (*gpu_) = memory::Alloc(place, *gpu_memory_size_);
+  void *dst = (*gpu_)->ptr();
+  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get(place));
+  auto stream = dev_ctx->stream();
+  paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(), dst,
+                       platform::CPUPlace(), src, *gpu_memory_size_, stream);
+#endif
+}
+
+#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__)                                  \
+  template <>                                                                  \
+  void Vector<__TYPE__>::VectorData::CopyToCPU() const {                       \
+    CopyToCPUHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_);                \
+  }                                                                            \
+                                                                               \
+  template <>                                                                  \
+  void Vector<__TYPE__>::VectorData::CopyCPUDataToCUDA(                        \
+      const platform::Place &place) const {                                    \
+    CopyCPUDataToCUDAHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_, place); \
+  }
+
+INSTANTIATE_VECTOR_FOR_TYPE(size_t)
+INSTANTIATE_VECTOR_FOR_TYPE(int)
+INSTANTIATE_VECTOR_FOR_TYPE(int64_t)
+
+};  // namespace framework
+}  // namespace paddle