merge develop

PaddlePaddle · Dec 14, 2022 · 060b3a2 · 060b3a2
2 parents 91d04f2 + f6915d4
commit 060b3a2
Show file tree

Hide file tree

Showing 268 changed files with 4,024 additions and 7,954 deletions.
diff --git a/.github/ISSUE_TEMPLATE/3_build-installation-issue.yml b/.github/ISSUE_TEMPLATE/3_build-installation-issue.yml
@@ -49,11 +49,16 @@ body:
       Paddle With CUDA:
 
       OS:
+      GCC version:
+      Clang version:
+      CMake version:
+      Libc version:
       Python version:
 
       CUDA version:
       cuDNN version:
       Nvidia driver version:
+      Nvidia driver List:
       ****************************************
   validations:
     required: true

diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
@@ -29,11 +29,13 @@ list(
   ${CUDNN_ROOT}
   ${CUDNN_ROOT}/lib64
   ${CUDNN_ROOT}/lib
+  ${CUDNN_ROOT}/lib/x64
   ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
   ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
   $ENV{CUDNN_ROOT}
   $ENV{CUDNN_ROOT}/lib64
   $ENV{CUDNN_ROOT}/lib
+  $ENV{CUDNN_ROOT}/lib/x64
   /usr/lib
   ${CUDA_TOOLKIT_ROOT_DIR}
   ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)

diff --git a/cmake/flags.cmake b/cmake/flags.cmake
@@ -220,7 +220,8 @@ if(APPLE)
       -Werror=uninitialized
       -Werror=tautological-constant-out-of-range-compare
       -Werror=literal-conversion
-      -Werror=pragma-pack)
+      -Werror=pragma-pack
+      -Werror=c++17-extensions)
 endif()
 
 if(WITH_HETERPS AND WITH_PSLIB)

diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
@@ -317,8 +317,9 @@ if(WITH_ONNXRUNTIME)
 endif()
 
 if(WITH_GPU)
-  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0 OR ${CMAKE_CUDA_COMPILER_VERSION}
-                                                 GREATER_EQUAL 11.6)
+  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0
+     OR (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.6
+         AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.8))
     include(external/cub) # download cub
     list(APPEND third_party_deps extern_cub)
   endif()

diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc
@@ -154,7 +154,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Collective(
   const auto& place = in_tensor.place();
   const auto& key = GetKeyFromPlace(place);
 
-  if (!calc_event_) {
+  if (!calc_event_ ||
+      (place_to_comm_ctx_.find(key) == place_to_comm_ctx_.end())) {
     CreateBKCLEnvCache(place, key);
   }
 
@@ -170,6 +171,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Collective(
   fn(out_tensor, in_tensor, comm_ctx->bkcl_context(), bkcl_stream);
 
   if (!use_calc_stream) {
+    PADDLE_ENFORCE_NOT_NULL(
+        comm_ctx.get(), platform::errors::Fatal("comm context is nullptr."));
     task->comm_event_->Record(*comm_ctx.get());
   }
 
@@ -369,6 +372,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllReduce(
       1,
       platform::errors::InvalidArgument(
           "BKCL only support single tensor collective communication."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInXPUPlace(in_tensors),
+      true,
+      platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
   return Collective(
       &out_tensors[0],
       in_tensors[0],
@@ -406,6 +413,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllReduce(
       1,
       platform::errors::InvalidArgument(
           "BKCL only support single tensor collective communication."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInXPUPlace(in_tensors),
+      true,
+      platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
   return Collective(
       &out_tensors[0],
       in_tensors[0],
@@ -442,6 +453,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
       1,
       platform::errors::InvalidArgument(
           "BKCL only support single tensor collective communication."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInXPUPlace(in_tensors),
+      true,
+      platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
 
   return Collective(
       &out_tensors[0],
@@ -481,6 +496,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
       1,
       platform::errors::InvalidArgument(
           "BKCL only support single tensor collective communication."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInXPUPlace(in_tensors),
+      true,
+      platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
 
   return Collective(
       &out_tensors[0],
@@ -518,6 +537,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllGather(
       1,
       platform::errors::InvalidArgument(
           "BKCL only support single tensor collective communication."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInXPUPlace(in_tensors),
+      true,
+      platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
   PADDLE_ENFORCE_EQ(
       CheckTensorsInXPUPlace(out_tensors),
       true,

diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc
@@ -572,15 +572,17 @@ fused_attention_dygraph_function(
       egr::EagerUtils::CheckAndRetainGrad(SoftmaxOut);
       grad_node->SetGradOutMeta(SoftmaxOut, 19);
 
-      auto AttnDropoutOut_accumulation_node =
-          std::make_shared<egr::GradNodeAccumulation>(
-              p_autograd_AttnDropoutOut);
-      egr::EagerUtils::SetOutRankWithSlot(p_autograd_AttnDropoutOut, 0);
-      egr::EagerUtils::SetHistory(p_autograd_AttnDropoutOut,
-                                  AttnDropoutOut_accumulation_node);
-      AttnDropoutOut_accumulation_node->SetGradInMeta(AttnDropoutOut, 0);
-      egr::EagerUtils::CheckAndRetainGrad(AttnDropoutOut);
-      grad_node->SetGradOutMeta(AttnDropoutOut, 20);
+      if (AttnDropoutOut.initialized()) {
+        auto AttnDropoutOut_accumulation_node =
+            std::make_shared<egr::GradNodeAccumulation>(
+                p_autograd_AttnDropoutOut);
+        egr::EagerUtils::SetOutRankWithSlot(p_autograd_AttnDropoutOut, 0);
+        egr::EagerUtils::SetHistory(p_autograd_AttnDropoutOut,
+                                    AttnDropoutOut_accumulation_node);
+        AttnDropoutOut_accumulation_node->SetGradInMeta(AttnDropoutOut, 0);
+        egr::EagerUtils::CheckAndRetainGrad(AttnDropoutOut);
+        grad_node->SetGradOutMeta(AttnDropoutOut, 20);
+      }
 
       auto FMHAOut_accumulation_node =
           std::make_shared<egr::GradNodeAccumulation>(p_autograd_FMHAOut);

diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h
@@ -476,7 +476,7 @@ class fused_attentionGradNodeCompat : public egr::GradNodeBase {
     SoftmaxOut_ = egr::TensorWrapper(SoftmaxOut, false);
   }
   void SetTensorWrapperSrcMask(const paddle::experimental::Tensor& SrcMask) {
-    SrcMask_ = egr::TensorWrapper(SrcMask, false);
+    SrcMask_ = egr::TensorWrapper(SrcMask, true);
   }
   void SetTensorWrapperSrcMaskOut(
       const paddle::experimental::Tensor& SrcMaskOut) {

diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
@@ -428,13 +428,12 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
   phi::DenseTensor block_num_nan_inf;
   block_num_nan_inf.Resize({static_cast<int64_t>(2 * numel_max_min)});
   int64_t* block_num_nan_ptr =
-      block_num_nan_inf.mutable_data<int64_t>(tensor.place());
+      dev_ctx->template Alloc<int64_t>(&block_num_nan_inf);
   int64_t* block_num_inf_ptr = block_num_nan_ptr + numel_max_min;
 
   phi::DenseTensor tensor_block_max_min;
   tensor_block_max_min.Resize({static_cast<int64_t>(3 * numel_max_min)});
-  MT* tensor_block_max_ptr =
-      tensor_block_max_min.mutable_data<MT>(tensor.place());
+  MT* tensor_block_max_ptr = dev_ctx->template Alloc<MT>(&tensor_block_max_min);
   MT* tensor_block_min_ptr = tensor_block_max_ptr + numel_max_min;
   MT* tensor_block_mean_ptr = tensor_block_max_ptr + 2 * numel_max_min;
 

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -103,7 +103,7 @@ pass_library(delete_c_identity_op_pass inference)
 pass_library(preln_residual_bias_fuse_pass inference)
 pass_library(delete_fill_constant_op_pass inference)
 pass_library(constant_folding_pass inference)
-pass_library(float_to_half_pass inference)
+pass_library(auto_mixed_precision_pass inference)
 pass_library(conv2d_fusion_layout_transfer_pass inference)
 pass_library(simplify_with_basic_ops_pass base)
 pass_library(fc_elementwise_layernorm_fuse_pass base)
@@ -151,19 +151,6 @@ endif()
 
 if(WITH_MKLDNN)
   pass_library(mkldnn_placement_pass base DEPS placement_pass_base DIR mkldnn)
-  pass_library(
-    mkldnn_inplace_pass
-    inference
-    DEPS
-    mkldnn_placement_pass
-    op_registry
-    elementwise_add_op
-    generated_op
-    activation_op
-    softmax_op
-    softmax
-    DIR
-    mkldnn)
   pass_library(depthwise_conv_mkldnn_pass base DIR mkldnn)
   pass_library(conv_affine_channel_mkldnn_fuse_pass inference DIR mkldnn)
   pass_library(conv_bias_mkldnn_fuse_pass inference DIR mkldnn)
@@ -450,10 +437,6 @@ if(WITH_MKLDNN)
     test_mkldnn_placement_pass
     SRCS mkldnn/mkldnn_placement_pass_tester.cc
     DEPS mkldnn_placement_pass)
-  cc_test(
-    test_mkldnn_inplace_pass
-    SRCS mkldnn/mkldnn_inplace_pass_tester.cc
-    DEPS mkldnn_inplace_pass)
   cc_test(
     test_compute_propagate_scales_mkldnn_pass
     SRCS mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc