Merge branch 'official_develop' into support_int64

PaddlePaddle · Dec 8, 2022 · commit 644ff23
2 parents: 77492b6 + 33fa268

Showing 1,170 changed files with 19,615 additions and 27,693 deletions.
diff --git a/.flake8 b/.flake8
@@ -17,13 +17,13 @@ exclude =
 ignore =
     # E, see https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes
     E203,
-    E401,E402,
+    E402,
     E501,
     E721,E722,E731,E741,
 
     # F, see https://flake8.pycqa.org/en/latest/user/error-codes.html
     F405,
-    F811,F841,
+    F841,
 
     # W, see https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes
     W503

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake
@@ -17,7 +17,7 @@ include(ExternalProject)
 set(CUTLASS_PREFIX_DIR ${THIRD_PARTY_PATH}/cutlass)
 
 set(CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git)
-set(CUTLASS_TAG v2.9.1)
+set(CUTLASS_TAG v2.10.0)
 
 include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/")
 include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/include/")

diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake
@@ -23,14 +23,14 @@ set(DGC_INCLUDE_DIR
 set(DGC_LIBRARIES
     "${DGC_INSTALL_DIR}/lib/libdgc.a"
     CACHE FILEPATH "dgc library." FORCE)
-set(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz")
+set(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_7369ff.tgz")
 include_directories(${DGC_INCLUDE_DIR})
 
 ExternalProject_Add(
   extern_dgc
   ${EXTERNAL_PROJECT_LOG_ARGS}
   URL ${DGC_URL}
-  URL_MD5 "94e6fa1bc97169d0e1aad44570fe3251"
+  URL_MD5 "ede459281a0f979da8d84f81287369ff"
   PREFIX "${DGC_PREFIX_DIR}"
   CONFIGURE_COMMAND ""
   BUILD_COMMAND make -j${NPROC}

diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
@@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
 if(NOT DEFINED XPU_BASE_URL)
   set(XPU_BASE_URL_WITHOUT_DATE
       "https://baidu-kunlun-product.su.bcebos.com/KL-SDK/klsdk-dev")
-  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221124")
+  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221201")
 else()
   set(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()

diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc
@@ -17,10 +17,16 @@
 #include "paddle/phi/backends/device_manager.h"
 
 DECLARE_bool(use_stream_safe_cuda_allocator);
+DECLARE_string(allocator_strategy);
 
 namespace paddle {
 namespace distributed {
 
+static bool IsStreamSafeAllocator() {
+  return FLAGS_allocator_strategy == "auto_growth" &&
+         FLAGS_use_stream_safe_cuda_allocator;
+}
+
 static Backend TransToBackend(platform::Place place) {
   static const std::map<phi::AllocationType, Backend> type_backend = {
       {phi::AllocationType::GPU, Backend::GPU},
@@ -399,14 +405,14 @@ void EagerGroup::ConcatTensors(const platform::Place &place) {
   }
 }
 
-void EagerGroup::SplitTensorsDev(const platform::DeviceContext &context) {
+void EagerGroup::SplitTensors(const platform::DeviceContext &context) {
   auto place = context.GetPlace();
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     auto &gpu_context = static_cast<const phi::GPUContext &>(context);
     SplitTensorsWithType(
         gpu_context, &dense_contents_, &dense_tensors_, dtype_);
-    if (FLAGS_use_stream_safe_cuda_allocator) {
+    if (IsStreamSafeAllocator()) {
       auto dense_tensor =
           std::dynamic_pointer_cast<phi::DenseTensor>(dense_contents_.impl());
       VLOG(3) << "Free dense_contents_ " << dense_contents_.numel();
@@ -1011,12 +1017,11 @@ void EagerReducer::FinalizeBackward() {
   for (auto &group : groups_) {
     if (!group.is_sparse_) {
       group.task->Synchronize();
-    }
-  }
-
-  for (auto &group : groups_) {
-    if (!group.is_sparse_) {
-      group.dense_contents_.reset();
+      if (!IsStreamSafeAllocator()) {
+        auto *default_ctx =
+            platform::DeviceContextPool::Instance().Get(inner_place_);
+        group.SplitTensors(*default_ctx);
+      }
     }
   }
 
@@ -1054,9 +1059,15 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
   group->task = process_group_->AllReduce(in_out, in_out, opts);
 
   auto *context = process_group_->GetDeviceContext(inner_place_);
-  group->SplitTensorsDev(*context);
-  group->task->UpdateWaitChain(*context);
-  // split in FinalizeBackward()
+
+  if (IsStreamSafeAllocator()) {
+    // NOTE(shenliang03): The best_fit allocator strategy is not multi-stream
+    // safe. The Split operator allocates additional memory for its
+    // computation, and if it runs asynchronously, an illegal memory access
+    // may occur.
+    group->SplitTensors(*context);
+    group->task->UpdateWaitChain(*context);
+  }
 }
 
 void EagerReducer::AllReduceSparse(EagerGroup *group,

diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h
@@ -75,7 +75,7 @@ class EagerGroup {
 
   // context is used to select the stream for split
 
-  void SplitTensorsDev(const platform::DeviceContext &);
+  void SplitTensors(const platform::DeviceContext &);
 
   friend std::ostream &operator<<(std::ostream &, const EagerGroup &);
 };

diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc
@@ -217,18 +217,20 @@ RunCustomOpNode::operator()(
   VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size();
   for (size_t i = 0; i < OutputMeta().size(); i++) {
     if (map[0][0].find(i) != map[0][0].end()) {
+      int grad_output_idx = map[0][0][i];
       VLOG(7) << "Insert grad outputs: " << i
-              << " with size: " << OutputMeta()[i].size()
-              << " to tmp_outputs: " << map[0][0][i];
-      for (size_t j = 0; j < OutputMeta()[i].size(); j++) {
-        outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */
-                             std::make_shared<phi::DenseTensor>(
-                                 phi::DataType::UNDEFINED),
-                             egr::Controller::Instance().GenerateUniqueName(
-                                 "custom_tmp_grad"));
-        egr::EagerUtils::autograd_meta(&(outs[i][j]));
+              << " with size: " << OutputMeta()[grad_output_idx].size()
+              << " to tmp_outputs: " << grad_output_idx;
+      for (size_t j = 0; j < OutputMeta()[grad_output_idx].size(); j++) {
+        outs[grad_output_idx]
+            .emplace_back(/* init it incase of copy nullptr of shared_ptr */
+                          std::make_shared<phi::DenseTensor>(
+                              phi::DataType::UNDEFINED),
+                          egr::Controller::Instance().GenerateUniqueName(
+                              "custom_tmp_grad"));
+        egr::EagerUtils::autograd_meta(&(outs[grad_output_idx][j]));
       }
-      tmp_outs[map[0][0][i]] = outs[i];
+      tmp_outs[grad_output_idx] = outs[grad_output_idx];
     }
   }
   for (size_t i = 0; i < tmp_outs.size(); i++) {

diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
@@ -92,8 +92,7 @@ if(WITH_GPU)
          memory
          dynload_cuda
          variable_visitor
-         place
-         device_memory_aligment)
+         place)
   nv_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -105,7 +104,6 @@ if(WITH_GPU)
          dynload_cuda
          variable_visitor
          place
-         device_memory_aligment
          all_reduce_op_handle
          fused_all_reduce_op_handle)
 
@@ -170,8 +168,7 @@ elseif(WITH_ROCM)
          memory
          dynload_cuda
          variable_visitor
-         place
-         device_memory_aligment)
+         place)
   hip_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -183,7 +180,6 @@ elseif(WITH_ROCM)
          dynload_cuda
          variable_visitor
          place
-         device_memory_aligment
          all_reduce_op_handle
          fused_all_reduce_op_handle)
 
@@ -233,8 +229,7 @@ else()
          ddim
          memory
          variable_visitor
-         place
-         device_memory_aligment)
+         place)
   cc_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -245,7 +240,6 @@ else()
          memory
          variable_visitor
          place
-         device_memory_aligment
          all_reduce_op_handle
          fused_all_reduce_op_handle)
   if(WITH_DISTRIBUTE)

diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -16,9 +16,9 @@
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 
 DEFINE_bool(skip_fused_all_reduce_check, false, "");
 DECLARE_bool(allreduce_record_one_event);
@@ -247,7 +247,7 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
     for (size_t k = 1; k < g_tensor.size(); ++k) {
       const void *cur_address = g_tensor.at(k - 1).second->data();
       int64_t len = g_tensor.at(k - 1).second->numel();
-      auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
+      auto offset = phi::Alignment(len * size_of_dtype, places_[0]);
       void *infer_next_address = reinterpret_cast<void *>(
           reinterpret_cast<uintptr_t>(cur_address) + offset);
       const void *next_address = g_tensor.at(k).second->data();
@@ -400,8 +400,7 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
             "The size of grad tensors of fused_all_reduce_op_handle  "
             "must be > 0, but got %d.",
             len));
-    *numel +=
-        platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
+    *numel += phi::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
 }
 

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -104,6 +104,7 @@ pass_library(delete_c_identity_op_pass inference)
 pass_library(preln_residual_bias_fuse_pass inference)
 pass_library(delete_fill_constant_op_pass inference)
 pass_library(constant_folding_pass inference)
+pass_library(float_to_half_pass inference)
 pass_library(conv2d_fusion_layout_transfer_pass inference)
 pass_library(simplify_with_basic_ops_pass base)
 pass_library(fc_elementwise_layernorm_fuse_pass base)
@@ -135,10 +136,11 @@ if(WITH_TENSORRT)
   pass_library(remove_padding_recover_padding_pass inference)
   pass_library(delete_remove_padding_recover_padding_pass inference)
   pass_library(layernorm_shift_partition_fuse_pass inference)
+  pass_library(reverse_roll_fuse_pass inference)
   pass_library(preln_layernorm_x_fuse_pass inference)
 endif()
 
-if(WITH_TENSORRT AND NOT WIN32)
+if(WITH_TENSORRT)
   pass_library(trt_embedding_eltwise_layernorm_fuse_pass inference)
   pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference)
 endif()