Skip to content

Commit

Permalink
Merge branch 'official_develop' into support_int64
Browse files Browse the repository at this point in the history
  • Loading branch information
zhoutianzi666 committed Dec 8, 2022
2 parents 77492b6 + 33fa268 commit 644ff23
Show file tree
Hide file tree
Showing 1,170 changed files with 19,615 additions and 27,693 deletions.
4 changes: 2 additions & 2 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ exclude =
ignore =
# E, see https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes
E203,
E401,E402,
E402,
E501,
E721,E722,E731,E741,

# F, see https://flake8.pycqa.org/en/latest/user/error-codes.html
F405,
F811,F841,
F841,

# W, see https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes
W503
Expand Down
Empty file modified .pre-commit-config.yaml
100755 → 100644
Empty file.
2 changes: 1 addition & 1 deletion cmake/external/cutlass.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ include(ExternalProject)
set(CUTLASS_PREFIX_DIR ${THIRD_PARTY_PATH}/cutlass)

set(CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git)
set(CUTLASS_TAG v2.9.1)
set(CUTLASS_TAG v2.10.0)

include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/")
include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/include/")
Expand Down
4 changes: 2 additions & 2 deletions cmake/external/dgc.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@ set(DGC_INCLUDE_DIR
set(DGC_LIBRARIES
"${DGC_INSTALL_DIR}/lib/libdgc.a"
CACHE FILEPATH "dgc library." FORCE)
set(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz")
set(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_7369ff.tgz")
include_directories(${DGC_INCLUDE_DIR})

ExternalProject_Add(
extern_dgc
${EXTERNAL_PROJECT_LOG_ARGS}
URL ${DGC_URL}
URL_MD5 "94e6fa1bc97169d0e1aad44570fe3251"
URL_MD5 "ede459281a0f979da8d84f81287369ff"
PREFIX "${DGC_PREFIX_DIR}"
CONFIGURE_COMMAND ""
BUILD_COMMAND make -j${NPROC}
Expand Down
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
if(NOT DEFINED XPU_BASE_URL)
set(XPU_BASE_URL_WITHOUT_DATE
"https://baidu-kunlun-product.su.bcebos.com/KL-SDK/klsdk-dev")
set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221124")
set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221201")
else()
set(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
Expand Down
33 changes: 22 additions & 11 deletions paddle/fluid/distributed/collective/reducer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,16 @@
#include "paddle/phi/backends/device_manager.h"

DECLARE_bool(use_stream_safe_cuda_allocator);
DECLARE_string(allocator_strategy);

namespace paddle {
namespace distributed {

// Reports whether the stream-safe allocator path is in effect: true only when
// FLAGS_allocator_strategy is "auto_growth" and
// FLAGS_use_stream_safe_cuda_allocator is enabled.
static bool IsStreamSafeAllocator() {
  // Any strategy other than auto_growth rules out the stream-safe path.
  if (FLAGS_allocator_strategy != "auto_growth") {
    return false;
  }
  return FLAGS_use_stream_safe_cuda_allocator;
}

static Backend TransToBackend(platform::Place place) {
static const std::map<phi::AllocationType, Backend> type_backend = {
{phi::AllocationType::GPU, Backend::GPU},
Expand Down Expand Up @@ -399,14 +405,14 @@ void EagerGroup::ConcatTensors(const platform::Place &place) {
}
}

void EagerGroup::SplitTensorsDev(const platform::DeviceContext &context) {
void EagerGroup::SplitTensors(const platform::DeviceContext &context) {
auto place = context.GetPlace();
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto &gpu_context = static_cast<const phi::GPUContext &>(context);
SplitTensorsWithType(
gpu_context, &dense_contents_, &dense_tensors_, dtype_);
if (FLAGS_use_stream_safe_cuda_allocator) {
if (IsStreamSafeAllocator()) {
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(dense_contents_.impl());
VLOG(3) << "Free dense_contents_ " << dense_contents_.numel();
Expand Down Expand Up @@ -1011,12 +1017,11 @@ void EagerReducer::FinalizeBackward() {
for (auto &group : groups_) {
if (!group.is_sparse_) {
group.task->Synchronize();
}
}

for (auto &group : groups_) {
if (!group.is_sparse_) {
group.dense_contents_.reset();
if (!IsStreamSafeAllocator()) {
auto *default_ctx =
platform::DeviceContextPool::Instance().Get(inner_place_);
group.SplitTensors(*default_ctx);
}
}
}

Expand Down Expand Up @@ -1054,9 +1059,15 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
group->task = process_group_->AllReduce(in_out, in_out, opts);

auto *context = process_group_->GetDeviceContext(inner_place_);
group->SplitTensorsDev(*context);
group->task->UpdateWaitChain(*context);
// split in FinalizeBackward()

if (IsStreamSafeAllocator()) {
// NOTE(shenliang03): The best_fit allocator strategy is not multi-stream
// safe. The Split operator allocates additional memory for its computation,
// and if that runs asynchronously, an illegal memory access may be
// encountered.
group->SplitTensors(*context);
group->task->UpdateWaitChain(*context);
}
}

void EagerReducer::AllReduceSparse(EagerGroup *group,
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/distributed/collective/reducer.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ class EagerGroup {

// context is used to select the stream for split

void SplitTensorsDev(const platform::DeviceContext &);
void SplitTensors(const platform::DeviceContext &);

friend std::ostream &operator<<(std::ostream &, const EagerGroup &);
};
Expand Down
22 changes: 12 additions & 10 deletions paddle/fluid/eager/custom_operator/custom_operator_node.cc
Original file line number Diff line number Diff line change
Expand Up @@ -217,18 +217,20 @@ RunCustomOpNode::operator()(
VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size();
for (size_t i = 0; i < OutputMeta().size(); i++) {
if (map[0][0].find(i) != map[0][0].end()) {
int grad_output_idx = map[0][0][i];
VLOG(7) << "Insert grad outputs: " << i
<< " with size: " << OutputMeta()[i].size()
<< " to tmp_outputs: " << map[0][0][i];
for (size_t j = 0; j < OutputMeta()[i].size(); j++) {
outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */
std::make_shared<phi::DenseTensor>(
phi::DataType::UNDEFINED),
egr::Controller::Instance().GenerateUniqueName(
"custom_tmp_grad"));
egr::EagerUtils::autograd_meta(&(outs[i][j]));
<< " with size: " << OutputMeta()[grad_output_idx].size()
<< " to tmp_outputs: " << grad_output_idx;
for (size_t j = 0; j < OutputMeta()[grad_output_idx].size(); j++) {
outs[grad_output_idx]
.emplace_back(/* init it incase of copy nullptr of shared_ptr */
std::make_shared<phi::DenseTensor>(
phi::DataType::UNDEFINED),
egr::Controller::Instance().GenerateUniqueName(
"custom_tmp_grad"));
egr::EagerUtils::autograd_meta(&(outs[grad_output_idx][j]));
}
tmp_outs[map[0][0][i]] = outs[i];
tmp_outs[grad_output_idx] = outs[grad_output_idx];
}
}
for (size_t i = 0; i < tmp_outs.size(); i++) {
Expand Down
12 changes: 3 additions & 9 deletions paddle/fluid/framework/details/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,7 @@ if(WITH_GPU)
memory
dynload_cuda
variable_visitor
place
device_memory_aligment)
place)
nv_library(
grad_merge_all_reduce_op_handle
SRCS grad_merge_all_reduce_op_handle.cc
Expand All @@ -105,7 +104,6 @@ if(WITH_GPU)
dynload_cuda
variable_visitor
place
device_memory_aligment
all_reduce_op_handle
fused_all_reduce_op_handle)

Expand Down Expand Up @@ -170,8 +168,7 @@ elseif(WITH_ROCM)
memory
dynload_cuda
variable_visitor
place
device_memory_aligment)
place)
hip_library(
grad_merge_all_reduce_op_handle
SRCS grad_merge_all_reduce_op_handle.cc
Expand All @@ -183,7 +180,6 @@ elseif(WITH_ROCM)
dynload_cuda
variable_visitor
place
device_memory_aligment
all_reduce_op_handle
fused_all_reduce_op_handle)

Expand Down Expand Up @@ -233,8 +229,7 @@ else()
ddim
memory
variable_visitor
place
device_memory_aligment)
place)
cc_library(
grad_merge_all_reduce_op_handle
SRCS grad_merge_all_reduce_op_handle.cc
Expand All @@ -245,7 +240,6 @@ else()
memory
variable_visitor
place
device_memory_aligment
all_reduce_op_handle
fused_all_reduce_op_handle)
if(WITH_DISTRIBUTE)
Expand Down
7 changes: 3 additions & 4 deletions paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/device_memory_aligment.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/backends/device_memory_aligment.h"

DEFINE_bool(skip_fused_all_reduce_check, false, "");
DECLARE_bool(allreduce_record_one_event);
Expand Down Expand Up @@ -247,7 +247,7 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
for (size_t k = 1; k < g_tensor.size(); ++k) {
const void *cur_address = g_tensor.at(k - 1).second->data();
int64_t len = g_tensor.at(k - 1).second->numel();
auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
auto offset = phi::Alignment(len * size_of_dtype, places_[0]);
void *infer_next_address = reinterpret_cast<void *>(
reinterpret_cast<uintptr_t>(cur_address) + offset);
const void *next_address = g_tensor.at(k).second->data();
Expand Down Expand Up @@ -400,8 +400,7 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
"The size of grad tensors of fused_all_reduce_op_handle "
"must be > 0, but got %d.",
len));
*numel +=
platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
*numel += phi::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
}
}

Expand Down
4 changes: 3 additions & 1 deletion paddle/fluid/framework/ir/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ pass_library(delete_c_identity_op_pass inference)
pass_library(preln_residual_bias_fuse_pass inference)
pass_library(delete_fill_constant_op_pass inference)
pass_library(constant_folding_pass inference)
pass_library(float_to_half_pass inference)
pass_library(conv2d_fusion_layout_transfer_pass inference)
pass_library(simplify_with_basic_ops_pass base)
pass_library(fc_elementwise_layernorm_fuse_pass base)
Expand Down Expand Up @@ -135,10 +136,11 @@ if(WITH_TENSORRT)
pass_library(remove_padding_recover_padding_pass inference)
pass_library(delete_remove_padding_recover_padding_pass inference)
pass_library(layernorm_shift_partition_fuse_pass inference)
pass_library(reverse_roll_fuse_pass inference)
pass_library(preln_layernorm_x_fuse_pass inference)
endif()

if(WITH_TENSORRT AND NOT WIN32)
if(WITH_TENSORRT)
pass_library(trt_embedding_eltwise_layernorm_fuse_pass inference)
pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference)
endif()
Expand Down
Loading

0 comments on commit 644ff23

Please sign in to comment.