Skip to content

Commit

Permalink
merge develop
Browse files Browse the repository at this point in the history
  • Loading branch information
HydrogenSulfate committed Dec 14, 2022
2 parents 91d04f2 + f6915d4 commit 060b3a2
Show file tree
Hide file tree
Showing 268 changed files with 4,024 additions and 7,954 deletions.
5 changes: 5 additions & 0 deletions .github/ISSUE_TEMPLATE/3_build-installation-issue.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,16 @@ body:
Paddle With CUDA:
OS:
GCC version:
Clang version:
CMake version:
Libc version:
Python version:
CUDA version:
cuDNN version:
Nvidia driver version:
Nvidia driver List:
****************************************
validations:
required: true
Expand Down
2 changes: 2 additions & 0 deletions cmake/cudnn.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@ list(
${CUDNN_ROOT}
${CUDNN_ROOT}/lib64
${CUDNN_ROOT}/lib
${CUDNN_ROOT}/lib/x64
${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
$ENV{CUDNN_ROOT}
$ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib
$ENV{CUDNN_ROOT}/lib/x64
/usr/lib
${CUDA_TOOLKIT_ROOT_DIR}
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
Expand Down
3 changes: 2 additions & 1 deletion cmake/flags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,8 @@ if(APPLE)
-Werror=uninitialized
-Werror=tautological-constant-out-of-range-compare
-Werror=literal-conversion
-Werror=pragma-pack)
-Werror=pragma-pack
-Werror=c++17-extensions)
endif()

if(WITH_HETERPS AND WITH_PSLIB)
Expand Down
5 changes: 3 additions & 2 deletions cmake/third_party.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -317,8 +317,9 @@ if(WITH_ONNXRUNTIME)
endif()

if(WITH_GPU)
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0 OR ${CMAKE_CUDA_COMPILER_VERSION}
GREATER_EQUAL 11.6)
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0
OR (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.6
AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.8))
include(external/cub) # download cub
list(APPEND third_party_deps extern_cub)
endif()
Expand Down
25 changes: 24 additions & 1 deletion paddle/fluid/distributed/collective/ProcessGroupBKCL.cc
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Collective(
const auto& place = in_tensor.place();
const auto& key = GetKeyFromPlace(place);

if (!calc_event_) {
if (!calc_event_ ||
(place_to_comm_ctx_.find(key) == place_to_comm_ctx_.end())) {
CreateBKCLEnvCache(place, key);
}

Expand All @@ -170,6 +171,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Collective(
fn(out_tensor, in_tensor, comm_ctx->bkcl_context(), bkcl_stream);

if (!use_calc_stream) {
PADDLE_ENFORCE_NOT_NULL(
comm_ctx.get(), platform::errors::Fatal("comm context is nullptr."));
task->comm_event_->Record(*comm_ctx.get());
}

Expand Down Expand Up @@ -369,6 +372,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllReduce(
1,
platform::errors::InvalidArgument(
"BKCL only support single tensor collective communication."));
PADDLE_ENFORCE_EQ(
CheckTensorsInXPUPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
return Collective(
&out_tensors[0],
in_tensors[0],
Expand Down Expand Up @@ -406,6 +413,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllReduce(
1,
platform::errors::InvalidArgument(
"BKCL only support single tensor collective communication."));
PADDLE_ENFORCE_EQ(
CheckTensorsInXPUPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
return Collective(
&out_tensors[0],
in_tensors[0],
Expand Down Expand Up @@ -442,6 +453,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
1,
platform::errors::InvalidArgument(
"BKCL only support single tensor collective communication."));
PADDLE_ENFORCE_EQ(
CheckTensorsInXPUPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in XPUPlace."));

return Collective(
&out_tensors[0],
Expand Down Expand Up @@ -481,6 +496,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
1,
platform::errors::InvalidArgument(
"BKCL only support single tensor collective communication."));
PADDLE_ENFORCE_EQ(
CheckTensorsInXPUPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in XPUPlace."));

return Collective(
&out_tensors[0],
Expand Down Expand Up @@ -518,6 +537,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllGather(
1,
platform::errors::InvalidArgument(
"BKCL only support single tensor collective communication."));
PADDLE_ENFORCE_EQ(
CheckTensorsInXPUPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInXPUPlace(out_tensors),
true,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -572,15 +572,17 @@ fused_attention_dygraph_function(
egr::EagerUtils::CheckAndRetainGrad(SoftmaxOut);
grad_node->SetGradOutMeta(SoftmaxOut, 19);

auto AttnDropoutOut_accumulation_node =
std::make_shared<egr::GradNodeAccumulation>(
p_autograd_AttnDropoutOut);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_AttnDropoutOut, 0);
egr::EagerUtils::SetHistory(p_autograd_AttnDropoutOut,
AttnDropoutOut_accumulation_node);
AttnDropoutOut_accumulation_node->SetGradInMeta(AttnDropoutOut, 0);
egr::EagerUtils::CheckAndRetainGrad(AttnDropoutOut);
grad_node->SetGradOutMeta(AttnDropoutOut, 20);
if (AttnDropoutOut.initialized()) {
auto AttnDropoutOut_accumulation_node =
std::make_shared<egr::GradNodeAccumulation>(
p_autograd_AttnDropoutOut);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_AttnDropoutOut, 0);
egr::EagerUtils::SetHistory(p_autograd_AttnDropoutOut,
AttnDropoutOut_accumulation_node);
AttnDropoutOut_accumulation_node->SetGradInMeta(AttnDropoutOut, 0);
egr::EagerUtils::CheckAndRetainGrad(AttnDropoutOut);
grad_node->SetGradOutMeta(AttnDropoutOut, 20);
}

auto FMHAOut_accumulation_node =
std::make_shared<egr::GradNodeAccumulation>(p_autograd_FMHAOut);
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ class fused_attentionGradNodeCompat : public egr::GradNodeBase {
SoftmaxOut_ = egr::TensorWrapper(SoftmaxOut, false);
}
void SetTensorWrapperSrcMask(const paddle::experimental::Tensor& SrcMask) {
SrcMask_ = egr::TensorWrapper(SrcMask, false);
SrcMask_ = egr::TensorWrapper(SrcMask, true);
}
void SetTensorWrapperSrcMaskOut(
const paddle::experimental::Tensor& SrcMaskOut) {
Expand Down
5 changes: 2 additions & 3 deletions paddle/fluid/framework/details/nan_inf_utils_detail.cu
Original file line number Diff line number Diff line change
Expand Up @@ -428,13 +428,12 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
phi::DenseTensor block_num_nan_inf;
block_num_nan_inf.Resize({static_cast<int64_t>(2 * numel_max_min)});
int64_t* block_num_nan_ptr =
block_num_nan_inf.mutable_data<int64_t>(tensor.place());
dev_ctx->template Alloc<int64_t>(&block_num_nan_inf);
int64_t* block_num_inf_ptr = block_num_nan_ptr + numel_max_min;

phi::DenseTensor tensor_block_max_min;
tensor_block_max_min.Resize({static_cast<int64_t>(3 * numel_max_min)});
MT* tensor_block_max_ptr =
tensor_block_max_min.mutable_data<MT>(tensor.place());
MT* tensor_block_max_ptr = dev_ctx->template Alloc<MT>(&tensor_block_max_min);
MT* tensor_block_min_ptr = tensor_block_max_ptr + numel_max_min;
MT* tensor_block_mean_ptr = tensor_block_max_ptr + 2 * numel_max_min;

Expand Down
19 changes: 1 addition & 18 deletions paddle/fluid/framework/ir/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ pass_library(delete_c_identity_op_pass inference)
pass_library(preln_residual_bias_fuse_pass inference)
pass_library(delete_fill_constant_op_pass inference)
pass_library(constant_folding_pass inference)
pass_library(float_to_half_pass inference)
pass_library(auto_mixed_precision_pass inference)
pass_library(conv2d_fusion_layout_transfer_pass inference)
pass_library(simplify_with_basic_ops_pass base)
pass_library(fc_elementwise_layernorm_fuse_pass base)
Expand Down Expand Up @@ -151,19 +151,6 @@ endif()

if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base DEPS placement_pass_base DIR mkldnn)
pass_library(
mkldnn_inplace_pass
inference
DEPS
mkldnn_placement_pass
op_registry
elementwise_add_op
generated_op
activation_op
softmax_op
softmax
DIR
mkldnn)
pass_library(depthwise_conv_mkldnn_pass base DIR mkldnn)
pass_library(conv_affine_channel_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(conv_bias_mkldnn_fuse_pass inference DIR mkldnn)
Expand Down Expand Up @@ -450,10 +437,6 @@ if(WITH_MKLDNN)
test_mkldnn_placement_pass
SRCS mkldnn/mkldnn_placement_pass_tester.cc
DEPS mkldnn_placement_pass)
cc_test(
test_mkldnn_inplace_pass
SRCS mkldnn/mkldnn_inplace_pass_tester.cc
DEPS mkldnn_inplace_pass)
cc_test(
test_compute_propagate_scales_mkldnn_pass
SRCS mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc
Expand Down
Loading

0 comments on commit 060b3a2

Please sign in to comment.