Skip to content

Commit

Permalink
merge develop
Browse files Browse the repository at this point in the history
  • Loading branch information
zoooo0820 committed Dec 27, 2021
2 parents 25f03ac + 9cfdae9 commit 212d98a
Show file tree
Hide file tree
Showing 82 changed files with 1,291 additions and 882 deletions.
55 changes: 31 additions & 24 deletions cmake/pten_kernel.cmake
Expand Up @@ -18,7 +18,7 @@ function(kernel_declare TARGET_LIST)
file(READ ${kernel_path} kernel_impl)
# TODO(chenweihang): rename PT_REGISTER_CTX_KERNEL to PT_REGISTER_KERNEL
# NOTE(chenweihang): for now, using digits in kernel names is not recommended
string(REGEX MATCH "(PT_REGISTER_CTX_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z_]*," first_registry "${kernel_impl}")
string(REGEX MATCH "(PT_REGISTER_CTX_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}")
if (NOT first_registry STREQUAL "")
# parse the first kernel name
string(REPLACE "PT_REGISTER_CTX_KERNEL(" "" kernel_name "${first_registry}")
Expand All @@ -33,8 +33,6 @@ function(kernel_declare TARGET_LIST)
file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n")
elseif (${kernel_path} MATCHES "./xpu\/")
file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
elseif (${kernel_path} MATCHES "./npu\/*")
file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, NPU, ALL_LAYOUT);\n")
else ()
# deal with device independent kernel, now we use CPU temporarily
file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
Expand All @@ -48,7 +46,9 @@ function(kernel_library TARGET)
set(cpu_srcs)
set(gpu_srcs)
set(xpu_srcs)
set(npu_srcs)
# parse and save the deps kernel targets
set(all_srcs)
set(kernel_deps)

set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
Expand All @@ -57,7 +57,6 @@ function(kernel_library TARGET)

list(LENGTH kernel_library_SRCS kernel_library_SRCS_len)
# one kernel only match one impl file in each backend
# TODO(chenweihang): parse compile deps by include headers
if (${kernel_library_SRCS_len} EQUAL 0)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
Expand All @@ -75,57 +74,68 @@ function(kernel_library TARGET)
list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
endif()
endif()
if (WITH_ASCEND_CL)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/npu/${TARGET}.cc)
list(APPEND npu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/npu/${TARGET}.cc)
endif()
endif()
else()
# TODO(chenweihang): impl compile by source later
endif()

list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.h)
list(APPEND all_srcs ${common_srcs})
list(APPEND all_srcs ${cpu_srcs})
list(APPEND all_srcs ${gpu_srcs})
list(APPEND all_srcs ${xpu_srcs})
foreach(src ${all_srcs})
file(READ ${src} target_content)
string(REGEX MATCHALL "#include \"paddle\/pten\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
foreach(include_kernel ${include_kernels})
string(REGEX REPLACE "#include \"paddle\/pten\/kernels\/" "" kernel_name ${include_kernel})
string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
list(APPEND kernel_deps ${kernel_name})
endforeach()
endforeach()
list(REMOVE_DUPLICATES kernel_deps)
list(REMOVE_ITEM kernel_deps ${TARGET})

list(LENGTH common_srcs common_srcs_len)
list(LENGTH cpu_srcs cpu_srcs_len)
list(LENGTH gpu_srcs gpu_srcs_len)
list(LENGTH xpu_srcs xpu_srcs_len)
list(LENGTH npu_srcs npu_srcs_len)

if (${common_srcs_len} GREATER 0)
# If the kernel has a device independent public implementation,
# we will use this implementation and will not adopt the implementation
# under specific devices
if (WITH_GPU)
nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
else()
cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
else()
# If the kernel has a header file declaration, but no corresponding
# implementation can be found, this is not allowed
if (${cpu_srcs_len} EQUAL 0 AND ${gpu_srcs_len} EQUAL 0 AND
${xpu_srcs_len} EQUAL 0 AND ${npu_srcs_len} EQUAL 0)
${xpu_srcs_len} EQUAL 0)
message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
else()
if (WITH_GPU)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS})
nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
elseif (WITH_ROCM)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS})
hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
else()
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${npu_srcs_len} GREATER 0)
cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${npu_srcs} DEPS ${kernel_library_DEPS})
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
endif()
endif()
endif()

if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
${xpu_srcs_len} GREATER 0 OR ${npu_srcs_len} GREATER 0)
if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
# append target into PTEN_KERNELS property
get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
set(pten_kernels ${pten_kernels} ${TARGET})
Expand All @@ -147,9 +157,6 @@ function(kernel_library TARGET)
if (${xpu_srcs_len} GREATER 0)
kernel_declare(${xpu_srcs})
endif()
if (${npu_srcs_len} GREATER 0)
kernel_declare(${npu_srcs})
endif()
endfunction()

function(register_kernels)
Expand Down
9 changes: 5 additions & 4 deletions paddle/fluid/framework/CMakeLists.txt
Expand Up @@ -91,15 +91,16 @@ endif()
cc_test(copy_same_tensor_test SRCS copy_same_tensor_test.cc DEPS tensor)

cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
cc_library(mixed_vector SRCS mixed_vector.cc DEPS device_context)

if(WITH_GPU)
nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor)
elseif(WITH_ROCM)
hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor)
else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS mixed_vector place memory device_context tensor)
endif()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim mixed_vector place tensor framework_proto version)

cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)

Expand Down
22 changes: 7 additions & 15 deletions paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc
Expand Up @@ -67,12 +67,6 @@ FuseBatchNormActOneDNNPass::FuseBatchNormActOneDNNPass() {
.AddAttr("epsilon")
.IsNumGE(0.0f)
.IsNumLE(0.001f)
.End()
.AddAttr("trainable_statistics")
.IsBoolEQ(false)
.End()
.AddAttr("is_test")
.IsBoolEQ(true)
.End();

AddOpCompat(OpCompat("relu"))
Expand Down Expand Up @@ -114,21 +108,19 @@ void FuseBatchNormActOneDNNPass::FuseBatchNormAct(
GET_IR_NODE_FROM_SUBGRAPH(act, act, bn_act_pattern);

auto *bn_op = batch_norm->Op();
if (bn_op->HasAttr("use_mkldnn")) {
if (bn_op->HasAttr("trainable_statistics")) {
PADDLE_ENFORCE(
BOOST_GET_CONST(bool, bn_op->GetAttr("use_mkldnn")),
!BOOST_GET_CONST(bool, bn_op->GetAttr("trainable_statistics")),
platform::errors::PreconditionNotMet(
"The BatchNorm+Act fusion may happen only when oneDNN library "
"is used."));
"The BatchNorm+Act fusion may happen only when mean and variance "
"are not calculated by current batch statistics."));
}

auto *act_op = act->Op();
if (act_op->HasAttr("use_mkldnn")) {
if (bn_op->HasAttr("is_test")) {
PADDLE_ENFORCE(
BOOST_GET_CONST(bool, bn_op->GetAttr("use_mkldnn")),
BOOST_GET_CONST(bool, bn_op->GetAttr("is_test")),
platform::errors::PreconditionNotMet(
"The BatchNorm+Act fusion may happen only when oneDNN library "
"is used."));
"The BatchNorm+Act fusion may happen only during inference."));
}

bn_op->SetAttr("use_mkldnn", true);
Expand Down
24 changes: 12 additions & 12 deletions paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc
Expand Up @@ -65,9 +65,9 @@ TEST(FuseBatchNormActOneDNNPass, ThrowIsTestTrainableStats) {
// No fusion in this attribute configuration
constexpr int removed_nodes_count = 0;

EXPECT_TRUE(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
"act_y", removed_nodes_count));
EXPECT_TRUE(test::AssertOpsCount(graph, {{"batch_norm", 1}, {"relu", 1}}));
EXPECT_THROW(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
"act_y", removed_nodes_count),
paddle::platform::EnforceNotMet);
}

TEST(FuseBatchNormActOneDNNPass, FuseIsTest) {
Expand Down Expand Up @@ -123,9 +123,9 @@ TEST(FuseBatchNormActOneDNNPass, ThrowTrainableStats) {
// No fusion in this attribute configuration
constexpr int removed_nodes_count = 0;

EXPECT_TRUE(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
"act_y", removed_nodes_count));
EXPECT_TRUE(test::AssertOpsCount(graph, {{"batch_norm", 1}, {"relu", 1}}));
EXPECT_THROW(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
"act_y", removed_nodes_count),
paddle::platform::EnforceNotMet);
}

TEST(FuseBatchNormActOneDNNPass, AllAttrsFalse) {
Expand All @@ -149,9 +149,9 @@ TEST(FuseBatchNormActOneDNNPass, AllAttrsFalse) {
// No fusion in this attribute configuration
constexpr int removed_nodes_count = 0;

EXPECT_TRUE(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
"act_y", removed_nodes_count));
EXPECT_TRUE(test::AssertOpsCount(graph, {{"batch_norm", 1}, {"relu", 1}}));
EXPECT_THROW(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
"act_y", removed_nodes_count),
paddle::platform::EnforceNotMet);
}

TEST(FuseBatchNormActOneDNNPass, ThrowUseMkldnn) {
Expand All @@ -176,9 +176,9 @@ TEST(FuseBatchNormActOneDNNPass, ThrowUseMkldnn) {
// No fusion in this attribute configuration
constexpr int removed_nodes_count = 0;

EXPECT_TRUE(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
"act_y", removed_nodes_count));
EXPECT_TRUE(test::AssertOpsCount(graph, {{"batch_norm", 1}, {"relu", 1}}));
EXPECT_THROW(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
"act_y", removed_nodes_count),
paddle::platform::EnforceNotMet);
}

TEST(FuseBatchNormActOneDNNPass, pass_op_version_check) {
Expand Down
87 changes: 87 additions & 0 deletions paddle/fluid/framework/mixed_vector.cc
@@ -0,0 +1,87 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/mixed_vector.h"

#include <algorithm>
#include <initializer_list>
#include <memory>
#include <mutex> // NOLINT
#include <utility>
#include <vector>

#include "glog/logging.h"
#include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/utils/none.h"
#include "paddle/utils/optional.h"

namespace paddle {
namespace framework {

// Copies `*gpu_memory_size_` bytes from the allocation held by `*gpu_` into
// the storage of the host vector `*cpu_`, then calls Wait() on the device
// context that owns the stream used for the copy.
//
//   cpu_             - host vector whose data() buffer receives the bytes.
//                      NOTE(review): the vector is not resized here; callers
//                      presumably size it before calling - confirm.
//   gpu_             - source allocation; dereferenced unconditionally, so it
//                      must be non-null when device support is compiled in.
//   gpu_memory_size_ - number of bytes to transfer.
//
// The body is compiled only when PADDLE_WITH_CUDA or PADDLE_WITH_HIP is
// defined; otherwise the function does nothing.
template <typename T>
void CopyToCPUHelper(std::vector<T> *cpu_, paddle::memory::AllocationPtr *gpu_,
                     size_t *gpu_memory_size_) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  // Device -> host transfer.
  auto &device_alloc = *gpu_;

  // Look up the context registered for the place the allocation lives on.
  auto *gpu_ctx = static_cast<platform::CUDADeviceContext *>(
      platform::DeviceContextPool::Instance().Get(device_alloc->place()));

  void *device_src = device_alloc->ptr();
  void *host_dst = cpu_->data();

  paddle::memory::Copy(platform::CPUPlace(), host_dst,
                       OptionalCUDAPlace(device_alloc).get(), device_src,
                       *gpu_memory_size_, gpu_ctx->stream());

  // The copy was issued on the context's stream; wait on the context before
  // returning to the caller.
  gpu_ctx->Wait();
#endif
}

// Allocates a buffer on `place` large enough for every element of `*cpu_`
// and issues a copy of the host data into it on the stream of the device
// context registered for `place`.
//
//   cpu_             - host vector providing the source bytes.
//   gpu_             - receives the newly created allocation (any previous
//                      value is overwritten by the assignment).
//   gpu_memory_size_ - set to cpu_->size() * sizeof(T), the byte count that
//                      is both allocated and copied.
//   place            - where to allocate and which device context to use.
//
// NOTE(review): unlike CopyToCPUHelper, no Wait() is issued after the copy;
// presumably callers rely on stream ordering - confirm.
//
// The body is compiled only when PADDLE_WITH_CUDA or PADDLE_WITH_HIP is
// defined; otherwise the function does nothing.
template <typename T>
void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
                             paddle::memory::AllocationPtr *gpu_,
                             size_t *gpu_memory_size_,
                             const platform::Place &place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  void *host_src = cpu_->data();

  // Publish the byte count first, then allocate exactly that many bytes.
  *gpu_memory_size_ = cpu_->size() * sizeof(T);
  *gpu_ = memory::Alloc(place, *gpu_memory_size_);
  void *device_dst = (*gpu_)->ptr();

  auto *gpu_ctx = static_cast<platform::CUDADeviceContext *>(
      platform::DeviceContextPool::Instance().Get(place));

  // Host -> device transfer on the context's stream.
  paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(), device_dst,
                       platform::CPUPlace(), host_src, *gpu_memory_size_,
                       gpu_ctx->stream());
#endif
}

// Emits, for one element type, the explicit specializations of
// Vector<T>::VectorData::CopyToCPU() and
// Vector<T>::VectorData::CopyCPUDataToCUDA(place). Each specialization simply
// forwards the VectorData members (cpu_, gpu_, gpu_memory_size_) by address
// to the matching helper template defined earlier in this file.
#define INSTANTIATE_VECTOR_FOR_TYPE(ELEM_T)                              \
  template <>                                                            \
  void Vector<ELEM_T>::VectorData::CopyToCPU() const {                   \
    CopyToCPUHelper<ELEM_T>(&cpu_, &gpu_, &gpu_memory_size_);            \
  }                                                                      \
                                                                         \
  template <>                                                            \
  void Vector<ELEM_T>::VectorData::CopyCPUDataToCUDA(                    \
      const platform::Place &place) const {                              \
    CopyCPUDataToCUDAHelper<ELEM_T>(&cpu_, &gpu_, &gpu_memory_size_,     \
                                    place);                              \
  }

// Element types for which the out-of-line copy routines are provided.
INSTANTIATE_VECTOR_FOR_TYPE(size_t)
INSTANTIATE_VECTOR_FOR_TYPE(int)
INSTANTIATE_VECTOR_FOR_TYPE(int64_t)

}; // namespace framework
} // namespace paddle

0 comments on commit 212d98a

Please sign in to comment.