[Core] Add the graph optimization of subblocks for transformer model #3947

Merged

Changes from all commits (34 commits):
1680551
[Core][ARM] Fix beam_search, eltwise_mul supports broadcast and int64…
hong19860320 Jul 14, 2020
675ac7c
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite…
hong19860320 Jul 14, 2020
289ffe5
test=develop
hong19860320 Jul 14, 2020
7069291
test=develop
hong19860320 Jul 14, 2020
611e603
test=develop
hong19860320 Jul 15, 2020
58ab604
test=develop
hong19860320 Jul 16, 2020
4f5e84a
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite…
hong19860320 Jul 16, 2020
e0955dc
test=develop test=xpu
hong19860320 Jul 17, 2020
9b26148
test=develop test=xpu
hong19860320 Jul 17, 2020
6e2c040
test=develop test=xpu
hong19860320 Jul 17, 2020
c8fd2e9
test=develop test=xpu
hong19860320 Jul 17, 2020
59042c5
test=develop test=xpu
hong19860320 Jul 17, 2020
d9d2795
test=develop test=xpu
hong19860320 Jul 18, 2020
2ccdf11
test=develop test=xpu
hong19860320 Jul 18, 2020
429101c
test=develop test=xpu
hong19860320 Jul 18, 2020
05f11ab
test=develop test=xpu
hong19860320 Jul 19, 2020
657a380
test=develop test=xpu
hong19860320 Jul 19, 2020
f097cc5
test=develop test=xpu
hong19860320 Jul 19, 2020
4cc0c9c
test=develop test=xpu
hong19860320 Jul 19, 2020
b33d98c
test=develop test=xpu
hong19860320 Jul 19, 2020
681ba7b
test=develop test=xpu
hong19860320 Jul 19, 2020
554d84e
test=develop test=xpu
hong19860320 Jul 19, 2020
ff9a631
test=develop test=xpu
hong19860320 Jul 19, 2020
ef95517
test=develop
hong19860320 Jul 20, 2020
7c7bd59
test=develop
hong19860320 Jul 20, 2020
869378a
test=develop
hong19860320 Jul 20, 2020
edb6f96
[ARM] Add the unit test for the transformer model
hong19860320 Jul 20, 2020
10ce221
test=develop
hong19860320 Jul 21, 2020
61b5679
test=develop
hong19860320 Jul 21, 2020
37bfd84
test=develop
hong19860320 Jul 21, 2020
e0c42aa
test=develop
hong19860320 Jul 21, 2020
2e00379
test=develop test=xpu
hong19860320 Jul 21, 2020
8346769
test=develop test=xpu
hong19860320 Jul 21, 2020
3c0bc54
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite…
hong19860320 Jul 22, 2020
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -98,6 +98,7 @@ lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF
lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF)
lite_option(LITE_WITH_LOG "Enable log printing or not." ON)
lite_option(LITE_WITH_EXCEPTION "Enable throwing the exception when error occurs in lite" OFF)
lite_option(LITE_WITH_NVTX "Enable nvtx or not, please enable LITE_WITH_CUDA first." OFF)
lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
4 changes: 4 additions & 0 deletions cmake/configure.cmake
@@ -190,6 +190,10 @@ if (LITE_WITH_LOG)
add_definitions("-DLITE_WITH_LOG")
endif()

if (LITE_WITH_EXCEPTION)
add_definitions("-DLITE_WITH_EXCEPTION")
endif()

if (LITE_ON_TINY_PUBLISH)
add_definitions("-DLITE_ON_TINY_PUBLISH")
endif()
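
The new switch only defines a preprocessor macro; the C++ side decides what to do with it. Below is a minimal sketch of how such a macro is typically consumed. ReportError is an illustrative helper for this write-up, not a Paddle-Lite API: with LITE_WITH_EXCEPTION=ON the error can be thrown and caught by the host application, while the default OFF build can stay on -fno-exceptions and terminate instead.

#include <cstdlib>
#include <stdexcept>
#include <string>

// Illustrative helper: branches on the macro that cmake/configure.cmake
// defines when LITE_WITH_EXCEPTION=ON.
void ReportError(const std::string &msg) {
#ifdef LITE_WITH_EXCEPTION
  // Exception-enabled builds let the caller catch and recover.
  throw std::runtime_error(msg);
#else
  // -fno-exceptions builds keep the binary small and abort on error.
  std::abort();
#endif
}
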
11 changes: 11 additions & 0 deletions cmake/cross_compiling/android.cmake
@@ -80,6 +80,17 @@ if (ARM_TARGET_LANG STREQUAL "clang")
elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(triple arm-v7a-linux-android)
set(LITE_WITH_OPENMP OFF CACHE STRING "Due to libomp's bug(For ARM64, it has been fixed by https://reviews.llvm.org/D19879, but still exists on ARM32), disable OpenMP on armv7 when cross-compiling using Clang" FORCE)
if(ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
# Use CMAKE_CXX_STANDARD_LIBRARIES_INIT to ensure libunwind and libc++ are linked in the right order
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libunwind.a")
if(ANDROID_STL_TYPE STREQUAL "c++_shared")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_shared.so")
elseif(ANDROID_STL_TYPE STREQUAL "c++_static")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_static.a")
else()
message(FATAL_ERROR "Invalid Android STL TYPE: ${ANDROID_STL_TYPE}.")
endif()
endif()
else()
message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7")
endif()
28 changes: 21 additions & 7 deletions cmake/cross_compiling/postproject.cmake
@@ -23,6 +23,21 @@ if(ANDROID)

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC")

# Don't re-export libgcc symbols
set(REMOVE_ATOMIC_GCC_SYMBOLS "-Wl,--exclude-libs,libatomic.a -Wl,--exclude-libs,libgcc.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")

# Only the libunwind.a from clang (with libc++) provides C++ exception handling support for 32-bit ARM
# Refer to https://android.googlesource.com/platform/ndk/+/master/docs/BuildSystemMaintainers.md#Unwinding
if (ARM_TARGET_LANG STREQUAL "clang" AND ARM_TARGET_ARCH_ABI STREQUAL "armv7" AND ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
set(REMOVE_UNWIND_SYMBOLS "-Wl,--exclude-libs,libunwind.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
endif()
endif()

if(ARMLINUX)
@@ -59,14 +74,13 @@ function(check_linker_flag)
endfunction()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if((LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) OR LITE_WITH_PYTHON OR LITE_WITH_EXCEPTION OR (NOT LITE_ON_TINY_PUBLISH))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions -fasynchronous-unwind-tables -funwind-tables")
else ()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -fno-asynchronous-unwind-tables -fno-unwind-tables")
endif()
if (LITE_ON_TINY_PUBLISH)
if((NOT LITE_WITH_PYTHON))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
endif()
if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
check_linker_flag(-Wl,--gc-sections)
endif()
15 changes: 14 additions & 1 deletion cmake/device/npu.cmake
@@ -54,6 +54,11 @@ find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)

# Added in HiAI DDK version 320 or later
find_library(NPU_DDK_HCL_FILE NAMES hcl
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)

if(NOT NPU_DDK_HIAI_FILE)
message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}")
else()
@@ -78,5 +83,13 @@ else()
set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE})
endif()

set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs")
if(NOT NPU_DDK_HCL_FILE)
# message(FATAL_ERROR "Can not find NPU_DDK_HCL_FILE in ${NPU_DDK_ROOT}")
else()
message(STATUS "Found NPU_DDK HCL Library: ${NPU_DDK_HCL_FILE}")
add_library(npu_ddk_hcl SHARED IMPORTED GLOBAL)
set_property(TARGET npu_ddk_hcl PROPERTY IMPORTED_LOCATION ${NPU_DDK_HCL_FILE})
endif()

set(npu_runtime_libs npu_ddk_hiai npu_ddk_hcl CACHE INTERNAL "npu ddk runtime libs")
set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs")
1 change: 1 addition & 0 deletions lite/CMakeLists.txt
@@ -45,6 +45,7 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "transformer_with_mask_fp32.tar.gz")
endif()
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz")
42 changes: 22 additions & 20 deletions lite/api/cxx_api.cc
@@ -37,8 +37,7 @@ void Predictor::SaveModel(const std::string &dir,
if (!program_) {
GenRuntimeProgram();
}
program_->SaveOpInfosToProgram(program_desc_.get());
program_->UpdateVarsOfProgram(program_desc_.get());
program_->SaveToProgram(program_desc_);
switch (model_type) {
case lite_api::LiteModelType::kProtobuf:
SaveModelPb(dir, *program_->exec_scope(), *program_desc_.get(), true);
@@ -58,17 +57,21 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
std::set<std::string> ops_info;
std::set<std::string> kernels_info;
const auto &instructions_ = program_->instructions();
for (auto &node : instructions_) {
// parse op type information
auto op = node.op()->op_info();
ops_info.insert(op->Type());
// parse kernel type information
std::string kernel_type_str =
node.kernel()->op_type() + "," + TargetRepr(node.kernel()->target()) +
"," + PrecisionRepr(node.kernel()->precision()) + "," +
DataLayoutRepr(node.kernel()->layout()) + "," + node.kernel()->alias();
kernels_info.insert(kernel_type_str);
auto block_size = program_->block_size();
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
const auto &insts = program_->instructions(block_idx);
for (auto &inst : insts) {
// parse op type information
auto op = inst.op()->op_info();
ops_info.insert(op->Type());
// parse kernel type information
std::string kernel_type_str =
inst.kernel()->op_type() + "," + TargetRepr(inst.kernel()->target()) +
"," + PrecisionRepr(inst.kernel()->precision()) + "," +
DataLayoutRepr(inst.kernel()->layout()) + "," +
inst.kernel()->alias();
kernels_info.insert(kernel_type_str);
}
}

// get source_file name from op type and kernel type
@@ -170,9 +173,9 @@ void Predictor::PrepareFeedFetch() {

std::vector<const cpp::OpDesc *> feeds;
std::vector<const cpp::OpDesc *> fetchs;
const auto &insts = program_->instructions();
for (size_t i = 0; i < program_->num_instructions(); i++) {
const auto &op = insts[i].op()->op_info();
const auto &insts = program_->instructions(kRootBlockIdx);
for (auto &inst : insts) {
const auto &op = inst.op()->op_info();
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
@@ -255,7 +258,6 @@ void Predictor::Build(const lite_api::CxxConfig &config,
} else {
LOG(INFO) << "Load model from file.";
}

Build(model_path,
model_file,
param_file,
@@ -296,10 +298,10 @@ void Predictor::Build(const std::string &model_path,
Build(program_desc_, valid_places, passes);
}

void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &program_desc,
const std::vector<Place> &valid_places,
const std::vector<std::string> &passes) {
program_desc_ = desc;
program_desc_ = program_desc;
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) {
@@ -336,7 +338,7 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
Place{TARGET(kARM), PRECISION(kInt8)});
}

Program program(*desc.get(), scope_, inner_places);
Program program(program_desc_, scope_, inner_places);
valid_places_ = inner_places;

core::KernelPickFactor factor;
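
To make the new traversal pattern explicit: the sketch below walks every instruction across the root block and all sub-blocks, assuming RuntimeProgram exposes block_size() and instructions(block_idx) exactly as used in this diff. CollectOpTypes itself is an illustrative helper, not part of the PR.

#include <cstddef>
#include <set>
#include <string>

// Illustrative helper: gathers op types from every block, so ops nested in
// a sub-block (e.g. the body of a transformer decoder's "while" op) are
// visited too; a flat walk over the root block alone would miss them.
std::set<std::string> CollectOpTypes(RuntimeProgram *program) {
  std::set<std::string> op_types;
  for (size_t block_idx = 0; block_idx < program->block_size(); ++block_idx) {
    for (auto &inst : program->instructions(block_idx)) {
      op_types.insert(inst.op()->op_info()->Type());
    }
  }
  return op_types;
}
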
29 changes: 14 additions & 15 deletions lite/api/cxx_api.h
@@ -52,12 +52,12 @@ class LITE_API Predictor {
// Create a predictor with the weight variable scope set.
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {}
Predictor(const std::shared_ptr<cpp::ProgramDesc>& desc,
const std::shared_ptr<Scope>& root,
Predictor(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::shared_ptr<Scope>& root_scope,
const std::vector<Place>& valid_places,
const std::vector<std::string>& var_names = {})
: program_desc_(desc), scope_(root) {
Program program(*desc.get(), scope_, valid_places, var_names);
const std::vector<std::string>& vars_to_clone = {})
: program_desc_(program_desc), scope_(root_scope) {
Program program(program_desc_, scope_, valid_places, vars_to_clone);
optimizer_ = Optimizer(std::move(program), valid_places);
exec_scope_ = optimizer_.exec_scope();
valid_places_ = valid_places;
@@ -79,30 +79,28 @@ class LITE_API Predictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool memory_from_memory = false);

void Build(const std::shared_ptr<cpp::ProgramDesc>& desc,
void Build(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::vector<Place>& valid_places,
const std::vector<std::string>& passes = {});

std::shared_ptr<Predictor> Clone() const {
auto predictor =
std::make_shared<Predictor>(program_desc_, scope_, valid_places_);
return predictor;
return std::make_shared<Predictor>(program_desc_, scope_, valid_places_);
}

std::shared_ptr<Predictor> Clone(
const std::vector<std::string>& var_names) const {
const std::vector<std::string>& vars_to_clone) const {
CHECK(program_desc_) << "Both program and scope of current predicotr "
"should be not be nullptr in Clone mode.";
CHECK(scope_) << "Both program and scope of current predicotr should be "
"not be nullptr in Clone mode.";
auto predictor = std::make_shared<Predictor>(
program_desc_, scope_, valid_places_, var_names);
program_desc_, scope_, valid_places_, vars_to_clone);

for (auto i : var_names) {
predictor->exec_scope_->LocalVar(i);
auto* tensor = predictor->scope_->Var(i)->GetMutable<lite::Tensor>();
for (auto var_name : vars_to_clone) {
predictor->exec_scope_->LocalVar(var_name);
auto* tensor = predictor->scope_->Var(var_name)->GetMutable<Tensor>();
auto* sub_tensor =
predictor->exec_scope_->Var(i)->GetMutable<lite::Tensor>();
predictor->exec_scope_->Var(var_name)->GetMutable<Tensor>();
sub_tensor->CopyDataFrom(*tensor);
}
return predictor;
@@ -140,6 +138,7 @@ class LITE_API Predictor {
// get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const;
Scope* scope() { return scope_.get(); }

// This method is disabled in mobile, for unnecessary dependencies required.
void SaveModel(
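
A short usage sketch of the two Clone overloads earlier in this header, assuming a predictor that has already been built; the variable name "fc_0.w_0" is illustrative.

// Shares all variables with the original predictor via the root scope.
auto shared_clone = predictor->Clone();

// Additionally deep-copies the listed variables into the clone's exec
// scope, so each instance can mutate them independently.
auto isolated_clone = predictor->Clone({"fc_0.w_0"});
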
4 changes: 3 additions & 1 deletion lite/api/cxx_api_impl.cc
@@ -74,8 +74,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
mode_ = config.power_mode();
threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exec_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
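
End to end, the model-level setting now flows from the public config into the predictor's root scope, where NPU subgraph kernels can read it at execution time. A minimal sketch, assuming CxxConfig's set_subgraph_model_cache_dir() setter matches the subgraph_model_cache_dir() getter used above; the model and cache paths are illustrative.

#include "paddle_api.h"  // public C++ API

paddle::lite_api::CxxConfig config;
config.set_model_dir("transformer_with_mask_fp32");
// Compiled NPU subgraph models are cached here and reloaded on later runs.
config.set_subgraph_model_cache_dir("/data/local/tmp/npu_cache");
config.set_valid_places(
    {paddle::lite_api::Place{TARGET(kNPU), PRECISION(kFloat)},
     paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)}});
auto predictor = paddle::lite_api::CreatePaddlePredictor(config);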