diff --git a/.cmake-format.py b/.cmake-format.py new file mode 100644 index 0000000000000..62f5651fb1c43 --- /dev/null +++ b/.cmake-format.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ----------------------------- +# Options affecting formatting. +# ----------------------------- +with section("format"): + + # How wide to allow formatted cmake files + line_width = 80 + +# ------------------------------------------------ +# Options affecting comment reflow and formatting. +# ------------------------------------------------ +with section("markup"): + # enable comment markup parsing and reflow + enable_markup = False + + # If comment markup is enabled, don't reflow the first comment block in each + # listfile. Use this to preserve formatting of your copyright/license + # statements. + first_comment_is_literal = True + +# ---------------------------------- +# Options affecting listfile parsing +# ---------------------------------- +with section("parse"): + # Additional FLAGS and KWARGS for custom commands + additional_commands = { + "cc_library": { + "kwargs": { + "SRCS": '*', + "DEPS": '*', + } + }, + "nv_library": { + "kwargs": { + "SRCS": '*', + "DEPS": '*', + } + }, + "hip_library": { + "kwargs": { + "SRCS": '*', + "DEPS": '*', + } + }, + "xpu_library": { + "kwargs": { + "SRCS": '*', + "DEPS": '*', + } + }, + "hip_library": { + "kwargs": { + "SRCS": '*', + "DEPS": '*', + } + }, + "hip_library": { + "kwargs": { + "SRCS": '*', + "DEPS": '*', + } + }, + "go_library": { + "kwargs": { + "SRCS": '*', + "DEPS": '*', + } + }, + "copy": { + "kwargs": { + "SRCS": '*', + "DSTS": '*', + } + }, + "cc_test": { + "kwargs": { + "SRCS": '*', + "DEPS": '*', + } + }, + "nv_test": { + "kwargs": { + "SRCS": '*', + "DEPS": '*', + } + }, + "hip_test": { + "kwargs": { + "SRCS": '*', + "DEPS": '*', + } + }, + "xpu_test": { + "kwargs": { + "SRCS": '*', + "DEPS": '*', + } + }, + "go_test": { + "kwargs": { + "SRCS": '*', + "DEPS": '*', + } + }, + "py_test": { + "kwargs": { + "SRCS": '*', + "DEPS": '*', + } + } + } diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 39d9ae5e0dcd7..4b588cbeb91dc 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,14 +1,19 @@ repos: - repo: https://github.com/Lucas-C/pre-commit-hooks.git - sha: v1.0.1 + rev: v1.1.14 hooks: - id: remove-crlf files: (?!.*third_party)^.*$ | (?!.*book)^.*$ -- repo: https://github.com/PaddlePaddle/mirrors-yapf.git - sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37 +- repo: https://github.com/google/yapf + sha: v0.32.0 hooks: - id: yapf files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ + exclude: | + (?x)^( + python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py| + python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py + )$ - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.1.0 hooks: @@ -28,6 +33,10 @@ repos: entry: bash ./tools/codestyle/clang_format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps)$ + exclude: | + (?x)^( + paddle/fluid/distributed/ps/thirdparty/round_robin.h + )$ - repo: local hooks: - id: cpplint-cpp-source @@ -55,3 +64,13 @@ repos: (?x)^( paddle/utils/.* )$ +- repo: https://github.com/cheshirekow/cmake-format-precommit + rev: v0.6.13 + hooks: + - id: cmake-format + # exclude paddle/fluid/operators/CMakeLists.txt, see the comment + # https://github.com/PaddlePaddle/Paddle/pull/43057#pullrequestreview-993471860 + exclude: | + (?x)^( + paddle/fluid/operators/CMakeLists.txt + )$ diff --git a/CMakeLists.txt b/CMakeLists.txt index f3ed08d56e6d6..70eb5f11ea168 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -255,6 +255,7 @@ option(WITH_POCKETFFT "Compile with pocketfft support" ON) option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF) option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) option(WITH_ARM_BRPC "Supprot Brpc in Arm" OFF) +option(WITH_FLPS "FL PS mode" OFF) if(WITH_RECORD_BUILDTIME) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}") diff --git a/cmake/FindGperftools.cmake b/cmake/FindGperftools.cmake index 318f9f5fd3b5a..bb76469c750b8 100644 --- a/cmake/FindGperftools.cmake +++ b/cmake/FindGperftools.cmake @@ -17,47 +17,46 @@ # GPERFTOOLS_LIBRARIES The Gperftools libraries (tcmalloc & profiler) # GPERFTOOLS_INCLUDE_DIR The location of Gperftools headers -find_library(GPERFTOOLS_TCMALLOC +find_library( + GPERFTOOLS_TCMALLOC NAMES tcmalloc HINTS ${Gperftools_ROOT_DIR}/lib) - -find_library(GPERFTOOLS_PROFILER + +find_library( + GPERFTOOLS_PROFILER NAMES profiler HINTS ${Gperftools_ROOT_DIR}/lib) -find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER +find_library( + GPERFTOOLS_TCMALLOC_AND_PROFILER NAMES tcmalloc_and_profiler HINTS ${Gperftools_ROOT_DIR}/lib) -find_path(GPERFTOOLS_INCLUDE_DIR +find_path( + GPERFTOOLS_INCLUDE_DIR NAMES gperftools/heap-profiler.h HINTS ${Gperftools_ROOT_DIR}/include) set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER}) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args( - Gperftools - DEFAULT_MSG - GPERFTOOLS_LIBRARIES - GPERFTOOLS_INCLUDE_DIR) +find_package_handle_standard_args(Gperftools DEFAULT_MSG GPERFTOOLS_LIBRARIES + GPERFTOOLS_INCLUDE_DIR) mark_as_advanced( - Gperftools_ROOT_DIR - GPERFTOOLS_TCMALLOC - GPERFTOOLS_PROFILER - GPERFTOOLS_TCMALLOC_AND_PROFILER - GPERFTOOLS_LIBRARIES - GPERFTOOLS_INCLUDE_DIR) + Gperftools_ROOT_DIR GPERFTOOLS_TCMALLOC GPERFTOOLS_PROFILER + GPERFTOOLS_TCMALLOC_AND_PROFILER GPERFTOOLS_LIBRARIES GPERFTOOLS_INCLUDE_DIR) # create IMPORTED targets -if (Gperftools_FOUND AND NOT TARGET gperftools::tcmalloc) +if(Gperftools_FOUND AND NOT TARGET gperftools::tcmalloc) add_library(gperftools::tcmalloc UNKNOWN IMPORTED) - set_target_properties(gperftools::tcmalloc PROPERTIES - IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC} - INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}") + set_target_properties( + gperftools::tcmalloc + PROPERTIES IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC} + INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}") add_library(gperftools::profiler UNKNOWN IMPORTED) - set_target_properties(gperftools::profiler PROPERTIES - IMPORTED_LOCATION ${GPERFTOOLS_PROFILER} - INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}") + set_target_properties( + gperftools::profiler + PROPERTIES IMPORTED_LOCATION ${GPERFTOOLS_PROFILER} + INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}") endif() diff --git a/cmake/FindNumPy.cmake b/cmake/FindNumPy.cmake index 8cdd642ac0131..fc7cdb8c1923c 100644 --- a/cmake/FindNumPy.cmake +++ b/cmake/FindNumPy.cmake @@ -14,13 +14,14 @@ if(NOT PYTHON_EXECUTABLE) endif() endif() -if (PYTHON_EXECUTABLE) +if(PYTHON_EXECUTABLE) # write a python script that finds the numpy path file(WRITE ${PROJECT_BINARY_DIR}/FindNumpyPath.py - "try: import numpy; print(numpy.get_include())\nexcept:pass\n") + "try: import numpy; print(numpy.get_include())\nexcept:pass\n") # execute the find script - exec_program("${PYTHON_EXECUTABLE}" ${PROJECT_BINARY_DIR} + exec_program( + "${PYTHON_EXECUTABLE}" ${PROJECT_BINARY_DIR} ARGS "FindNumpyPath.py" OUTPUT_VARIABLE NUMPY_PATH) elseif(_numpy_out) @@ -28,10 +29,12 @@ elseif(_numpy_out) endif(PYTHON_EXECUTABLE) find_path(PYTHON_NUMPY_INCLUDE_DIR numpy/arrayobject.h - HINTS "${NUMPY_PATH}" "${PYTHON_INCLUDE_PATH}") + HINTS "${NUMPY_PATH}" "${PYTHON_INCLUDE_PATH}") if(PYTHON_NUMPY_INCLUDE_DIR) - set(PYTHON_NUMPY_FOUND 1 CACHE INTERNAL "Python numpy found") + set(PYTHON_NUMPY_FOUND + 1 + CACHE INTERNAL "Python numpy found") endif(PYTHON_NUMPY_INCLUDE_DIR) include(FindPackageHandleStandardArgs) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 92a526a2b58a7..304246da4aea6 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -31,9 +31,9 @@ endif() ## Find MKLML First. if(WITH_MKLML) - include(external/mklml) # download, install mklml package + include(external/mklml) # download, install mklml package set(CBLAS_PROVIDER MKLML) - set(CBLAS_INC_DIR ${MKLML_INC_DIR}) + set(CBLAS_INC_DIR ${MKLML_INC_DIR}) set(CBLAS_LIBRARIES ${MKLML_LIB}) add_definitions(-DPADDLE_WITH_MKLML) @@ -43,40 +43,48 @@ if(WITH_MKLML) target_link_libraries(cblas dynload_mklml) message(STATUS "Found cblas and lapack in MKLML " - "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") endif() ## Then find openblas. if(NOT DEFINED CBLAS_PROVIDER) - set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas") + set(OPENBLAS_ROOT + $ENV{OPENBLAS_ROOT} + CACHE PATH "Folder contains Openblas") set(OPENBLAS_INCLUDE_SEARCH_PATHS - ${OPENBLAS_ROOT}/include - /usr/include - /usr/include/lapacke - /usr/include/openblas - /usr/local/opt/openblas/include) + ${OPENBLAS_ROOT}/include /usr/include /usr/include/lapacke + /usr/include/openblas /usr/local/opt/openblas/include) set(OPENBLAS_LIB_SEARCH_PATHS - ${OPENBLAS_ROOT}/lib - /usr/lib - /usr/lib/blas/openblas - /usr/lib/openblas - /usr/local/opt/openblas/lib) - - find_path(OPENBLAS_INC_DIR NAMES cblas.h - PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH) - find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h + ${OPENBLAS_ROOT}/lib /usr/lib /usr/lib/blas/openblas /usr/lib/openblas + /usr/local/opt/openblas/lib) + + find_path( + OPENBLAS_INC_DIR + NAMES cblas.h + PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} + NO_DEFAULT_PATH) + find_path( + OPENBLAS_LAPACKE_INC_DIR + NAMES lapacke.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) - find_path(OPENBLAS_CONFIG_INC_DIR NAMES openblas_config.h + find_path( + OPENBLAS_CONFIG_INC_DIR + NAMES openblas_config.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) - find_library(OPENBLAS_LIB NAMES openblas + find_library( + OPENBLAS_LIB + NAMES openblas PATHS ${OPENBLAS_LIB_SEARCH_PATHS}) - if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_CONFIG_INC_DIR AND OPENBLAS_LIB) + if(OPENBLAS_LAPACKE_INC_DIR + AND OPENBLAS_INC_DIR + AND OPENBLAS_CONFIG_INC_DIR + AND OPENBLAS_LIB) file(READ "${OPENBLAS_CONFIG_INC_DIR}/openblas_config.h" config_file) string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file}) string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp}) - - if (${ver} VERSION_GREATER_EQUAL "0.3.5") + + if(${ver} VERSION_GREATER_EQUAL "0.3.5") set(CBLAS_PROVIDER OPENBLAS) set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR}) set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) @@ -84,52 +92,61 @@ if(NOT DEFINED CBLAS_PROVIDER) add_definitions(-DPADDLE_USE_OPENBLAS) add_definitions(-DLAPACK_FOUND) - message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") - message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") + message( + STATUS + "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})" + ) + message( + STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})" + ) endif() endif() endif() ## Then find the reference-cblas if WITH_SYSTEM_BLAS. www.netlib.org/blas/ if(NOT DEFINED CBLAS_PROVIDER AND WITH_SYSTEM_BLAS) - set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH - "Folder contains reference-cblas") - set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/include - /usr/include - /usr/include/cblas - ) + set(REFERENCE_CBLAS_ROOT + $ENV{REFERENCE_CBLAS_ROOT} + CACHE PATH "Folder contains reference-cblas") + set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include + /usr/include /usr/include/cblas) set(REFERENCE_CBLAS_LIB_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/lib - /usr/lib - /usr/lib/blas/reference/ - /usr/lib/reference/ - ) - - find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS - ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS}) - find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS - ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) - find_library(REFERENCE_BLAS_LIBRARY NAMES blas PATHS - ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) + ${REFERENCE_CBLAS_ROOT}/lib /usr/lib /usr/lib/blas/reference/ + /usr/lib/reference/) + + find_path( + REFERENCE_CBLAS_INCLUDE_DIR + NAMES cblas.h + PATHS ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS}) + find_library( + REFERENCE_CBLAS_LIBRARY + NAMES cblas + PATHS ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) + find_library( + REFERENCE_BLAS_LIBRARY + NAMES blas + PATHS ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) set(CBLAS_PROVIDER REFERENCE_CBLAS) set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR}) set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY}) add_definitions(-DPADDLE_USE_REFERENCE_CBLAS) - message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message( + STATUS + "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})" + ) endif() endif() ## Then build openblas by external_project if(NOT DEFINED CBLAS_PROVIDER) - include(external/openblas) # download, build, install openblas + include(external/openblas) # download, build, install openblas set(CBLAS_PROVIDER EXTERN_OPENBLAS) add_dependencies(cblas extern_openblas) add_definitions(-DPADDLE_USE_OPENBLAS) message(STATUS "Build OpenBLAS by External Project " - "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") endif() # FIXME(gangliao): generate cblas target to track all high performance @@ -137,7 +154,8 @@ endif() include_directories(${CBLAS_INC_DIR}) if(${CBLAS_PROVIDER} STREQUAL REFERENCE_CBLAS) - target_link_libraries(cblas gfortran ${CBLAS_LIBRARIES} ${REFERENCE_BLAS_LIBRARY}) + target_link_libraries(cblas gfortran ${CBLAS_LIBRARIES} + ${REFERENCE_BLAS_LIBRARY}) elseif(NOT ${CBLAS_PROVIDER} STREQUAL MKLML) target_link_libraries(cblas ${CBLAS_LIBRARIES}) endif() diff --git a/cmake/ccache.cmake b/cmake/ccache.cmake index 5520720f7a6c7..85bc0e987a6b6 100644 --- a/cmake/ccache.cmake +++ b/cmake/ccache.cmake @@ -1,29 +1,34 @@ # Use ccache if found ccache program if(NOT WIN32) - find_program(CCACHE_PATH ccache) - if(CCACHE_PATH) - execute_process(COMMAND ccache -V OUTPUT_VARIABLE ccache_output) - execute_process(COMMAND ccache -s cache directory OUTPUT_VARIABLE cache_directory) - string(REGEX MATCH "[0-9]+.[0-9]+" ccache_version ${ccache_output}) - message(STATUS "ccache is founded, use ccache to speed up compile on Unix.") - # show statistics summary of ccache - message("ccache version\t\t\t " ${ccache_version} "\n" ${cache_directory}) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH}) - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH}) - endif(CCACHE_PATH) + find_program(CCACHE_PATH ccache) + if(CCACHE_PATH) + execute_process(COMMAND ccache -V OUTPUT_VARIABLE ccache_output) + execute_process(COMMAND ccache -s cache directory + OUTPUT_VARIABLE cache_directory) + string(REGEX MATCH "[0-9]+.[0-9]+" ccache_version ${ccache_output}) + message(STATUS "ccache is founded, use ccache to speed up compile on Unix.") + # show statistics summary of ccache + message("ccache version\t\t\t " ${ccache_version} "\n" + ${cache_directory}) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH}) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH}) + endif(CCACHE_PATH) elseif("${CMAKE_GENERATOR}" STREQUAL "Ninja") - # (Note:zhouwei25) Only Ninja Generator can support sccache now - find_program(SCCACHE_PATH sccache) + # (Note:zhouwei25) Only Ninja Generator can support sccache now + find_program(SCCACHE_PATH sccache) - if(SCCACHE_PATH) - execute_process(COMMAND sccache -V OUTPUT_VARIABLE sccache_version) - message(STATUS "sccache is founded, use [${SCCACHE_PATH}] to speed up compile on Windows.") + if(SCCACHE_PATH) + execute_process(COMMAND sccache -V OUTPUT_VARIABLE sccache_version) + message( + STATUS + "sccache is founded, use [${SCCACHE_PATH}] to speed up compile on Windows." + ) - set(CMAKE_C_COMPILER_LAUNCHER ${SCCACHE_PATH}) - set(CMAKE_CXX_COMPILER_LAUNCHER ${SCCACHE_PATH}) - # (Note:zhouwei25) sccache for cuda compiler has bug so that it can't be hit - # refer to https://github.com/mozilla/sccache/issues/1017, so we fix it - set(CMAKE_CUDA_COMPILER_LAUNCHER ${SCCACHE_PATH}) - endif(SCCACHE_PATH) + set(CMAKE_C_COMPILER_LAUNCHER ${SCCACHE_PATH}) + set(CMAKE_CXX_COMPILER_LAUNCHER ${SCCACHE_PATH}) + # (Note:zhouwei25) sccache for cuda compiler has bug so that it can't be hit + # refer to https://github.com/mozilla/sccache/issues/1017, so we fix it + set(CMAKE_CUDA_COMPILER_LAUNCHER ${SCCACHE_PATH}) + endif(SCCACHE_PATH) endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 5608b6f6f348b..91464b84ef029 100755 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -13,188 +13,195 @@ # limitations under the License. if(NOT WITH_PYTHON) - add_definitions(-DPADDLE_NO_PYTHON) + add_definitions(-DPADDLE_NO_PYTHON) endif(NOT WITH_PYTHON) if(WITH_TESTING) - add_definitions(-DPADDLE_WITH_TESTING) + add_definitions(-DPADDLE_WITH_TESTING) endif(WITH_TESTING) if(WITH_INFERENCE_API_TEST) - add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST) + add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST) endif(WITH_INFERENCE_API_TEST) if(NOT WITH_PROFILER) - add_definitions(-DPADDLE_DISABLE_PROFILER) + add_definitions(-DPADDLE_DISABLE_PROFILER) endif(NOT WITH_PROFILER) if(WITH_AVX AND AVX_FOUND) - set(SIMD_FLAG ${AVX_FLAG}) - add_definitions(-DPADDLE_WITH_AVX) + set(SIMD_FLAG ${AVX_FLAG}) + add_definitions(-DPADDLE_WITH_AVX) elseif(SSE3_FOUND AND NOT WIN32) - set(SIMD_FLAG ${SSE3_FLAG}) + set(SIMD_FLAG ${SSE3_FLAG}) endif() -if (SSE3_FOUND) - # TODO: Runtime detection should be used here. - add_definitions(-DPADDLE_WITH_SSE3) +if(SSE3_FOUND) + # TODO: Runtime detection should be used here. + add_definitions(-DPADDLE_WITH_SSE3) endif() if(WIN32) # windows header option for all targets. add_definitions(-D_XKEYCHECK_H) - # Use symbols instead of absolute path, reduce the cmake link command length. - SET(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1) - SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1) - SET(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) - SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1) - SET(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1) - SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1) - SET(CMAKE_C_RESPONSE_FILE_LINK_FLAG "@") - SET(CMAKE_CXX_RESPONSE_FILE_LINK_FLAG "@") + # Use symbols instead of absolute path, reduce the cmake link command length. + set(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1) + set(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1) + set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) + set(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1) + set(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1) + set(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1) + set(CMAKE_C_RESPONSE_FILE_LINK_FLAG "@") + set(CMAKE_CXX_RESPONSE_FILE_LINK_FLAG "@") add_definitions(-DPADDLE_DLL_INFERENCE) # set definition for the dll export - if (NOT MSVC) - message(FATAL "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA.") + if(NOT MSVC) + message( + FATAL + "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA." + ) endif(NOT MSVC) endif(WIN32) if(WITH_MUSL) - add_definitions(-DPADDLE_WITH_MUSL) + add_definitions(-DPADDLE_WITH_MUSL) - message(STATUS, "Set compile option WITH_MKL=OFF when WITH_MUSL=ON") - SET(WITH_MKL OFF) + message(STATUS, "Set compile option WITH_MKL=OFF when WITH_MUSL=ON") + set(WITH_MKL OFF) - message(STATUS, "Set compile option WITH_GPU=OFF when WITH_MUSL=ON") - SET(WITH_GPU OFF) + message(STATUS, "Set compile option WITH_GPU=OFF when WITH_MUSL=ON") + set(WITH_GPU OFF) endif() if(WITH_PSLIB) - add_definitions(-DPADDLE_WITH_PSLIB) + add_definitions(-DPADDLE_WITH_PSLIB) endif() if(WITH_ARM_BRPC) - add_definitions(-DPADDLE_WITH_ARM_BRPC) + add_definitions(-DPADDLE_WITH_ARM_BRPC) +endif() + +if(WITH_FLPS) + add_definitions(-DPADDLE_WITH_FLPS) endif() if(WITH_GLOO) - add_definitions(-DPADDLE_WITH_GLOO) + add_definitions(-DPADDLE_WITH_GLOO) endif() if(WITH_BOX_PS) - add_definitions(-DPADDLE_WITH_BOX_PS) + add_definitions(-DPADDLE_WITH_BOX_PS) endif() if(WITH_ASCEND) - add_definitions(-DPADDLE_WITH_ASCEND) + add_definitions(-DPADDLE_WITH_ASCEND) endif() if(WITH_ASCEND_CL) - add_definitions(-DPADDLE_WITH_ASCEND_CL) + add_definitions(-DPADDLE_WITH_ASCEND_CL) endif() if(WITH_ASCEND_INT64) - add_definitions(-DPADDLE_WITH_ASCEND_INT64) + add_definitions(-DPADDLE_WITH_ASCEND_INT64) endif() if(WITH_XPU) - message(STATUS "Compile with XPU!") - add_definitions(-DPADDLE_WITH_XPU) + message(STATUS "Compile with XPU!") + add_definitions(-DPADDLE_WITH_XPU) endif() if(WITH_XPU_KP) - message(STATUS "Compile with XPU_KP!") - add_definitions(-DPADDLE_WITH_XPU_KP) + message(STATUS "Compile with XPU_KP!") + add_definitions(-DPADDLE_WITH_XPU_KP) endif() if(WITH_IPU) - message(STATUS "Compile with IPU!") - add_definitions(-DPADDLE_WITH_IPU) + message(STATUS "Compile with IPU!") + add_definitions(-DPADDLE_WITH_IPU) endif() if(WITH_MLU) - message(STATUS "Compile with MLU!") - add_definitions(-DPADDLE_WITH_MLU) + message(STATUS "Compile with MLU!") + add_definitions(-DPADDLE_WITH_MLU) endif() if(WITH_GPU) - add_definitions(-DPADDLE_WITH_CUDA) - add_definitions(-DEIGEN_USE_GPU) + add_definitions(-DPADDLE_WITH_CUDA) + add_definitions(-DEIGEN_USE_GPU) - FIND_PACKAGE(CUDA REQUIRED) + find_package(CUDA REQUIRED) - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.1) - message(FATAL_ERROR "Paddle needs CUDA >= 10.1 to compile") - endif() + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.1) + message(FATAL_ERROR "Paddle needs CUDA >= 10.1 to compile") + endif() - if(NOT CUDNN_FOUND) - message(FATAL_ERROR "Paddle needs cudnn to compile") - endif() + if(NOT CUDNN_FOUND) + message(FATAL_ERROR "Paddle needs cudnn to compile") + endif() - if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) - message(FATAL_ERROR "Paddle needs CUDNN >= 7.0 to compile") - endif() + if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + message(FATAL_ERROR "Paddle needs CUDNN >= 7.0 to compile") + endif() + + if(CUPTI_FOUND) + include_directories(${CUPTI_INCLUDE_DIR}) + add_definitions(-DPADDLE_WITH_CUPTI) + else() + message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.") + endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=\"${SIMD_FLAG}\"") - if(CUPTI_FOUND) - include_directories(${CUPTI_INCLUDE_DIR}) - add_definitions(-DPADDLE_WITH_CUPTI) + # Include cuda and cudnn + include_directories(${CUDNN_INCLUDE_DIR}) + include_directories(${CUDA_TOOLKIT_INCLUDE}) + + if(TENSORRT_FOUND) + if(WIN32) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 9) + message(FATAL_ERROR "TensorRT needs CUDA >= 9.0 to compile on Windows") + endif() else() - message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.") - endif() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=\"${SIMD_FLAG}\"") - - # Include cuda and cudnn - include_directories(${CUDNN_INCLUDE_DIR}) - include_directories(${CUDA_TOOLKIT_INCLUDE}) - - if(TENSORRT_FOUND) - if(WIN32) - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 9) - message(FATAL_ERROR "TensorRT needs CUDA >= 9.0 to compile on Windows") - endif() - else() - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 8) - message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile") - endif() - if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) - message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile") - endif() - if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4) - message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile") - endif() - endif() - include_directories(${TENSORRT_INCLUDE_DIR}) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 8) + message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile") + endif() + if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile") + endif() + if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4) + message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile") + endif() endif() + include_directories(${TENSORRT_INCLUDE_DIR}) + endif() elseif(WITH_ROCM) - add_definitions(-DPADDLE_WITH_HIP) - add_definitions(-DEIGEN_USE_GPU) - add_definitions(-DEIGEN_USE_HIP) + add_definitions(-DPADDLE_WITH_HIP) + add_definitions(-DEIGEN_USE_GPU) + add_definitions(-DEIGEN_USE_HIP) - if(NOT MIOPEN_FOUND) - message(FATAL_ERROR "Paddle needs MIOpen to compile") - endif() + if(NOT MIOPEN_FOUND) + message(FATAL_ERROR "Paddle needs MIOpen to compile") + endif() - if(${MIOPEN_VERSION} VERSION_LESS 2090) - message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile") - endif() + if(${MIOPEN_VERSION} VERSION_LESS 2090) + message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile") + endif() else() - add_definitions(-DHPPL_STUB_FUNC) - list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) + add_definitions(-DHPPL_STUB_FUNC) + list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) endif() -if (WITH_MKLML AND MKLML_IOMP_LIB) - message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") - if(WIN32) - # openmp not support well for now on windows - set(OPENMP_FLAGS "") - else(WIN32) - set(OPENMP_FLAGS "-fopenmp") - endif(WIN32) - set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) - set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") +if(WITH_MKLML AND MKLML_IOMP_LIB) + message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") + if(WIN32) + # openmp not support well for now on windows + set(OPENMP_FLAGS "") + else(WIN32) + set(OPENMP_FLAGS "-fopenmp") + endif(WIN32) + set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") @@ -205,25 +212,25 @@ if(WITH_DISTRIBUTE) endif() if(WITH_PSCORE) - add_definitions(-DPADDLE_WITH_PSCORE) + add_definitions(-DPADDLE_WITH_PSCORE) endif() if(WITH_HETERPS) - add_definitions(-DPADDLE_WITH_HETERPS) + add_definitions(-DPADDLE_WITH_HETERPS) endif() if(WITH_BRPC_RDMA) - add_definitions(-DPADDLE_WITH_BRPC_RDMA) + add_definitions(-DPADDLE_WITH_BRPC_RDMA) endif(WITH_BRPC_RDMA) if(ON_INFER) - add_definitions(-DPADDLE_ON_INFERENCE) + add_definitions(-DPADDLE_ON_INFERENCE) endif(ON_INFER) if(WITH_CRYPTO) - add_definitions(-DPADDLE_WITH_CRYPTO) + add_definitions(-DPADDLE_WITH_CRYPTO) endif(WITH_CRYPTO) if(WITH_CUSTOM_DEVICE AND NOT WIN32) - add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE) + add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE) endif() diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake index 598754bc9efaa..02c1a136280f7 100644 --- a/cmake/coveralls.cmake +++ b/cmake/coveralls.cmake @@ -5,107 +5,106 @@ # Param _COVERALLS_UPLOAD Upload the result to coveralls. # Param _CMAKE_SCRIPT_PATH CMake script path. function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH) - # clean previous gcov data. - file(REMOVE_RECURSE ${PROJECT_BINARY_DIR}/*.gcda) + # clean previous gcov data. + file(REMOVE_RECURSE ${PROJECT_BINARY_DIR}/*.gcda) - # find curl for upload JSON soon. - if (_COVERALLS_UPLOAD) - find_program(CURL_EXECUTABLE curl) - if (NOT CURL_EXECUTABLE) - message(FATAL_ERROR "Coveralls: curl not found!") - endif() + # find curl for upload JSON soon. + if(_COVERALLS_UPLOAD) + find_program(CURL_EXECUTABLE curl) + if(NOT CURL_EXECUTABLE) + message(FATAL_ERROR "Coveralls: curl not found!") endif() + endif() - # When passing a CMake list to an external process, the list - # will be converted from the format "1;2;3" to "1 2 3". - set(COVERAGE_SRCS "") - foreach (SINGLE_SRC ${_COVERAGE_SRCS}) - set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}") - endforeach() + # When passing a CMake list to an external process, the list + # will be converted from the format "1;2;3" to "1 2 3". + set(COVERAGE_SRCS "") + foreach(SINGLE_SRC ${_COVERAGE_SRCS}) + set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}") + endforeach() - # query number of logical cores - cmake_host_system_information(RESULT core_size QUERY NUMBER_OF_LOGICAL_CORES) - # coveralls json file. - set(COVERALLS_FILE ${PROJECT_BINARY_DIR}/coveralls.json) - add_custom_target(coveralls_generate - # Run regress tests. - COMMAND ${CMAKE_CTEST_COMMAND} - -j ${core_size} - --output-on-failure - # Generate Gcov and translate it into coveralls JSON. - COMMAND ${CMAKE_COMMAND} - -DCOVERAGE_SRCS="${COVERAGE_SRCS}" - -DCOVERALLS_OUTPUT_FILE="${COVERALLS_FILE}" - -DCOV_PATH="${PROJECT_BINARY_DIR}" - -DPROJECT_ROOT="${PROJECT_SOURCE_DIR}" - -P "${_CMAKE_SCRIPT_PATH}/coverallsGcovJsons.cmake" - WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - COMMENT "Coveralls: generating coveralls output..." - ) + # query number of logical cores + cmake_host_system_information(RESULT core_size QUERY NUMBER_OF_LOGICAL_CORES) + # coveralls json file. + set(COVERALLS_FILE ${PROJECT_BINARY_DIR}/coveralls.json) + add_custom_target( + coveralls_generate + # Run regress tests. + COMMAND ${CMAKE_CTEST_COMMAND} -j ${core_size} --output-on-failure + # Generate Gcov and translate it into coveralls JSON. + COMMAND + ${CMAKE_COMMAND} -DCOVERAGE_SRCS="${COVERAGE_SRCS}" + -DCOVERALLS_OUTPUT_FILE="${COVERALLS_FILE}" + -DCOV_PATH="${PROJECT_BINARY_DIR}" -DPROJECT_ROOT="${PROJECT_SOURCE_DIR}" + -P "${_CMAKE_SCRIPT_PATH}/coverallsGcovJsons.cmake" + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + COMMENT "Coveralls: generating coveralls output...") - if (_COVERALLS_UPLOAD) - message("COVERALLS UPLOAD: ON") - # Upload the JSON to coveralls. - add_custom_target(coveralls_upload - COMMAND ${CURL_EXECUTABLE} - -S -F json_file=@${COVERALLS_FILE} - https://coveralls.io/api/v1/jobs - DEPENDS coveralls_generate - WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - COMMENT "Coveralls: uploading coveralls output...") + if(_COVERALLS_UPLOAD) + message("COVERALLS UPLOAD: ON") + # Upload the JSON to coveralls. + add_custom_target( + coveralls_upload + COMMAND ${CURL_EXECUTABLE} -S -F json_file=@${COVERALLS_FILE} + https://coveralls.io/api/v1/jobs + DEPENDS coveralls_generate + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + COMMENT "Coveralls: uploading coveralls output...") - add_custom_target(coveralls DEPENDS coveralls_upload) - else() - message("COVERALLS UPLOAD: OFF") - add_custom_target(coveralls DEPENDS coveralls_generate) - endif() + add_custom_target(coveralls DEPENDS coveralls_upload) + else() + message("COVERALLS UPLOAD: OFF") + add_custom_target(coveralls DEPENDS coveralls_generate) + endif() endfunction() if(WITH_COVERAGE) - if (WITH_INCREMENTAL_COVERAGE) - # if *.h changed, generate coverage report totaly. - # if pybind.cc changed, generate coverage report totaly. - # Because if pybind.cc add '-g -O0 -fprofile-arcs -ftest-coverage' only, some testcase will fail. - if ( (NOT ("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL "")) OR ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" MATCHES "pybind.cc") ) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") - endif() - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") + if(WITH_INCREMENTAL_COVERAGE) + # if *.h changed, generate coverage report totaly. + # if pybind.cc changed, generate coverage report totaly. + # Because if pybind.cc add '-g -O0 -fprofile-arcs -ftest-coverage' only, some testcase will fail. + if((NOT ("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL "")) + OR ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" MATCHES "pybind.cc")) + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") + set(CMAKE_C_FLAGS + "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") endif() - set(EXCLUDE_DIRS - "demo/" - "build/" - "tests/" - ".test_env/" - ) + else() + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") + endif() + set(EXCLUDE_DIRS "demo/" "build/" "tests/" ".test_env/") - if(WITH_GPU) - file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" ".c" "*.cu") - else() - file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c") - endif() + if(WITH_GPU) + file( + GLOB_RECURSE PADDLE_SOURCES + RELATIVE "${PROJECT_SOURCE_DIR}" + "*.cpp" "*.cc" ".c" "*.cu") + else() + file( + GLOB_RECURSE PADDLE_SOURCES + RELATIVE "${PROJECT_SOURCE_DIR}" + "*.cpp" "*.cc" "*.c") + endif() - # exclude trivial files in PADDLE_SOURCES - foreach(EXCLUDE_DIR ${EXCLUDE_DIRS}) - foreach(TMP_PATH ${PADDLE_SOURCES}) - string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND) - if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1) - list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH}) - endif() - endforeach(TMP_PATH) - endforeach() + # exclude trivial files in PADDLE_SOURCES + foreach(EXCLUDE_DIR ${EXCLUDE_DIRS}) + foreach(TMP_PATH ${PADDLE_SOURCES}) + string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND) + if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1) + list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH}) + endif() + endforeach(TMP_PATH) + endforeach() - # convert to absolute path - set(PADDLE_SRCS "") - foreach(PADDLE_SRC ${PADDLE_SOURCES}) - set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}") - endforeach() + # convert to absolute path + set(PADDLE_SRCS "") + foreach(PADDLE_SRC ${PADDLE_SOURCES}) + set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}") + endforeach() - code_coverage( - "${PADDLE_SRCS}" - ${COVERALLS_UPLOAD} - "${PROJECT_SOURCE_DIR}/cmake" - ) + code_coverage("${PADDLE_SRCS}" ${COVERALLS_UPLOAD} + "${PROJECT_SOURCE_DIR}/cmake") endif() diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake index 4d813a0726dc0..6c1186f69f14d 100644 --- a/cmake/coverallsGcovJsons.cmake +++ b/cmake/coverallsGcovJsons.cmake @@ -32,7 +32,7 @@ # https://coveralls.io/docs/api # -CMAKE_MINIMUM_REQUIRED(VERSION 2.8) +cmake_minimum_required(VERSION 2.8) # Since it's not possible to pass a CMake list properly in the # "1;2;3" format to an external process, we have replaced the @@ -41,44 +41,42 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8) string(REGEX REPLACE "\\*" ";" COVERAGE_SRCS ${COVERAGE_SRCS}) find_program(GCOV_EXECUTABLE gcov) -if (NOT GCOV_EXECUTABLE) - message(FATAL_ERROR "gcov not found! Aborting...") +if(NOT GCOV_EXECUTABLE) + message(FATAL_ERROR "gcov not found! Aborting...") endif() find_package(Git) # TODO: Add these git things to the coveralls json. -if (GIT_FOUND) - # Branch. - execute_process( - COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - OUTPUT_VARIABLE GIT_BRANCH - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - - macro (git_log_format FORMAT_CHARS VAR_NAME) - execute_process( - COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%${FORMAT_CHARS} - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - OUTPUT_VARIABLE ${VAR_NAME} - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - endmacro() - - git_log_format(an GIT_AUTHOR_EMAIL) - git_log_format(ae GIT_AUTHOR_EMAIL) - git_log_format(cn GIT_COMMITTER_NAME) - git_log_format(ce GIT_COMMITTER_EMAIL) - git_log_format(B GIT_COMMIT_MESSAGE) - - message("Git exe: ${GIT_EXECUTABLE}") - message("Git branch: ${GIT_BRANCH}") - message("Git author: ${GIT_AUTHOR_NAME}") - message("Git e-mail: ${GIT_AUTHOR_EMAIL}") - message("Git commiter name: ${GIT_COMMITTER_NAME}") - message("Git commiter e-mail: ${GIT_COMMITTER_EMAIL}") - message("Git commit message: ${GIT_COMMIT_MESSAGE}") +if(GIT_FOUND) + # Branch. + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GIT_BRANCH + OUTPUT_STRIP_TRAILING_WHITESPACE) + + macro(git_log_format FORMAT_CHARS VAR_NAME) + execute_process( + COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%${FORMAT_CHARS} + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE ${VAR_NAME} + OUTPUT_STRIP_TRAILING_WHITESPACE) + endmacro() + + git_log_format(an GIT_AUTHOR_EMAIL) + git_log_format(ae GIT_AUTHOR_EMAIL) + git_log_format(cn GIT_COMMITTER_NAME) + git_log_format(ce GIT_COMMITTER_EMAIL) + git_log_format(B GIT_COMMIT_MESSAGE) + + message("Git exe: ${GIT_EXECUTABLE}") + message("Git branch: ${GIT_BRANCH}") + message("Git author: ${GIT_AUTHOR_NAME}") + message("Git e-mail: ${GIT_AUTHOR_EMAIL}") + message("Git commiter name: ${GIT_COMMITTER_NAME}") + message("Git commiter e-mail: ${GIT_COMMITTER_EMAIL}") + message("Git commit message: ${GIT_COMMIT_MESSAGE}") endif() @@ -95,15 +93,15 @@ endif() # macro(get_source_path_from_gcov_filename _SRC_FILENAME _GCOV_FILENAME) - # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov - # -> - # #path#to#project#root#subdir#the_file.c.gcov - get_filename_component(_GCOV_FILENAME_WEXT ${_GCOV_FILENAME} NAME) + # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov + # -> + # #path#to#project#root#subdir#the_file.c.gcov + get_filename_component(_GCOV_FILENAME_WEXT ${_GCOV_FILENAME} NAME) - # #path#to#project#root#subdir#the_file.c.gcov -> /path/to/project/root/subdir/the_file.c - string(REGEX REPLACE "\\.gcov$" "" SRC_FILENAME_TMP ${_GCOV_FILENAME_WEXT}) - string(REGEX REPLACE "\#" "/" SRC_FILENAME_TMP ${SRC_FILENAME_TMP}) - set(${_SRC_FILENAME} "${SRC_FILENAME_TMP}") + # #path#to#project#root#subdir#the_file.c.gcov -> /path/to/project/root/subdir/the_file.c + string(REGEX REPLACE "\\.gcov$" "" SRC_FILENAME_TMP ${_GCOV_FILENAME_WEXT}) + string(REGEX REPLACE "\#" "/" SRC_FILENAME_TMP ${SRC_FILENAME_TMP}) + set(${_SRC_FILENAME} "${SRC_FILENAME_TMP}") endmacro() ############################################################################## @@ -117,26 +115,24 @@ message("===============================") # (The directories the .gcda files and .o files are found in) # and run gcov on those. foreach(GCDA ${GCDA_FILES}) - get_filename_component(GCDA_DIR ${GCDA} PATH) - - # - # The -p below refers to "Preserve path components", - # This means that the generated gcov filename of a source file will - # keep the original files entire filepath, but / is replaced with #. - # Example: - # - # /path/to/project/root/build/CMakeFiles/the_file.dir/subdir/the_file.c.gcda - # ------------------------------------------------------------------------------ - # File '/path/to/project/root/subdir/the_file.c' - # Lines executed:68.34% of 199 - # /path/to/project/root/subdir/the_file.c:creating '#path#to#project#root#subdir#the_file.c.gcov' - # - # If -p is not specified then the file is named only "the_file.c.gcov" - # - execute_process( - COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null - WORKING_DIRECTORY ${GCDA_DIR} - ) + get_filename_component(GCDA_DIR ${GCDA} PATH) + + # + # The -p below refers to "Preserve path components", + # This means that the generated gcov filename of a source file will + # keep the original files entire filepath, but / is replaced with #. + # Example: + # + # /path/to/project/root/build/CMakeFiles/the_file.dir/subdir/the_file.c.gcda + # ------------------------------------------------------------------------------ + # File '/path/to/project/root/subdir/the_file.c' + # Lines executed:68.34% of 199 + # /path/to/project/root/subdir/the_file.c:creating '#path#to#project#root#subdir#the_file.c.gcov' + # + # If -p is not specified then the file is named only "the_file.c.gcov" + # + execute_process(COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} + >/dev/null WORKING_DIRECTORY ${GCDA_DIR}) endforeach() # TODO: Make these be absolute path @@ -164,9 +160,9 @@ file(GLOB_RECURSE ALL_GCOV_FILES "${COV_PATH}" "*.gcov") # ALL_GCOV_FILES = # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov # /path/to/project/root/build/#path#to#project#root#subdir#other_file.c.gcov -# +# # Result should be: -# GCOV_FILES = +# GCOV_FILES = # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov # set(GCOV_FILES "") @@ -176,29 +172,29 @@ message("===============================") set(COVERAGE_SRCS_REMAINING ${COVERAGE_SRCS}) -foreach (GCOV_FILE ${ALL_GCOV_FILES}) - - # - # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov - # -> - # /path/to/project/root/subdir/the_file.c - get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE}) - - # Is this in the list of source files? - # TODO: We want to match against relative path filenames from the source file root... - list(FIND COVERAGE_SRCS ${GCOV_SRC_PATH} WAS_FOUND) - - if (NOT WAS_FOUND EQUAL -1) - message("YES: ${GCOV_FILE}") - list(APPEND GCOV_FILES ${GCOV_FILE}) - - # We remove it from the list, so we don't bother searching for it again. - # Also files left in COVERAGE_SRCS_REMAINING after this loop ends should - # have coverage data generated from them (no lines are covered). - list(REMOVE_ITEM COVERAGE_SRCS_REMAINING ${GCOV_SRC_PATH}) - else() - message("NO: ${GCOV_FILE}") - endif() +foreach(GCOV_FILE ${ALL_GCOV_FILES}) + + # + # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov + # -> + # /path/to/project/root/subdir/the_file.c + get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE}) + + # Is this in the list of source files? + # TODO: We want to match against relative path filenames from the source file root... + list(FIND COVERAGE_SRCS ${GCOV_SRC_PATH} WAS_FOUND) + + if(NOT WAS_FOUND EQUAL -1) + message("YES: ${GCOV_FILE}") + list(APPEND GCOV_FILES ${GCOV_FILE}) + + # We remove it from the list, so we don't bother searching for it again. + # Also files left in COVERAGE_SRCS_REMAINING after this loop ends should + # have coverage data generated from them (no lines are covered). + list(REMOVE_ITEM COVERAGE_SRCS_REMAINING ${GCOV_SRC_PATH}) + else() + message("NO: ${GCOV_FILE}") + endif() endforeach() # TODO: Enable setting these @@ -206,20 +202,18 @@ set(JSON_SERVICE_NAME "travis-ci") set(JSON_SERVICE_JOB_ID $ENV{TRAVIS_JOB_ID}) set(JSON_TEMPLATE -"{ + "{ \"service_name\": \"\@JSON_SERVICE_NAME\@\", \"service_job_id\": \"\@JSON_SERVICE_JOB_ID\@\", \"source_files\": \@JSON_GCOV_FILES\@ -}" -) +}") set(SRC_FILE_TEMPLATE -"{ + "{ \"name\": \"\@GCOV_SRC_REL_PATH\@\", \"source_digest\": \"\@GCOV_CONTENTS_MD5\@\", \"coverage\": \@GCOV_FILE_COVERAGE\@ - }" -) + }") message("\nGenerate JSON for files:") message("=========================") @@ -227,163 +221,163 @@ message("=========================") set(JSON_GCOV_FILES "[") # Read the GCOV files line by line and get the coverage data. -foreach (GCOV_FILE ${GCOV_FILES}) - - get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE}) - file(RELATIVE_PATH GCOV_SRC_REL_PATH "${PROJECT_ROOT}" "${GCOV_SRC_PATH}") - - # The new coveralls API doesn't need the entire source (Yay!) - # However, still keeping that part for now. Will cleanup in the future. - file(MD5 "${GCOV_SRC_PATH}" GCOV_CONTENTS_MD5) - message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}") - - # Loads the gcov file as a list of lines. - # (We first open the file and replace all occurrences of [] with _ - # because CMake will fail to parse a line containing unmatched brackets... - # also the \ to escaped \n in macros screws up things.) - # https://public.kitware.com/Bug/view.php?id=15369 - file(READ ${GCOV_FILE} GCOV_CONTENTS) - string(REPLACE "[" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") - string(REPLACE "]" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") - string(REPLACE "\\" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") - file(WRITE ${GCOV_FILE}_tmp "${GCOV_CONTENTS}") - - file(STRINGS ${GCOV_FILE}_tmp GCOV_LINES) - list(LENGTH GCOV_LINES LINE_COUNT) - - # Instead of trying to parse the source from the - # gcov file, simply read the file contents from the source file. - # (Parsing it from the gcov is hard because C-code uses ; in many places - # which also happens to be the same as the CMake list delimeter). - file(READ ${GCOV_SRC_PATH} GCOV_FILE_SOURCE) - - string(REPLACE "\\" "\\\\" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - string(REGEX REPLACE "\"" "\\\\\"" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - string(REPLACE "\t" "\\\\t" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - string(REPLACE "\r" "\\\\r" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - string(REPLACE "\n" "\\\\n" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - # According to http://json.org/ these should be escaped as well. - # Don't know how to do that in CMake however... - #string(REPLACE "\b" "\\\\b" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - #string(REPLACE "\f" "\\\\f" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - #string(REGEX REPLACE "\u([a-fA-F0-9]{4})" "\\\\u\\1" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - - # We want a json array of coverage data as a single string - # start building them from the contents of the .gcov - set(GCOV_FILE_COVERAGE "[") - - set(GCOV_LINE_COUNT 1) # Line number for the .gcov. - set(DO_SKIP 0) - foreach (GCOV_LINE ${GCOV_LINES}) - #message("${GCOV_LINE}") - # Example of what we're parsing: - # Hitcount |Line | Source - # " 8: 26: if (!allowed || (strlen(allowed) == 0))" - string(REGEX REPLACE - "^([^:]*):([^:]*):(.*)$" - "\\1;\\2;\\3" - RES - "${GCOV_LINE}") - - # Check if we should exclude lines using the Lcov syntax. - string(REGEX MATCH "LCOV_EXCL_START" START_SKIP "${GCOV_LINE}") - string(REGEX MATCH "LCOV_EXCL_END" END_SKIP "${GCOV_LINE}") - string(REGEX MATCH "LCOV_EXCL_LINE" LINE_SKIP "${GCOV_LINE}") - - set(RESET_SKIP 0) - if (LINE_SKIP AND NOT DO_SKIP) - set(DO_SKIP 1) - set(RESET_SKIP 1) - endif() - - if (START_SKIP) - set(DO_SKIP 1) - message("${GCOV_LINE_COUNT}: Start skip") - endif() - - if (END_SKIP) - set(DO_SKIP 0) - endif() - - list(LENGTH RES RES_COUNT) - - if (RES_COUNT GREATER 2) - list(GET RES 0 HITCOUNT) - list(GET RES 1 LINE) - list(GET RES 2 SOURCE) - - string(STRIP ${HITCOUNT} HITCOUNT) - string(STRIP ${LINE} LINE) - - # Lines with 0 line numbers are metadata and can be ignored. - if (NOT ${LINE} EQUAL 0) - - if (DO_SKIP) - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ") - else() - # Translate the hitcount into valid JSON values. - if (${HITCOUNT} STREQUAL "#####") - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ") - elseif (${HITCOUNT} STREQUAL "-") - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ") - else() - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}${HITCOUNT}, ") - endif() - endif() - endif() - else() - message(WARNING "Failed to properly parse line (RES_COUNT = ${RES_COUNT}) ${GCOV_FILE}:${GCOV_LINE_COUNT}\n-->${GCOV_LINE}") - endif() - - if (RESET_SKIP) - set(DO_SKIP 0) - endif() - math(EXPR GCOV_LINE_COUNT "${GCOV_LINE_COUNT}+1") - endforeach() - - message("${GCOV_LINE_COUNT} of ${LINE_COUNT} lines read!") - - # Advanced way of removing the trailing comma in the JSON array. - # "[1, 2, 3, " -> "[1, 2, 3" - string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE}) - - # Append the trailing ] to complete the JSON array. - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") - - # Generate the final JSON for this file. - message("Generate JSON for file: ${GCOV_SRC_REL_PATH}...") - string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) - - set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") +foreach(GCOV_FILE ${GCOV_FILES}) + + get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE}) + file(RELATIVE_PATH GCOV_SRC_REL_PATH "${PROJECT_ROOT}" "${GCOV_SRC_PATH}") + + # The new coveralls API doesn't need the entire source (Yay!) + # However, still keeping that part for now. Will cleanup in the future. + file(MD5 "${GCOV_SRC_PATH}" GCOV_CONTENTS_MD5) + message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}") + + # Loads the gcov file as a list of lines. + # (We first open the file and replace all occurrences of [] with _ + # because CMake will fail to parse a line containing unmatched brackets... + # also the \ to escaped \n in macros screws up things.) + # https://public.kitware.com/Bug/view.php?id=15369 + file(READ ${GCOV_FILE} GCOV_CONTENTS) + string(REPLACE "[" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") + string(REPLACE "]" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") + string(REPLACE "\\" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") + file(WRITE ${GCOV_FILE}_tmp "${GCOV_CONTENTS}") + + file(STRINGS ${GCOV_FILE}_tmp GCOV_LINES) + list(LENGTH GCOV_LINES LINE_COUNT) + + # Instead of trying to parse the source from the + # gcov file, simply read the file contents from the source file. + # (Parsing it from the gcov is hard because C-code uses ; in many places + # which also happens to be the same as the CMake list delimeter). + file(READ ${GCOV_SRC_PATH} GCOV_FILE_SOURCE) + + string(REPLACE "\\" "\\\\" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + string(REGEX REPLACE "\"" "\\\\\"" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + string(REPLACE "\t" "\\\\t" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + string(REPLACE "\r" "\\\\r" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + string(REPLACE "\n" "\\\\n" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + # According to http://json.org/ these should be escaped as well. + # Don't know how to do that in CMake however... + #string(REPLACE "\b" "\\\\b" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + #string(REPLACE "\f" "\\\\f" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + #string(REGEX REPLACE "\u([a-fA-F0-9]{4})" "\\\\u\\1" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + + # We want a json array of coverage data as a single string + # start building them from the contents of the .gcov + set(GCOV_FILE_COVERAGE "[") + + set(GCOV_LINE_COUNT 1) # Line number for the .gcov. + set(DO_SKIP 0) + foreach(GCOV_LINE ${GCOV_LINES}) + #message("${GCOV_LINE}") + # Example of what we're parsing: + # Hitcount |Line | Source + # " 8: 26: if (!allowed || (strlen(allowed) == 0))" + string(REGEX REPLACE "^([^:]*):([^:]*):(.*)$" "\\1;\\2;\\3" RES + "${GCOV_LINE}") + + # Check if we should exclude lines using the Lcov syntax. + string(REGEX MATCH "LCOV_EXCL_START" START_SKIP "${GCOV_LINE}") + string(REGEX MATCH "LCOV_EXCL_END" END_SKIP "${GCOV_LINE}") + string(REGEX MATCH "LCOV_EXCL_LINE" LINE_SKIP "${GCOV_LINE}") + + set(RESET_SKIP 0) + if(LINE_SKIP AND NOT DO_SKIP) + set(DO_SKIP 1) + set(RESET_SKIP 1) + endif() + + if(START_SKIP) + set(DO_SKIP 1) + message("${GCOV_LINE_COUNT}: Start skip") + endif() + + if(END_SKIP) + set(DO_SKIP 0) + endif() + + list(LENGTH RES RES_COUNT) + + if(RES_COUNT GREATER 2) + list(GET RES 0 HITCOUNT) + list(GET RES 1 LINE) + list(GET RES 2 SOURCE) + + string(STRIP ${HITCOUNT} HITCOUNT) + string(STRIP ${LINE} LINE) + + # Lines with 0 line numbers are metadata and can be ignored. + if(NOT ${LINE} EQUAL 0) + + if(DO_SKIP) + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ") + else() + # Translate the hitcount into valid JSON values. + if(${HITCOUNT} STREQUAL "#####") + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ") + elseif(${HITCOUNT} STREQUAL "-") + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ") + else() + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}${HITCOUNT}, ") + endif() + endif() + endif() + else() + message( + WARNING + "Failed to properly parse line (RES_COUNT = ${RES_COUNT}) ${GCOV_FILE}:${GCOV_LINE_COUNT}\n-->${GCOV_LINE}" + ) + endif() + + if(RESET_SKIP) + set(DO_SKIP 0) + endif() + math(EXPR GCOV_LINE_COUNT "${GCOV_LINE_COUNT}+1") + endforeach() + + message("${GCOV_LINE_COUNT} of ${LINE_COUNT} lines read!") + + # Advanced way of removing the trailing comma in the JSON array. + # "[1, 2, 3, " -> "[1, 2, 3" + string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE}) + + # Append the trailing ] to complete the JSON array. + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") + + # Generate the final JSON for this file. + message("Generate JSON for file: ${GCOV_SRC_REL_PATH}...") + string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) + + set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") endforeach() # Loop through all files we couldn't find any coverage for # as well, and generate JSON for those as well with 0% coverage. foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING}) - # Loads the source file as a list of lines. - file(STRINGS ${NOT_COVERED_SRC} SRC_LINES) + # Loads the source file as a list of lines. + file(STRINGS ${NOT_COVERED_SRC} SRC_LINES) - set(GCOV_FILE_COVERAGE "[") - set(GCOV_FILE_SOURCE "") + set(GCOV_FILE_COVERAGE "[") + set(GCOV_FILE_SOURCE "") - foreach (SOURCE ${SRC_LINES}) - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ") + foreach(SOURCE ${SRC_LINES}) + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ") - string(REPLACE "\\" "\\\\" SOURCE "${SOURCE}") - string(REGEX REPLACE "\"" "\\\\\"" SOURCE "${SOURCE}") - string(REPLACE "\t" "\\\\t" SOURCE "${SOURCE}") - string(REPLACE "\r" "\\\\r" SOURCE "${SOURCE}") - set(GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}${SOURCE}\\n") - endforeach() + string(REPLACE "\\" "\\\\" SOURCE "${SOURCE}") + string(REGEX REPLACE "\"" "\\\\\"" SOURCE "${SOURCE}") + string(REPLACE "\t" "\\\\t" SOURCE "${SOURCE}") + string(REPLACE "\r" "\\\\r" SOURCE "${SOURCE}") + set(GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}${SOURCE}\\n") + endforeach() - # Remove trailing comma, and complete JSON array with ] - string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE}) - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") + # Remove trailing comma, and complete JSON array with ] + string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE}) + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") - # Generate the final JSON for this file. - string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) - set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") + # Generate the final JSON for this file. + string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) + set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") endforeach() # Get rid of trailing comma. @@ -395,7 +389,9 @@ message("Generate final JSON...") string(CONFIGURE ${JSON_TEMPLATE} JSON) file(WRITE "${COVERALLS_OUTPUT_FILE}" "${JSON}") -message("###########################################################################") -message("Generated coveralls JSON containing coverage data:") +message( + "###########################################################################") +message("Generated coveralls JSON containing coverage data:") message("${COVERALLS_OUTPUT_FILE}") -message("###########################################################################") +message( + "###########################################################################") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 4894d615c2a35..aa958786cb8f4 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -1,5 +1,5 @@ if(NOT WITH_GPU) - return() + return() endif() if(WITH_NV_JETSON) @@ -38,7 +38,9 @@ function(detect_installed_gpus out_variable) if(NOT CUDA_gpu_detect_output) set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) - file(WRITE ${cufile} "" + file( + WRITE ${cufile} + "" "#include \"stdio.h\"\n" "#include \"cuda.h\"\n" "#include \"cuda_runtime.h\"\n" @@ -54,55 +56,86 @@ function(detect_installed_gpus out_variable) " return 0;\n" "}\n") - execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" - "--run" "${cufile}" - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${cufile}" + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE nvcc_res + OUTPUT_VARIABLE nvcc_out + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) if(nvcc_res EQUAL 0) # only keep the last line of nvcc_out - STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}") - STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}") + string(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}") + string(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}") list(GET nvcc_out -1 nvcc_out) string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}") - set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE) + set(CUDA_gpu_detect_output + ${nvcc_out} + CACHE INTERNAL + "Returned GPU architetures from detect_installed_gpus tool" + FORCE) endif() endif() if(NOT CUDA_gpu_detect_output) - message(STATUS "Automatic GPU detection failed. Building for all known architectures.") - set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE) + message( + STATUS + "Automatic GPU detection failed. Building for all known architectures.") + set(${out_variable} + ${paddle_known_gpu_archs} + PARENT_SCOPE) else() - set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) + set(${out_variable} + ${CUDA_gpu_detect_output} + PARENT_SCOPE) endif() endfunction() - ######################################################################## # Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME # Usage: # select_nvcc_arch_flags(out_variable) function(select_nvcc_arch_flags out_variable) # List of arch names - set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "All" "Manual") + set(archs_names + "Kepler" + "Maxwell" + "Pascal" + "Volta" + "Turing" + "Ampere" + "All" + "Manual") set(archs_name_default "Auto") list(APPEND archs_names "Auto") # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui) - set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.") - set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} ) + set(CUDA_ARCH_NAME + ${archs_name_default} + CACHE STRING "Select target NVIDIA GPU achitecture.") + set_property(CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names}) mark_as_advanced(CUDA_ARCH_NAME) # verify CUDA_ARCH_NAME value if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};") string(REPLACE ";" ", " archs_names "${archs_names}") - message(FATAL_ERROR "Only ${archs_names} architectures names are supported.") + message( + FATAL_ERROR "Only ${archs_names} architectures names are supported.") endif() if(${CUDA_ARCH_NAME} STREQUAL "Manual") - set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") - set(CUDA_ARCH_PTX "" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + set(CUDA_ARCH_BIN + ${paddle_known_gpu_archs} + CACHE + STRING + "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported" + ) + set(CUDA_ARCH_PTX + "" + CACHE + STRING + "Specify 'virtual' PTX architectures to build PTX intermediate code for" + ) mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) else() unset(CUDA_ARCH_BIN CACHE) @@ -112,19 +145,19 @@ function(select_nvcc_arch_flags out_variable) if(${CUDA_ARCH_NAME} STREQUAL "Kepler") set(cuda_arch_bin "30 35") elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") - if (WITH_NV_JETSON) + if(WITH_NV_JETSON) set(cuda_arch_bin "53") else() set(cuda_arch_bin "50") endif() elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") - if (WITH_NV_JETSON) + if(WITH_NV_JETSON) set(cuda_arch_bin "62") else() set(cuda_arch_bin "60 61") endif() elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") - if (WITH_NV_JETSON) + if(WITH_NV_JETSON) set(cuda_arch_bin "72") else() set(cuda_arch_bin "70") @@ -132,35 +165,37 @@ function(select_nvcc_arch_flags out_variable) elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") set(cuda_arch_bin "75") elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere") - if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0 + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0 set(cuda_arch_bin "80") - elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.1+ + elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.1+ set(cuda_arch_bin "80 86") endif() elseif(${CUDA_ARCH_NAME} STREQUAL "All") set(cuda_arch_bin ${paddle_known_gpu_archs}) elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") - message(STATUS "WARNING: This is just a warning for publishing release. + message( + STATUS + "WARNING: This is just a warning for publishing release. You are building GPU version without supporting different architectures. So the wheel package may fail on other GPU architectures. You can add -DCUDA_ARCH_NAME=All in cmake command to get a full wheel package to resolve this warning. While, this version will still work on local GPU architecture.") detect_installed_gpus(cuda_arch_bin) - else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") + else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") set(cuda_arch_bin ${CUDA_ARCH_BIN}) endif() if(NEW_RELEASE_JIT) - set(cuda_arch_ptx "${cuda_arch_ptx}${cuda_arch_bin}") - set(cuda_arch_bin "") + set(cuda_arch_ptx "${cuda_arch_ptx}${cuda_arch_bin}") + set(cuda_arch_bin "") endif() # remove dots and convert to lists string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}") string(REGEX REPLACE "\\." "" cuda_arch_ptx "${cuda_arch_ptx}") string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") - string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") + string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") list(REMOVE_DUPLICATES cuda_arch_bin) list(REMOVE_DUPLICATES cuda_arch_ptx) @@ -172,7 +207,8 @@ function(select_nvcc_arch_flags out_variable) foreach(arch ${cuda_arch_bin}) if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)") # User explicitly specified PTX for the concrete BIN - string(APPEND nvcc_flags " -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}") + string(APPEND nvcc_flags + " -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}") string(APPEND nvcc_archs_readable " sm_${CMAKE_MATCH_1}") else() # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN @@ -183,34 +219,39 @@ function(select_nvcc_arch_flags out_variable) # Tell NVCC to add PTX intermediate code for the specified architectures foreach(arch ${cuda_arch_ptx}) - string(APPEND nvcc_flags " -gencode arch=compute_${arch},code=compute_${arch}") + string(APPEND nvcc_flags + " -gencode arch=compute_${arch},code=compute_${arch}") string(APPEND nvcc_archs_readable " compute_${arch}") endforeach() string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}") - set(${out_variable} ${nvcc_flags} PARENT_SCOPE) - set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE) + set(${out_variable} + ${nvcc_flags} + PARENT_SCOPE) + set(${out_variable}_readable + ${nvcc_archs_readable} + PARENT_SCOPE) endfunction() message(STATUS "CUDA detected: " ${CMAKE_CUDA_COMPILER_VERSION}) -if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x +if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.2) # CUDA 11.0/11.1 +elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.2) # CUDA 11.0/11.1 set(paddle_known_gpu_archs ${paddle_known_gpu_archs11}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+ +elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+ set(paddle_known_gpu_archs "${paddle_known_gpu_archs11} 86") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") endif() -if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) +if(NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) add_definitions("-DTRT_PLUGIN_FP16_AVALIABLE") endif() @@ -231,7 +272,7 @@ set(CMAKE_CUDA_STANDARD 14) # (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflic with -w # So replace /W[1-4] with /W0 -if (WIN32) +if(WIN32) string(REGEX REPLACE "/W[1-4]" " /W0 " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") endif(WIN32) # in cuda9, suppress cuda warning on eigen @@ -242,15 +283,16 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") if(WIN32) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"/wd4244 /wd4267 /wd4819 \"") + set(CMAKE_CUDA_FLAGS + "${CMAKE_CUDA_FLAGS} -Xcompiler \"/wd4244 /wd4267 /wd4819 \"") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /bigobj") if(MSVC_STATIC_CRT) foreach(flag_var - CMAKE_CUDA_FLAGS CMAKE_CUDA_FLAGS_DEBUG CMAKE_CUDA_FLAGS_RELEASE - CMAKE_CUDA_FLAGS_MINSIZEREL CMAKE_CUDA_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "-MD") - string(REGEX REPLACE "-MD" "-MT" ${flag_var} "${${flag_var}}") - endif() + CMAKE_CUDA_FLAGS CMAKE_CUDA_FLAGS_DEBUG CMAKE_CUDA_FLAGS_RELEASE + CMAKE_CUDA_FLAGS_MINSIZEREL CMAKE_CUDA_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "-MD") + string(REGEX REPLACE "-MD" "-MT" ${flag_var} "${${flag_var}}") + endif() endforeach(flag_var) endif() endif() diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index c82847100abef..2e5131d217a50 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -1,107 +1,113 @@ if(NOT WITH_GPU) - return() + return() endif() if(WIN32) - set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) + set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) else(WIN32) - set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") + set(CUDNN_ROOT + "/usr" + CACHE PATH "CUDNN ROOT") endif(WIN32) -find_path(CUDNN_INCLUDE_DIR cudnn.h - PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include - $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} - NO_DEFAULT_PATH -) +find_path( + CUDNN_INCLUDE_DIR cudnn.h + PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include $ENV{CUDNN_ROOT} + $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} + NO_DEFAULT_PATH) get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) set(TARGET_ARCH "x86_64") if(NOT ${CMAKE_SYSTEM_PROCESSOR}) - set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) endif() -list(APPEND CUDNN_CHECK_LIBRARY_DIRS - ${CUDNN_ROOT} - ${CUDNN_ROOT}/lib64 - ${CUDNN_ROOT}/lib - ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu - ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ - $ENV{CUDNN_ROOT} - $ENV{CUDNN_ROOT}/lib64 - $ENV{CUDNN_ROOT}/lib - /usr/lib - ${CUDA_TOOLKIT_ROOT_DIR} - ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 - ) +list( + APPEND + CUDNN_CHECK_LIBRARY_DIRS + ${CUDNN_ROOT} + ${CUDNN_ROOT}/lib64 + ${CUDNN_ROOT}/lib + ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu + ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ + $ENV{CUDNN_ROOT} + $ENV{CUDNN_ROOT}/lib64 + $ENV{CUDNN_ROOT}/lib + /usr/lib + ${CUDA_TOOLKIT_ROOT_DIR} + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64) set(CUDNN_LIB_NAME "") -if (LINUX) - set(CUDNN_LIB_NAME "libcudnn.so") +if(LINUX) + set(CUDNN_LIB_NAME "libcudnn.so") endif(LINUX) if(WIN32) - # only support cudnn7 - set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll") + # only support cudnn7 + set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll") endif(WIN32) if(APPLE) - set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so") + set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so") endif(APPLE) -find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a - PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist} - NO_DEFAULT_PATH - DOC "Path to cuDNN library.") - +find_library( + CUDNN_LIBRARY + NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a + PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist} + NO_DEFAULT_PATH + DOC "Path to cuDNN library.") if(CUDNN_INCLUDE_DIR AND CUDNN_LIBRARY) - set(CUDNN_FOUND ON) + set(CUDNN_FOUND ON) else() - set(CUDNN_FOUND OFF) + set(CUDNN_FOUND OFF) endif() -macro(find_cudnn_version cudnn_header_file) - file(READ ${cudnn_header_file} CUDNN_VERSION_FILE_CONTENTS) - get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY) - - string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)" - CUDNN_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define CUDNN_VERSION +([0-9]+)" "\\1" - CUDNN_VERSION "${CUDNN_VERSION}") - - if("${CUDNN_VERSION}" STREQUAL "2000") - message(STATUS "Current cuDNN version is v2. ") +macro(find_cudnn_version cudnn_header_file) + file(READ ${cudnn_header_file} CUDNN_VERSION_FILE_CONTENTS) + get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY) + + string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)" CUDNN_VERSION + "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_VERSION +([0-9]+)" "\\1" CUDNN_VERSION + "${CUDNN_VERSION}") + + if("${CUDNN_VERSION}" STREQUAL "2000") + message(STATUS "Current cuDNN version is v2. ") + else() + string(REGEX MATCH "define CUDNN_MAJOR +([0-9]+)" CUDNN_MAJOR_VERSION + "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_MAJOR +([0-9]+)" "\\1" + CUDNN_MAJOR_VERSION "${CUDNN_MAJOR_VERSION}") + string(REGEX MATCH "define CUDNN_MINOR +([0-9]+)" CUDNN_MINOR_VERSION + "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_MINOR +([0-9]+)" "\\1" + CUDNN_MINOR_VERSION "${CUDNN_MINOR_VERSION}") + string(REGEX MATCH "define CUDNN_PATCHLEVEL +([0-9]+)" + CUDNN_PATCHLEVEL_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_PATCHLEVEL +([0-9]+)" "\\1" + CUDNN_PATCHLEVEL_VERSION "${CUDNN_PATCHLEVEL_VERSION}") + + if(NOT CUDNN_MAJOR_VERSION) + set(CUDNN_VERSION "???") else() - string(REGEX MATCH "define CUDNN_MAJOR +([0-9]+)" CUDNN_MAJOR_VERSION - "${CUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define CUDNN_MAJOR +([0-9]+)" "\\1" - CUDNN_MAJOR_VERSION "${CUDNN_MAJOR_VERSION}") - string(REGEX MATCH "define CUDNN_MINOR +([0-9]+)" CUDNN_MINOR_VERSION - "${CUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define CUDNN_MINOR +([0-9]+)" "\\1" - CUDNN_MINOR_VERSION "${CUDNN_MINOR_VERSION}") - string(REGEX MATCH "define CUDNN_PATCHLEVEL +([0-9]+)" - CUDNN_PATCHLEVEL_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define CUDNN_PATCHLEVEL +([0-9]+)" "\\1" - CUDNN_PATCHLEVEL_VERSION "${CUDNN_PATCHLEVEL_VERSION}") - - if(NOT CUDNN_MAJOR_VERSION) - set(CUDNN_VERSION "???") - else() - add_definitions("-DCUDNN_MAJOR_VERSION=\"${CUDNN_MAJOR_VERSION}\"") - math(EXPR CUDNN_VERSION - "${CUDNN_MAJOR_VERSION} * 1000 + + add_definitions("-DCUDNN_MAJOR_VERSION=\"${CUDNN_MAJOR_VERSION}\"") + math(EXPR CUDNN_VERSION "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") - message(STATUS "Current cuDNN header is ${cudnn_header_file} " - "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. ") - endif() + message( + STATUS + "Current cuDNN header is ${cudnn_header_file} " + "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. " + ) endif() + endif() endmacro() if(CUDNN_FOUND) - find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn.h) - if (NOT CUDNN_MAJOR_VERSION) - find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn_version.h) + find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn.h) + if(NOT CUDNN_MAJOR_VERSION) + find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn_version.h) endif() endif() diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake index 2d7b1917b6873..6bf0141c208c7 100644 --- a/cmake/cupti.cmake +++ b/cmake/cupti.cmake @@ -1,44 +1,51 @@ if(NOT WITH_GPU) - return() + return() endif() - -set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT") -find_path(CUPTI_INCLUDE_DIR cupti.h - PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include - $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include +set(CUPTI_ROOT + "/usr" + CACHE PATH "CUPTI ROOT") +find_path( + CUPTI_INCLUDE_DIR cupti.h + PATHS ${CUPTI_ROOT} + ${CUPTI_ROOT}/include + $ENV{CUPTI_ROOT} + $ENV{CUPTI_ROOT}/include ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include ${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/include ${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux/include - NO_DEFAULT_PATH - ) + NO_DEFAULT_PATH) get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) set(TARGET_ARCH "x86_64") if(NOT ${CMAKE_SYSTEM_PROCESSOR}) - set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) endif() -list(APPEND CUPTI_CHECK_LIBRARY_DIRS - ${CUPTI_ROOT} - ${CUPTI_ROOT}/lib64 - ${CUPTI_ROOT}/lib - ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu - $ENV{CUPTI_ROOT} - $ENV{CUPTI_ROOT}/lib64 - $ENV{CUPTI_ROOT}/lib - /usr/lib - ${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib64 - ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64) -find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a - PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist} - NO_DEFAULT_PATH - DOC "Path to cuPTI library.") +list( + APPEND + CUPTI_CHECK_LIBRARY_DIRS + ${CUPTI_ROOT} + ${CUPTI_ROOT}/lib64 + ${CUPTI_ROOT}/lib + ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu + $ENV{CUPTI_ROOT} + $ENV{CUPTI_ROOT}/lib64 + $ENV{CUPTI_ROOT}/lib + /usr/lib + ${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib64 + ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64) +find_library( + CUPTI_LIBRARY + NAMES libcupti.so libcupti.dylib # libcupti_static.a + PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist} + NO_DEFAULT_PATH + DOC "Path to cuPTI library.") get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY) if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY) - set(CUPTI_FOUND ON) + set(CUPTI_FOUND ON) else() - set(CUPTI_FOUND OFF) + set(CUPTI_FOUND OFF) endif() diff --git a/cmake/experimental.cmake b/cmake/experimental.cmake index 55e7fe263f9dc..0e4b197645673 100644 --- a/cmake/experimental.cmake +++ b/cmake/experimental.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/cmake/experiments/cuda_module_loading_lazy.cmake b/cmake/experiments/cuda_module_loading_lazy.cmake index ef6a51b594b9e..0f0793a8ee32b 100644 --- a/cmake/experiments/cuda_module_loading_lazy.cmake +++ b/cmake/experiments/cuda_module_loading_lazy.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -16,25 +16,35 @@ # cuda moduel lazy loading is supported by CUDA 11.6+ # this experiment option makes Paddle supports lazy loading before CUDA 11.6. -option(EXP_CUDA_MODULE_LOADING_LAZY "enable lazy cuda module loading" OFF) -if (${EXP_CUDA_MODULE_LOADING_LAZY}) - if (NOT ${ON_INFER} OR NOT ${LINUX}) - message("EXP_CUDA_MODULE_LOADING_LAZY only works with ON_INFER=ON on Linux platforms") +option(EXP_CUDA_MODULE_LOADING_LAZY "enable lazy cuda module loading" OFF) +if(${EXP_CUDA_MODULE_LOADING_LAZY}) + if(NOT ${ON_INFER} OR NOT ${LINUX}) + message( + "EXP_CUDA_MODULE_LOADING_LAZY only works with ON_INFER=ON on Linux platforms" + ) return() - endif () - if (NOT ${CUDA_FOUND}) + endif() + if(NOT ${CUDA_FOUND}) message("EXP_CUDA_MODULE_LOADING_LAZY only works with CUDA") return() - endif () - if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11.6") + endif() + if(${CUDA_VERSION} VERSION_GREATER_EQUAL "11.6") message("cuda 11.6+ already support lazy module loading") return() - endif () + endif() - message("for cuda before 11.6, libcudart.so must be used for the lazy module loading trick to work, instead of libcudart_static.a") - set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE BOOL "" FORCE) + message( + "for cuda before 11.6, libcudart.so must be used for the lazy module loading trick to work, instead of libcudart_static.a" + ) + set(CUDA_USE_STATIC_CUDA_RUNTIME + OFF + CACHE BOOL "" FORCE) set(CMAKE_CUDA_FLAGS "--cudart shared") enable_language(CUDA) - set(CUDA_NVCC_EXECUTABLE "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" CACHE FILEPATH "" FORCE) - set(CMAKE_CUDA_COMPILER "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" CACHE FILEPATH "" FORCE) + set(CUDA_NVCC_EXECUTABLE + "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" + CACHE FILEPATH "" FORCE) + set(CMAKE_CUDA_COMPILER + "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" + CACHE FILEPATH "" FORCE) endif() diff --git a/cmake/external/arm_brpc.cmake b/cmake/external/arm_brpc.cmake index 83935ae0c6346..660261d3ffcce 100755 --- a/cmake/external/arm_brpc.cmake +++ b/cmake/external/arm_brpc.cmake @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) #find_package(OpenSSL REQUIRED) @@ -25,52 +25,56 @@ INCLUDE(ExternalProject) #ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) #SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY}) -IF((NOT DEFINED ARM_BRPC_NAME) OR (NOT DEFINED ARM_BRPC_URL)) - SET(ARM_BRPC_VER "1.1.0" CACHE STRING "" FORCE) - SET(ARM_BRPC_NAME "arm_brpc" CACHE STRING "" FORCE) -ENDIF() +if((NOT DEFINED ARM_BRPC_NAME) OR (NOT DEFINED ARM_BRPC_URL)) + set(ARM_BRPC_VER + "1.1.0" + CACHE STRING "" FORCE) + set(ARM_BRPC_NAME + "arm_brpc" + CACHE STRING "" FORCE) +endif() -MESSAGE(STATUS "ARM_BRPC_NAME: ${ARM_BRPC_NAME}, ARM_BRPC_URL: ${ARM_BRPC_URL}") -SET(ARM_BRPC_PREFIX_DIR "${THIRD_PARTY_PATH}/arm_brpc") -SET(ARM_BRPC_PROJECT "extern_arm_brpc") -SET(ARM_BRPC_DOWNLOAD_DIR "${ARM_BRPC_PREFIX_DIR}/src/${ARM_BRPC_PROJECT}") -SET(ARM_BRPC_DST_DIR "output") -SET(ARM_BRPC_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") -SET(ARM_BRPC_INSTALL_DIR ${ARM_BRPC_INSTALL_ROOT}/arm_brpc/output) -SET(ARM_BRPC_ROOT ${ARM_BRPC_INSTALL_DIR}) -SET(ARM_BRPC_INC_DIR ${ARM_BRPC_ROOT}/include) -SET(ARM_BRPC_LIB_DIR ${ARM_BRPC_ROOT}/lib) -SET(ARM_BRPC_LIB ${ARM_BRPC_LIB_DIR}/libbrpc.a) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ARM_BRPC_ROOT}/lib") +message(STATUS "ARM_BRPC_NAME: ${ARM_BRPC_NAME}, ARM_BRPC_URL: ${ARM_BRPC_URL}") +set(ARM_BRPC_PREFIX_DIR "${THIRD_PARTY_PATH}/arm_brpc") +set(ARM_BRPC_PROJECT "extern_arm_brpc") +set(ARM_BRPC_DOWNLOAD_DIR "${ARM_BRPC_PREFIX_DIR}/src/${ARM_BRPC_PROJECT}") +set(ARM_BRPC_DST_DIR "output") +set(ARM_BRPC_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +set(ARM_BRPC_INSTALL_DIR ${ARM_BRPC_INSTALL_ROOT}/arm_brpc/output) +set(ARM_BRPC_ROOT ${ARM_BRPC_INSTALL_DIR}) +set(ARM_BRPC_INC_DIR ${ARM_BRPC_ROOT}/include) +set(ARM_BRPC_LIB_DIR ${ARM_BRPC_ROOT}/lib) +set(ARM_BRPC_LIB ${ARM_BRPC_LIB_DIR}/libbrpc.a) +set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ARM_BRPC_ROOT}/lib") -INCLUDE_DIRECTORIES(${ARM_BRPC_INSTALL_ROOT}/${ARM_BRPC_NAME}/output/include) +include_directories(${ARM_BRPC_INSTALL_ROOT}/${ARM_BRPC_NAME}/output/include) -FILE(WRITE ${ARM_BRPC_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(ARM_BRPC)\n" - "cmake_minimum_required(VERSION 3.0)\n" +file( + WRITE ${ARM_BRPC_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(ARM_BRPC)\n" "cmake_minimum_required(VERSION 3.0)\n" "install(DIRECTORY ${ARM_BRPC_DST_DIR} ${ARM_BRPC_DST_DIR} \n" " DESTINATION ${ARM_BRPC_NAME})\n") - -SET(ARM_BRPC_URL "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/output.tar.gz" CACHE STRING "" FORCE) + +set(ARM_BRPC_URL + "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/output.tar.gz" + CACHE STRING "" FORCE) ExternalProject_Add( - ${ARM_BRPC_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${ARM_BRPC_PREFIX_DIR} - DOWNLOAD_DIR ${ARM_BRPC_DOWNLOAD_DIR} - DOWNLOAD_COMMAND rm -rf output.tar.gz - && wget --no-check-certificate ${ARM_BRPC_URL} - && tar zxvf output.tar.gz - #DOWNLOAD_COMMAND cp /home/wangbin44/Paddle/build/output.tar.gz . - # && tar zxvf output.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ARM_BRPC_INSTALL_ROOT} - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ARM_BRPC_INSTALL_ROOT} - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${ARM_BRPC_LIB} -) + ${ARM_BRPC_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${ARM_BRPC_PREFIX_DIR} + DOWNLOAD_DIR ${ARM_BRPC_DOWNLOAD_DIR} + DOWNLOAD_COMMAND rm -rf output.tar.gz && wget --no-check-certificate + ${ARM_BRPC_URL} && tar zxvf output.tar.gz + #DOWNLOAD_COMMAND cp /home/wangbin44/Paddle/build/output.tar.gz . + # && tar zxvf output.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ARM_BRPC_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ARM_BRPC_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${ARM_BRPC_LIB}) -ADD_LIBRARY(arm_brpc STATIC IMPORTED GLOBAL) # 直接导入已经生成的库 -SET_PROPERTY(TARGET arm_brpc PROPERTY IMPORTED_LOCATION ${ARM_BRPC_LIB}) -ADD_DEPENDENCIES(arm_brpc ${ARM_BRPC_PROJECT}) +add_library(arm_brpc STATIC IMPORTED GLOBAL) # 直接导入已经生成的库 +set_property(TARGET arm_brpc PROPERTY IMPORTED_LOCATION ${ARM_BRPC_LIB}) +add_dependencies(arm_brpc ${ARM_BRPC_PROJECT}) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index d02f47142e775..3dbe7e6e8aa90 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -12,21 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. - #NOTE: Logic is from # https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt if(DEFINED ENV{ASCEND_CUSTOM_PATH}) - set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH}) + set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH}) else() - set(ASCEND_DIR /usr/local/Ascend) + set(ASCEND_DIR /usr/local/Ascend) endif() -if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h) +if(EXISTS + ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h) # It means CANN 20.2 + add_definitions(-DPADDLE_WITH_ASCEND_STRING) endif() - if(WITH_ASCEND OR WITH_ASCEND_CL) set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) @@ -36,28 +35,32 @@ if(WITH_ASCEND OR WITH_ASCEND_CL) set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64) set(STATIC_ACL_LIB ${ASCEND_ACL_DIR}) - set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR}) + set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} + ${ASCEND_ATC_DIR}) set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR}) set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) - set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(ATLAS_RUNTIME_INC_DIR + ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64) set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64) - set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR}) + set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} + ${ATLAS_ATC_DIR}) set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so) set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so) set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) - INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) - + include_directories(${ATLAS_RUNTIME_INC_DIR}) - ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) - SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) + add_library(ascend_ge SHARED IMPORTED GLOBAL) + set_property(TARGET ascend_ge PROPERTY IMPORTED_LOCATION + ${atlas_ge_runner_lib}) - ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL) - SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib}) + add_library(ascend_graph SHARED IMPORTED GLOBAL) + set_property(TARGET ascend_graph PROPERTY IMPORTED_LOCATION + ${atlas_graph_lib}) - ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL) - SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib}) + add_library(atlas_acl SHARED IMPORTED GLOBAL) + set_property(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib}) add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl) endif() @@ -73,52 +76,60 @@ if(WITH_ASCEND_CL) message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}") message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}") - INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR}) - INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR}) + include_directories(${FWKACLLIB_INC_DIR}) + include_directories(${ACLLIB_INC_DIR}) - ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL) - SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib}) + add_library(ascendcl SHARED IMPORTED GLOBAL) + set_property(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib}) - ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL) - SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib}) + add_library(ascend_hccl SHARED IMPORTED GLOBAL) + set_property(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib}) - ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL) - SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib}) + add_library(acl_op_compiler SHARED IMPORTED GLOBAL) + set_property(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION + ${acl_op_compiler_lib}) add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler) endif() -if (WITH_ASCEND_CL) -macro(find_ascend_toolkit_version ascend_toolkit_version_info) +if(WITH_ASCEND_CL) + macro(find_ascend_toolkit_version ascend_toolkit_version_info) file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS) - string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}") - string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}") - string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION}) - STRING(SUBSTRING "${CANN_VERSION}000" 0 6 CANN_VERSION) + string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" + ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}") + string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" "\\1" + ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}") + string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION + ${ASCEND_TOOLKIT_VERSION}) + string(SUBSTRING "${CANN_VERSION}000" 0 6 CANN_VERSION) add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}") if(NOT ASCEND_TOOLKIT_VERSION) - set(ASCEND_TOOLKIT_VERSION "???") + set(ASCEND_TOOLKIT_VERSION "???") else() - message(STATUS "Current Ascend Toolkit version is ${ASCEND_TOOLKIT_VERSION}") + message( + STATUS "Current Ascend Toolkit version is ${ASCEND_TOOLKIT_VERSION}") endif() -endmacro() + endmacro() -macro(find_ascend_driver_version ascend_driver_version_info) + macro(find_ascend_driver_version ascend_driver_version_info) file(READ ${ascend_driver_version_info} ASCEND_DRIVER_VERSION_CONTENTS) - string(REGEX MATCH "Version=([0-9]+\.[0-9]+\.[0-9]+)" ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION_CONTENTS}") - string(REGEX REPLACE "Version=([0-9]+\.[0-9]+\.[0-9]+)" "\\1" ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION}") + string(REGEX MATCH "Version=([0-9]+\.[0-9]+\.[0-9]+)" ASCEND_DRIVER_VERSION + "${ASCEND_DRIVER_VERSION_CONTENTS}") + string(REGEX REPLACE "Version=([0-9]+\.[0-9]+\.[0-9]+)" "\\1" + ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION}") if(NOT ASCEND_DRIVER_VERSION) - set(ASCEND_DRIVER_VERSION "???") + set(ASCEND_DRIVER_VERSION "???") else() - message(STATUS "Current Ascend Driver version is ${ASCEND_DRIVER_VERSION}") + message( + STATUS "Current Ascend Driver version is ${ASCEND_DRIVER_VERSION}") endif() -endmacro() + endmacro() -if (WITH_ARM) - set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/arm64-linux) -else() - set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/x86_64-linux) -endif() + if(WITH_ARM) + set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/arm64-linux) + else() + set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/x86_64-linux) + endif() -find_ascend_toolkit_version(${ASCEND_TOOLKIT_DIR}/ascend_toolkit_install.info) -find_ascend_driver_version(${ASCEND_DIR}/driver/version.info) + find_ascend_toolkit_version(${ASCEND_TOOLKIT_DIR}/ascend_toolkit_install.info) + find_ascend_driver_version(${ASCEND_DIR}/driver/version.info) endif() diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index e47b608341bee..810796831e23e 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -14,7 +14,7 @@ include(ExternalProject) -set(BOOST_PROJECT "extern_boost") +set(BOOST_PROJECT "extern_boost") # To release PaddlePaddle as a pip package, we have to follow the # manylinux1 standard, which features as old Linux kernels and # compilers as possible and recommends CentOS 5. Indeed, the earliest @@ -22,36 +22,41 @@ set(BOOST_PROJECT "extern_boost") # version of boost, say, 1.66.0, doesn't build on CentOS 6. We # checked that the devtools package of CentOS 6 installs boost 1.41.0. # So we use 1.41.0 here. -set(BOOST_VER "1.41.0") +set(BOOST_VER "1.41.0") # boost_1_41_0_2021_10.tar.gz is almost the same with boost_1_41_0.tar.gz, # except in visualc.hpp i comment a warning of "unknown compiler version", # so if you need to change boost, you may need to block the warning similarly. -set(BOOST_TAR "boost_1_41_0_2021_10" CACHE STRING "" FORCE) -set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) +set(BOOST_TAR + "boost_1_41_0_2021_10" + CACHE STRING "" FORCE) +set(BOOST_URL + "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" + CACHE STRING "" FORCE) -MESSAGE(STATUS "BOOST_VERSION: ${BOOST_VER}, BOOST_URL: ${BOOST_URL}") +message(STATUS "BOOST_VERSION: ${BOOST_VER}, BOOST_URL: ${BOOST_URL}") set(BOOST_PREFIX_DIR ${THIRD_PARTY_PATH}/boost) -set(BOOST_INCLUDE_DIR "${THIRD_PARTY_PATH}/boost/src/extern_boost" CACHE PATH "boost include directory." FORCE) +set(BOOST_INCLUDE_DIR + "${THIRD_PARTY_PATH}/boost/src/extern_boost" + CACHE PATH "boost include directory." FORCE) set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) include_directories(${BOOST_INCLUDE_DIR}) if(WIN32 AND MSVC_VERSION GREATER_EQUAL 1600) - add_definitions(-DBOOST_HAS_STATIC_ASSERT) + add_definitions(-DBOOST_HAS_STATIC_ASSERT) endif() ExternalProject_Add( - ${BOOST_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - URL ${BOOST_URL} - URL_MD5 51be7cc203628dc0848e97eee32d79e3 - PREFIX ${BOOST_PREFIX_DIR} - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - UPDATE_COMMAND "" - ) + ${BOOST_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${BOOST_URL} + URL_MD5 51be7cc203628dc0848e97eee32d79e3 + PREFIX ${BOOST_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + UPDATE_COMMAND "") add_library(boost INTERFACE) diff --git a/cmake/external/box_ps.cmake b/cmake/external/box_ps.cmake index 85e1f94fd2c67..2bb1fe0a0d1b0 100644 --- a/cmake/external/box_ps.cmake +++ b/cmake/external/box_ps.cmake @@ -12,48 +12,53 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(BOX_PS_PROJECT "extern_box_ps") -IF((NOT DEFINED BOX_PS_VER) OR (NOT DEFINED BOX_PS_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(BOX_PS_VER "0.1.1" CACHE STRING "" FORCE) - SET(BOX_PS_NAME "box_ps" CACHE STRING "" FORCE) - SET(BOX_PS_URL "http://box-ps.gz.bcebos.com/box_ps.tar.gz" CACHE STRING "" FORCE) -ENDIF() -MESSAGE(STATUS "BOX_PS_NAME: ${BOX_PS_NAME}, BOX_PS_URL: ${BOX_PS_URL}") -SET(BOX_PS_SOURCE_DIR "${THIRD_PARTY_PATH}/box_ps") -SET(BOX_PS_DOWNLOAD_DIR "${BOX_PS_SOURCE_DIR}/src/${BOX_PS_PROJECT}") -SET(BOX_PS_DST_DIR "box_ps") -SET(BOX_PS_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") -SET(BOX_PS_INSTALL_DIR ${BOX_PS_INSTALL_ROOT}/${BOX_PS_DST_DIR}) -SET(BOX_PS_ROOT ${BOX_PS_INSTALL_DIR}) -SET(BOX_PS_INC_DIR ${BOX_PS_ROOT}/include) -SET(BOX_PS_LIB_DIR ${BOX_PS_ROOT}/lib) -SET(BOX_PS_LIB ${BOX_PS_LIB_DIR}/libbox_ps.so) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${BOX_PS_ROOT}/lib") +set(BOX_PS_PROJECT "extern_box_ps") +if((NOT DEFINED BOX_PS_VER) OR (NOT DEFINED BOX_PS_URL)) + message(STATUS "use pre defined download url") + set(BOX_PS_VER + "0.1.1" + CACHE STRING "" FORCE) + set(BOX_PS_NAME + "box_ps" + CACHE STRING "" FORCE) + set(BOX_PS_URL + "http://box-ps.gz.bcebos.com/box_ps.tar.gz" + CACHE STRING "" FORCE) +endif() +message(STATUS "BOX_PS_NAME: ${BOX_PS_NAME}, BOX_PS_URL: ${BOX_PS_URL}") +set(BOX_PS_SOURCE_DIR "${THIRD_PARTY_PATH}/box_ps") +set(BOX_PS_DOWNLOAD_DIR "${BOX_PS_SOURCE_DIR}/src/${BOX_PS_PROJECT}") +set(BOX_PS_DST_DIR "box_ps") +set(BOX_PS_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +set(BOX_PS_INSTALL_DIR ${BOX_PS_INSTALL_ROOT}/${BOX_PS_DST_DIR}) +set(BOX_PS_ROOT ${BOX_PS_INSTALL_DIR}) +set(BOX_PS_INC_DIR ${BOX_PS_ROOT}/include) +set(BOX_PS_LIB_DIR ${BOX_PS_ROOT}/lib) +set(BOX_PS_LIB ${BOX_PS_LIB_DIR}/libbox_ps.so) +set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${BOX_PS_ROOT}/lib") -INCLUDE_DIRECTORIES(${BOX_PS_INC_DIR}) -FILE(WRITE ${BOX_PS_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(BOX_PS)\n" - "cmake_minimum_required(VERSION 3.0)\n" +include_directories(${BOX_PS_INC_DIR}) +file( + WRITE ${BOX_PS_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(BOX_PS)\n" "cmake_minimum_required(VERSION 3.0)\n" "install(DIRECTORY ${BOX_PS_NAME}/include ${BOX_PS_NAME}/lib \n" " DESTINATION ${BOX_PS_DST_DIR})\n") ExternalProject_Add( - ${BOX_PS_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${BOX_PS_SOURCE_DIR} - DOWNLOAD_DIR ${BOX_PS_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${BOX_PS_URL} -c -q -O ${BOX_PS_NAME}.tar.gz - && tar zxvf ${BOX_PS_NAME}.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${BOX_PS_INSTALL_ROOT} - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BOX_PS_INSTALL_ROOT} - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${BOX_PS_LIB} -) -ADD_LIBRARY(box_ps SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET box_ps PROPERTY IMPORTED_LOCATION ${BOX_PS_LIB}) -ADD_DEPENDENCIES(box_ps ${BOX_PS_PROJECT}) + ${BOX_PS_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${BOX_PS_SOURCE_DIR} + DOWNLOAD_DIR ${BOX_PS_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${BOX_PS_URL} -c -q -O + ${BOX_PS_NAME}.tar.gz && tar zxvf ${BOX_PS_NAME}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${BOX_PS_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BOX_PS_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${BOX_PS_LIB}) +add_library(box_ps SHARED IMPORTED GLOBAL) +set_property(TARGET box_ps PROPERTY IMPORTED_LOCATION ${BOX_PS_LIB}) +add_dependencies(box_ps ${BOX_PS_PROJECT}) diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index c891708751aa8..4434e3fbed180 100755 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -12,66 +12,80 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) find_package(OpenSSL REQUIRED) message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY}) message(STATUS "crypto:" ${OPENSSL_CRYPTO_LIBRARY}) -ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY}) +add_library(ssl SHARED IMPORTED GLOBAL) +set_property(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY}) -ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY}) +add_library(crypto SHARED IMPORTED GLOBAL) +set_property(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY}) -SET(BRPC_PREFIX_DIR ${THIRD_PARTY_PATH}/brpc) -SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) -SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE) -SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc library." FORCE) +set(BRPC_PREFIX_DIR ${THIRD_PARTY_PATH}/brpc) +set(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) +set(BRPC_INCLUDE_DIR + "${BRPC_INSTALL_DIR}/include" + CACHE PATH "brpc include directory." FORCE) +set(BRPC_LIBRARIES + "${BRPC_INSTALL_DIR}/lib/libbrpc.a" + CACHE FILEPATH "brpc library." FORCE) -INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) +include_directories(${BRPC_INCLUDE_DIR}) # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args -set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog") +set(prefix_path + "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog" +) # If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF ExternalProject_Add( - extern_brpc - ${EXTERNAL_PROJECT_LOG_ARGS} - # TODO(gongwb): change to de newst repo when they changed - GIT_REPOSITORY "https://github.com/wangjiawei04/brpc" - #GIT_REPOSITORY "https://github.com/ziyoujiyi/brpc" # ssl error in the previous repo(can be mannual fixed) - GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e" - PREFIX ${BRPC_PREFIX_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_PREFIX_PATH=${prefix_path} - -DWITH_GLOG=ON - -DIOBUF_WITH_HUGE_BLOCK=ON - -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA} - ${EXTERNAL_OPTIONAL_ARGS} - LIST_SEPARATOR | - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${BRPC_LIBRARIES} -) + extern_brpc + ${EXTERNAL_PROJECT_LOG_ARGS} + # TODO(gongwb): change to de newst repo when they changed + GIT_REPOSITORY "https://github.com/wangjiawei04/brpc" + #GIT_REPOSITORY "https://github.com/ziyoujiyi/brpc" # ssl error in the previous repo(can be mannual fixed) + GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e" + PREFIX ${BRPC_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_PREFIX_PATH=${prefix_path} + -DWITH_GLOG=ON + -DIOBUF_WITH_HUGE_BLOCK=ON + -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA} + ${EXTERNAL_OPTIONAL_ARGS} + LIST_SEPARATOR | + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${BRPC_LIBRARIES}) # ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy) -ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog snappy) -ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) -ADD_DEPENDENCIES(brpc extern_brpc) +add_dependencies( + extern_brpc + protobuf + ssl + crypto + leveldb + gflags + glog + snappy) +add_library(brpc STATIC IMPORTED GLOBAL) +set_property(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) +add_dependencies(brpc extern_brpc) add_definitions(-DBRPC_WITH_GLOG) -LIST(APPEND external_project_dependencies brpc) +list(APPEND external_project_dependencies brpc) diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake index 2ec9a3faa07b7..5dd84657c8605 100644 --- a/cmake/external/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -if (NOT WITH_CINN) +if(NOT WITH_CINN) return() endif() @@ -27,36 +27,33 @@ add_definitions(-w) include(ExternalProject) set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN) set(CINN_GIT_TAG release/v0.2) -set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION} - -DWITH_CUDA=${WITH_GPU} - -DWITH_CUDNN=${WITH_GPU} - -DWITH_MKL_CBLAS=${WITH_MKL} - -DWITH_MKLDNN=${WITH_MKL} - -DPUBLISH_LIBS=ON - -DWITH_TESTING=ON -) +set(CINN_OPTIONAL_ARGS + -DPY_VERSION=${PY_VERSION} + -DWITH_CUDA=${WITH_GPU} + -DWITH_CUDNN=${WITH_GPU} + -DWITH_MKL_CBLAS=${WITH_MKL} + -DWITH_MKLDNN=${WITH_MKL} + -DPUBLISH_LIBS=ON + -DWITH_TESTING=ON) set(CINN_BUILD_COMMAND $(MAKE) cinnapi -j) ExternalProject_Add( external_cinn ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/CINN.git" - GIT_TAG ${CINN_GIT_TAG} - PREFIX ${CINN_PREFIX_DIR} - BUILD_COMMAND ${CINN_BUILD_COMMAND} - INSTALL_COMMAND "" - CMAKE_ARGS ${CINN_OPTIONAL_ARGS}) + GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/CINN.git" + GIT_TAG ${CINN_GIT_TAG} + PREFIX ${CINN_PREFIX_DIR} + BUILD_COMMAND ${CINN_BUILD_COMMAND} + INSTALL_COMMAND "" + CMAKE_ARGS ${CINN_OPTIONAL_ARGS}) - - -ExternalProject_Get_property(external_cinn BINARY_DIR) -ExternalProject_Get_property(external_cinn SOURCE_DIR) +ExternalProject_Get_Property(external_cinn BINARY_DIR) +ExternalProject_Get_Property(external_cinn SOURCE_DIR) set(CINN_BINARY_DIR ${BINARY_DIR}) set(CINN_SOURCE_DIR ${SOURCE_DIR}) message(STATUS "CINN BINARY_DIR: ${CINN_BINARY_DIR}") message(STATUS "CINN SOURCE_DIR: ${CINN_SOURCE_DIR}") - ###################################### # Add CINN's dependencies header files ###################################### @@ -82,6 +79,7 @@ set(CINN_LIB_LOCATION "${CINN_BINARY_DIR}/dist/cinn/lib") set(CINN_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/cinn/include") add_library(cinn SHARED IMPORTED GLOBAL) -set_target_properties(cinn PROPERTIES IMPORTED_LOCATION "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}") +set_target_properties(cinn PROPERTIES IMPORTED_LOCATION + "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}") include_directories(${CINN_INCLUDE_DIR}) add_dependencies(cinn external_cinn) diff --git a/cmake/external/concurrentqueue.cmake b/cmake/external/concurrentqueue.cmake index 9e4331ae6fdea..0ff3612efed4b 100644 --- a/cmake/external/concurrentqueue.cmake +++ b/cmake/external/concurrentqueue.cmake @@ -16,27 +16,32 @@ include(ExternalProject) set(CONCURRENTQUEUE_PROJECT "extern_concurrentqueue") set(CONCURRENTQUEUE_VER "v1.0.3") -SET(CONCURRENTQUEUE_URL_MD5 118e5bb661b567634647312991e10222) -set(CONCURRENTQUEUE_PREFIX_URL "https://github.com/cameron314/concurrentqueue/archive/refs/tags") -set(CONCURRENTQUEUE_URL "${CONCURRENTQUEUE_PREFIX_URL}/${CONCURRENTQUEUE_VER}.tar.gz") +set(CONCURRENTQUEUE_URL_MD5 118e5bb661b567634647312991e10222) +set(CONCURRENTQUEUE_PREFIX_URL + "https://github.com/cameron314/concurrentqueue/archive/refs/tags") +set(CONCURRENTQUEUE_URL + "${CONCURRENTQUEUE_PREFIX_URL}/${CONCURRENTQUEUE_VER}.tar.gz") -MESSAGE(STATUS "CONCURRENTQUEUE_VERSION: ${CONCURRENTQUEUE_VER}, CONCURRENTQUEUE_URL: ${CONCURRENTQUEUE_URL}") +message( + STATUS + "CONCURRENTQUEUE_VERSION: ${CONCURRENTQUEUE_VER}, CONCURRENTQUEUE_URL: ${CONCURRENTQUEUE_URL}" +) set(CONCURRENTQUEUE_PREFIX_DIR ${THIRD_PARTY_PATH}/concurrentqueue) set(CONCURRENTQUEUE_SOURCE_DIR ${THIRD_PARTY_PATH}/concurrentqueue/src/) -set(CONCURRENTQUEUE_INCLUDE_DIR "${CONCURRENTQUEUE_SOURCE_DIR}/extern_concurrentqueue") +set(CONCURRENTQUEUE_INCLUDE_DIR + "${CONCURRENTQUEUE_SOURCE_DIR}/extern_concurrentqueue") ExternalProject_Add( - ${CONCURRENTQUEUE_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - URL ${CONCURRENTQUEUE_URL} - URL_MD5 ${CONCURRENTQUEUE_URL_MD5} - PREFIX ${CONCURRENTQUEUE_PREFIX_DIR} - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - UPDATE_COMMAND "" - ) + ${CONCURRENTQUEUE_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${CONCURRENTQUEUE_URL} + URL_MD5 ${CONCURRENTQUEUE_URL_MD5} + PREFIX ${CONCURRENTQUEUE_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + UPDATE_COMMAND "") include_directories(${CONCURRENTQUEUE_INCLUDE_DIR}) diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake index 27a013c1763a7..ff4d3b5c9ea9e 100644 --- a/cmake/external/cryptopp.cmake +++ b/cmake/external/cryptopp.cmake @@ -12,68 +12,77 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(CRYPTOPP_PREFIX_DIR ${THIRD_PARTY_PATH}/cryptopp) -SET(CRYPTOPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cryptopp) -SET(CRYPTOPP_INCLUDE_DIR "${CRYPTOPP_INSTALL_DIR}/include" CACHE PATH "cryptopp include directory." FORCE) -SET(CRYPTOPP_REPOSITORY ${GIT_URL}/weidai11/cryptopp.git) -SET(CRYPTOPP_TAG CRYPTOPP_8_2_0) +set(CRYPTOPP_PREFIX_DIR ${THIRD_PARTY_PATH}/cryptopp) +set(CRYPTOPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cryptopp) +set(CRYPTOPP_INCLUDE_DIR + "${CRYPTOPP_INSTALL_DIR}/include" + CACHE PATH "cryptopp include directory." FORCE) +set(CRYPTOPP_REPOSITORY ${GIT_URL}/weidai11/cryptopp.git) +set(CRYPTOPP_TAG CRYPTOPP_8_2_0) -IF(WIN32) - SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE) +if(WIN32) + set(CRYPTOPP_LIBRARIES + "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" + CACHE FILEPATH "cryptopp library." FORCE) # There is a compilation parameter "/FI\"winapifamily.h\"" or "/FIwinapifamily.h" can't be used correctly # with Ninja on Windows. The only difference between the patch file and original # file is that the compilation parameters are changed to '/nologo'. This # patch command can be removed when upgrading to a higher version. if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(CRYPTOPP_PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different "${PADDLE_SOURCE_DIR}/patches/cryptopp/CMakeLists.txt" "/") + set(CRYPTOPP_PATCH_COMMAND + ${CMAKE_COMMAND} -E copy_if_different + "${PADDLE_SOURCE_DIR}/patches/cryptopp/CMakeLists.txt" "/") endif() -ELSE(WIN32) - SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE) -ENDIF(WIN32) +else(WIN32) + set(CRYPTOPP_LIBRARIES + "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" + CACHE FILEPATH "cryptopp library." FORCE) +endif(WIN32) -IF(APPLE AND WITH_ARM) - SET(CMAKE_CXX_FLAGS "-DCRYPTOPP_ARM_CRC32_AVAILABLE=0") -ENDIF() +if(APPLE AND WITH_ARM) + set(CMAKE_CXX_FLAGS "-DCRYPTOPP_ARM_CRC32_AVAILABLE=0") +endif() -set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS} - -DBUILD_SHARED=ON - -DBUILD_STATIC=ON - -DBUILD_TESTING=OFF - -DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib - -DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR} - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -) +set(CRYPTOPP_CMAKE_ARGS + ${COMMON_CMAKE_ARGS} + -DBUILD_SHARED=ON + -DBUILD_STATIC=ON + -DBUILD_TESTING=OFF + -DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib + -DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}) -INCLUDE_DIRECTORIES(${CRYPTOPP_INCLUDE_DIR}) +include_directories(${CRYPTOPP_INCLUDE_DIR}) ExternalProject_Add( - extern_cryptopp - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${CRYPTOPP_REPOSITORY} - GIT_TAG ${CRYPTOPP_TAG} - PREFIX ${CRYPTOPP_PREFIX_DIR} - UPDATE_COMMAND "" - PATCH_COMMAND - COMMAND ${CMAKE_COMMAND} -E remove_directory "/cmake/" - COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "/cmake" - COMMAND cd "/cmake" && git checkout tags/${CRYPTOPP_TAG} -b ${CRYPTOPP_TAG} - COMMAND ${CMAKE_COMMAND} -E copy_directory "/cmake/" "/" - COMMAND ${CRYPTOPP_PATCH_COMMAND} - INSTALL_DIR ${CRYPTOPP_INSTALL_DIR} - CMAKE_ARGS ${CRYPTOPP_CMAKE_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CRYPTOPP_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${CRYPTOPP_LIBRARIES} -) + extern_cryptopp + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${CRYPTOPP_REPOSITORY} + GIT_TAG ${CRYPTOPP_TAG} + PREFIX ${CRYPTOPP_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND + COMMAND ${CMAKE_COMMAND} -E remove_directory "/cmake/" + COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "/cmake" + COMMAND cd "/cmake" && git checkout tags/${CRYPTOPP_TAG} -b + ${CRYPTOPP_TAG} + COMMAND ${CMAKE_COMMAND} -E copy_directory "/cmake/" + "/" + COMMAND ${CRYPTOPP_PATCH_COMMAND} + INSTALL_DIR ${CRYPTOPP_INSTALL_DIR} + CMAKE_ARGS ${CRYPTOPP_CMAKE_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${CRYPTOPP_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${CRYPTOPP_LIBRARIES}) -ADD_LIBRARY(cryptopp STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET cryptopp PROPERTY IMPORTED_LOCATION ${CRYPTOPP_LIBRARIES}) -ADD_DEPENDENCIES(cryptopp extern_cryptopp) +add_library(cryptopp STATIC IMPORTED GLOBAL) +set_property(TARGET cryptopp PROPERTY IMPORTED_LOCATION ${CRYPTOPP_LIBRARIES}) +add_dependencies(cryptopp extern_cryptopp) diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake index f263086e8bef8..04fad252dac88 100644 --- a/cmake/external/cub.cmake +++ b/cmake/external/cub.cmake @@ -14,32 +14,32 @@ include(ExternalProject) -# Note(zhouwei): extern_cub has code __FILE_, If the path of extern_cub is changed, -# it will effect about 30+ cu files sccache hit and slow compile speed on windows. +# Note(zhouwei): extern_cub has code __FILE_, If the path of extern_cub is changed, +# it will effect about 30+ cu files sccache hit and slow compile speed on windows. # Therefore, a fixed CUB_PATH will be input to increase the sccache hit rate. -set(CUB_PATH "${THIRD_PARTY_PATH}/cub" CACHE STRING "A path setting for external_cub path.") -set(CUB_PREFIX_DIR ${CUB_PATH}) +set(CUB_PATH + "${THIRD_PARTY_PATH}/cub" + CACHE STRING "A path setting for external_cub path.") +set(CUB_PREFIX_DIR ${CUB_PATH}) -set(CUB_REPOSITORY ${GIT_URL}/NVlabs/cub.git) -set(CUB_TAG 1.8.0) +set(CUB_REPOSITORY ${GIT_URL}/NVlabs/cub.git) +set(CUB_TAG 1.8.0) -SET(CUB_INCLUDE_DIR ${CUB_PREFIX_DIR}/src/extern_cub) +set(CUB_INCLUDE_DIR ${CUB_PREFIX_DIR}/src/extern_cub) message("CUB_INCLUDE_DIR is ${CUB_INCLUDE_DIR}") include_directories(${CUB_INCLUDE_DIR}) ExternalProject_Add( extern_cub - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${CUB_REPOSITORY} - GIT_TAG ${CUB_TAG} - PREFIX ${CUB_PREFIX_DIR} - UPDATE_COMMAND "" + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${CUB_REPOSITORY} + GIT_TAG ${CUB_TAG} + PREFIX ${CUB_PREFIX_DIR} + UPDATE_COMMAND "" CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "") add_library(cub INTERFACE) diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake index 711d6c5b10aac..9c22ee89d48ea 100644 --- a/cmake/external/dgc.cmake +++ b/cmake/external/dgc.cmake @@ -12,32 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(DGC_PREFIX_DIR "${THIRD_PARTY_PATH}/dgc") -SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc/src/extern_dgc") -SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc") -SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE) -SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE) -SET(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz") -INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR}) +set(DGC_PREFIX_DIR "${THIRD_PARTY_PATH}/dgc") +set(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc/src/extern_dgc") +set(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc") +set(DGC_INCLUDE_DIR + "${DGC_INSTALL_DIR}/include" + CACHE PATH "dgc include directory." FORCE) +set(DGC_LIBRARIES + "${DGC_INSTALL_DIR}/lib/libdgc.a" + CACHE FILEPATH "dgc library." FORCE) +set(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz") +include_directories(${DGC_INCLUDE_DIR}) ExternalProject_Add( - extern_dgc - ${EXTERNAL_PROJECT_LOG_ARGS} - URL ${DGC_URL} - URL_MD5 "94e6fa1bc97169d0e1aad44570fe3251" - PREFIX "${DGC_PREFIX_DIR}" - CONFIGURE_COMMAND "" - BUILD_COMMAND make -j $(nproc) - INSTALL_COMMAND mkdir -p ${DGC_INSTALL_DIR}/lib/ ${DGC_INCLUDE_DIR}/dgc - && cp ${DGC_SOURCES_DIR}/build/lib/libdgc.a ${DGC_LIBRARIES} - && cp ${DGC_SOURCES_DIR}/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/ - BUILD_IN_SOURCE 1 - BUILD_BYPRODUCTS ${DGC_LIBRARIES} -) - -ADD_LIBRARY(dgc STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES}) -ADD_DEPENDENCIES(dgc extern_dgc) + extern_dgc + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${DGC_URL} + URL_MD5 "94e6fa1bc97169d0e1aad44570fe3251" + PREFIX "${DGC_PREFIX_DIR}" + CONFIGURE_COMMAND "" + BUILD_COMMAND make -j $(nproc) + INSTALL_COMMAND + mkdir -p ${DGC_INSTALL_DIR}/lib/ ${DGC_INCLUDE_DIR}/dgc && cp + ${DGC_SOURCES_DIR}/build/lib/libdgc.a ${DGC_LIBRARIES} && cp + ${DGC_SOURCES_DIR}/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/ + BUILD_IN_SOURCE 1 + BUILD_BYPRODUCTS ${DGC_LIBRARIES}) +add_library(dgc STATIC IMPORTED GLOBAL) +set_property(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES}) +add_dependencies(dgc extern_dgc) diff --git a/cmake/external/dirent.cmake b/cmake/external/dirent.cmake index 59caa43741595..51d8eaac29e7d 100644 --- a/cmake/external/dirent.cmake +++ b/cmake/external/dirent.cmake @@ -15,30 +15,28 @@ # Note(chenxin33): dirent.h is only exist in Linux, so get it from github when build in windows. # use dirent tag v1.23.2 on 09/05//2018 https://github.com/tronkko/dirent.git -INCLUDE (ExternalProject) +include(ExternalProject) -SET(DIRENT_PREFIX_DIR ${THIRD_PARTY_PATH}/dirent) -SET(DIRENT_INCLUDE_DIR ${THIRD_PARTY_PATH}/dirent/src/extern_dirent/include) +set(DIRENT_PREFIX_DIR ${THIRD_PARTY_PATH}/dirent) +set(DIRENT_INCLUDE_DIR ${THIRD_PARTY_PATH}/dirent/src/extern_dirent/include) include_directories(${DIRENT_INCLUDE_DIR}) -set(DIRENT_REPOSITORY ${GIT_URL}/tronkko/dirent) -set(DIRENT_TAG 1.23.2) +set(DIRENT_REPOSITORY ${GIT_URL}/tronkko/dirent) +set(DIRENT_TAG 1.23.2) ExternalProject_Add( extern_dirent - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${DIRENT_REPOSITORY} - GIT_TAG ${DIRENT_TAG} - PREFIX ${DIRENT_PREFIX_DIR} - UPDATE_COMMAND "" + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${DIRENT_REPOSITORY} + GIT_TAG ${DIRENT_TAG} + PREFIX ${DIRENT_PREFIX_DIR} + UPDATE_COMMAND "" CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "") add_library(dirent INTERFACE) -add_dependencies(dirent extern_dirent) \ No newline at end of file +add_dependencies(dirent extern_dirent) diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake index 1aeea752e6678..727202a434683 100644 --- a/cmake/external/dlpack.cmake +++ b/cmake/external/dlpack.cmake @@ -17,24 +17,22 @@ include(ExternalProject) set(DLPACK_PREFIX_DIR ${THIRD_PARTY_PATH}/dlpack) set(DLPACK_REPOSITORY ${GIT_URL}/dmlc/dlpack.git) -set(DLPACK_TAG v0.4) +set(DLPACK_TAG v0.4) -set(DLPACK_INCLUDE_DIR ${THIRD_PARTY_PATH}/dlpack/src/extern_dlpack/include) +set(DLPACK_INCLUDE_DIR ${THIRD_PARTY_PATH}/dlpack/src/extern_dlpack/include) include_directories(${DLPACK_INCLUDE_DIR}) ExternalProject_Add( extern_dlpack - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${DLPACK_REPOSITORY} - GIT_TAG ${DLPACK_TAG} - PREFIX ${DLPACK_PREFIX_DIR} - UPDATE_COMMAND "" + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${DLPACK_REPOSITORY} + GIT_TAG ${DLPACK_TAG} + PREFIX ${DLPACK_PREFIX_DIR} + UPDATE_COMMAND "" CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "") add_library(dlpack INTERFACE) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index f8bac96b68fa5..443b7aa7d56b7 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -18,39 +18,43 @@ include(ExternalProject) set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3) set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3) set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git) -set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee) +set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee) if(WIN32) - add_definitions(-DEIGEN_STRONG_INLINE=inline) + add_definitions(-DEIGEN_STRONG_INLINE=inline) elseif(LINUX) - if(WITH_ROCM) - # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC - # which will cause compiler error of using __host__ funciont in __host__ __device__ - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst) - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorReductionGpu.h native_src1) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h native_dst1) - set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst} && cp ${native_src1} ${native_dst1}) - endif() + if(WITH_ROCM) + # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC + # which will cause compiler error of using __host__ funciont in __host__ __device__ + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src) + file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h + native_dst) + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorReductionGpu.h + native_src1) + file( + TO_NATIVE_PATH + ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h + native_dst1) + set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst} && cp ${native_src1} + ${native_dst1}) + endif() endif() set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}) -INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) +include_directories(${EIGEN_INCLUDE_DIR}) ExternalProject_Add( - extern_eigen3 - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${EIGEN_REPOSITORY} - GIT_TAG ${EIGEN_TAG} - PREFIX ${EIGEN_PREFIX_DIR} - UPDATE_COMMAND "" - PATCH_COMMAND ${EIGEN_PATCH_COMMAND} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) + extern_eigen3 + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${EIGEN_REPOSITORY} + GIT_TAG ${EIGEN_TAG} + PREFIX ${EIGEN_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND ${EIGEN_PATCH_COMMAND} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "") add_library(eigen3 INTERFACE) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 056ff32c8c0d9..783e1c0d442f7 100755 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -12,90 +12,94 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(GFLAGS_PREFIX_DIR ${THIRD_PARTY_PATH}/gflags) -SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags) -SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE) +set(GFLAGS_PREFIX_DIR ${THIRD_PARTY_PATH}/gflags) +set(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags) +set(GFLAGS_INCLUDE_DIR + "${GFLAGS_INSTALL_DIR}/include" + CACHE PATH "gflags include directory." FORCE) set(GFLAGS_REPOSITORY ${GIT_URL}/gflags/gflags.git) set(GFLAGS_TAG "v2.2.2") -IF(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) -ELSE(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) +if(WIN32) + set(GFLAGS_LIBRARIES + "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" + CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) +else(WIN32) + set(GFLAGS_LIBRARIES + "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" + CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) set(BUILD_COMMAND $(MAKE) --silent) set(INSTALL_COMMAND $(MAKE) install) -ENDIF(WIN32) +endif(WIN32) -INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) +include_directories(${GFLAGS_INCLUDE_DIR}) if(WITH_ARM_BRPC) - SET(ARM_GFLAGS_URL "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_gflags.tar.gz" CACHE STRING "" FORCE) - set(GFLAGS_SOURCE_DIR ${THIRD_PARTY_PATH}/gflags/src/extern_gflags) - FILE(WRITE ${GFLAGS_SOURCE_DIR}/CMakeLists.txt - "PROJECT(ARM_GFLAGS)\n" - "cmake_minimum_required(VERSION 3.0)\n" + set(ARM_GFLAGS_URL + "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_gflags.tar.gz" + CACHE STRING "" FORCE) + set(GFLAGS_SOURCE_DIR ${THIRD_PARTY_PATH}/gflags/src/extern_gflags) + file( + WRITE ${GFLAGS_SOURCE_DIR}/CMakeLists.txt + "PROJECT(ARM_GFLAGS)\n" "cmake_minimum_required(VERSION 3.0)\n" "install(DIRECTORY arm_gflags/bin arm_gflags/include arm_gflags/lib \n" " DESTINATION . USE_SOURCE_PERMISSIONS)\n") - ExternalProject_Add( - extern_gflags - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - PREFIX ${GFLAGS_PREFIX_DIR} - DOWNLOAD_DIR ${GFLAGS_SOURCE_DIR} - DOWNLOAD_COMMAND rm -rf arm_gflags.tar.gz && - wget --no-check-certificate ${ARM_GFLAGS_URL} - && tar zxvf arm_gflags.tar.gz - #DOWNLOAD_COMMAND cp /home/wangbin44/Paddle/build/arm_gflags.tar.gz . - # && tar zxvf arm_gflags.tar.gz - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES} - ) + ExternalProject_Add( + extern_gflags + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + PREFIX ${GFLAGS_PREFIX_DIR} + DOWNLOAD_DIR ${GFLAGS_SOURCE_DIR} + DOWNLOAD_COMMAND rm -rf arm_gflags.tar.gz && wget --no-check-certificate + ${ARM_GFLAGS_URL} && tar zxvf arm_gflags.tar.gz + #DOWNLOAD_COMMAND cp /home/wangbin44/Paddle/build/arm_gflags.tar.gz . + # && tar zxvf arm_gflags.tar.gz + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES}) else() - ExternalProject_Add( - extern_gflags - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${GFLAGS_REPOSITORY} - GIT_TAG ${GFLAGS_TAG} - PREFIX ${GFLAGS_PREFIX_DIR} - UPDATE_COMMAND "" - BUILD_COMMAND ${BUILD_COMMAND} - INSTALL_COMMAND ${INSTALL_COMMAND} - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DBUILD_STATIC_LIBS=ON - -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES} - ) + ExternalProject_Add( + extern_gflags + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${GFLAGS_REPOSITORY} + GIT_TAG ${GFLAGS_TAG} + PREFIX ${GFLAGS_PREFIX_DIR} + UPDATE_COMMAND "" + BUILD_COMMAND ${BUILD_COMMAND} + INSTALL_COMMAND ${INSTALL_COMMAND} + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DBUILD_STATIC_LIBS=ON + -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES}) endif() -ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) -ADD_DEPENDENCIES(gflags extern_gflags) +add_library(gflags STATIC IMPORTED GLOBAL) +set_property(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) +add_dependencies(gflags extern_gflags) # On Windows (including MinGW), the Shlwapi library is used by gflags if available. -if (WIN32) +if(WIN32) include(CheckIncludeFileCXX) check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI) - if (HAVE_SHLWAPI) + if(HAVE_SHLWAPI) set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib) endif(HAVE_SHLWAPI) -endif (WIN32) +endif(WIN32) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index b2f3afdabf415..a9942a6bca67b 100755 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -12,86 +12,90 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(GLOG_PREFIX_DIR ${THIRD_PARTY_PATH}/glog) -SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog) -SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE) -SET(GLOG_REPOSITORY ${GIT_URL}/google/glog.git) -SET(GLOG_TAG v0.4.0) +set(GLOG_PREFIX_DIR ${THIRD_PARTY_PATH}/glog) +set(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog) +set(GLOG_INCLUDE_DIR + "${GLOG_INSTALL_DIR}/include" + CACHE PATH "glog include directory." FORCE) +set(GLOG_REPOSITORY ${GIT_URL}/google/glog.git) +set(GLOG_TAG v0.4.0) -IF(WIN32) - SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/glog.lib" CACHE FILEPATH "glog library." FORCE) - SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530") +if(WIN32) + set(GLOG_LIBRARIES + "${GLOG_INSTALL_DIR}/lib/glog.lib" + CACHE FILEPATH "glog library." FORCE) + set(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530") add_definitions("/DGOOGLE_GLOG_DLL_DECL=") -ELSE(WIN32) - SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) - SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -ENDIF(WIN32) +else(WIN32) + set(GLOG_LIBRARIES + "${GLOG_INSTALL_DIR}/lib/libglog.a" + CACHE FILEPATH "glog library." FORCE) + set(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) +endif(WIN32) -INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) +include_directories(${GLOG_INCLUDE_DIR}) if(WITH_ARM_BRPC) - SET(ARM_GLOG_URL "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_glog.tar.gz" CACHE STRING "" FORCE) - set(GLOG_SOURCE_DIR ${THIRD_PARTY_PATH}/glog/src/extern_glog) - FILE(WRITE ${GLOG_SOURCE_DIR}/CMakeLists.txt - "PROJECT(ARM_GLOGS)\n" - "cmake_minimum_required(VERSION 3.0)\n" + set(ARM_GLOG_URL + "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_glog.tar.gz" + CACHE STRING "" FORCE) + set(GLOG_SOURCE_DIR ${THIRD_PARTY_PATH}/glog/src/extern_glog) + file( + WRITE ${GLOG_SOURCE_DIR}/CMakeLists.txt + "PROJECT(ARM_GLOGS)\n" "cmake_minimum_required(VERSION 3.0)\n" "install(DIRECTORY arm_glog/include arm_glog/lib \n" " DESTINATION . USE_SOURCE_PERMISSIONS)\n") - ExternalProject_Add( - extern_glog - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - DEPENDS gflags - PREFIX ${GLOG_PREFIX_DIR} - DOWNLOAD_DIR ${GLOG_SOURCE_DIR} - DOWNLOAD_COMMAND rm -rf arm_glog.tar.gz && - wget --no-check-certificate ${ARM_GLOG_URL} - && tar zxvf arm_glog.tar.gz - #DOWNLOAD_COMMAND cp /home/wangbin44/Paddle/build/arm_glog.tar.gz . - # && tar zxvf arm_glog.tar.gz - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${GLOG_LIBRARIES} - ) + ExternalProject_Add( + extern_glog + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + DEPENDS gflags + PREFIX ${GLOG_PREFIX_DIR} + DOWNLOAD_DIR ${GLOG_SOURCE_DIR} + DOWNLOAD_COMMAND rm -rf arm_glog.tar.gz && wget --no-check-certificate + ${ARM_GLOG_URL} && tar zxvf arm_glog.tar.gz + #DOWNLOAD_COMMAND cp /home/wangbin44/Paddle/build/arm_glog.tar.gz . + # && tar zxvf arm_glog.tar.gz + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GLOG_LIBRARIES}) else() - ExternalProject_Add( - extern_glog - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${GLOG_REPOSITORY} - GIT_TAG ${GLOG_TAG} - DEPENDS gflags - PREFIX ${GLOG_PREFIX_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DWITH_GFLAGS=OFF - -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${GLOG_LIBRARIES} - ) + ExternalProject_Add( + extern_glog + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${GLOG_REPOSITORY} + GIT_TAG ${GLOG_TAG} + DEPENDS gflags + PREFIX ${GLOG_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DWITH_GFLAGS=OFF + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GLOG_LIBRARIES}) endif() -ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) -ADD_DEPENDENCIES(glog extern_glog gflags) -LINK_LIBRARIES(glog) +add_library(glog STATIC IMPORTED GLOBAL) +set_property(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) +add_dependencies(glog extern_glog gflags) +link_libraries(glog) diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index 778d7c2a0ae29..cd7b254892ed1 100644 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -12,58 +12,65 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(GLOO_PROJECT "extern_gloo") -SET(GLOO_PREFIX_DIR ${THIRD_PARTY_PATH}/gloo) -SET(GLOO_SOURCE_DIR ${THIRD_PARTY_PATH}/gloo/src/extern_gloo) -SET(GLOO_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gloo) -SET(GLOO_INCLUDE_DIR "${GLOO_INSTALL_DIR}/include" CACHE PATH "gloo include directory." FORCE) -SET(GLOO_LIBRARY_DIR "${GLOO_INSTALL_DIR}/lib" CACHE PATH "gloo library directory." FORCE) +set(GLOO_PROJECT "extern_gloo") +set(GLOO_PREFIX_DIR ${THIRD_PARTY_PATH}/gloo) +set(GLOO_SOURCE_DIR ${THIRD_PARTY_PATH}/gloo/src/extern_gloo) +set(GLOO_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gloo) +set(GLOO_INCLUDE_DIR + "${GLOO_INSTALL_DIR}/include" + CACHE PATH "gloo include directory." FORCE) +set(GLOO_LIBRARY_DIR + "${GLOO_INSTALL_DIR}/lib" + CACHE PATH "gloo library directory." FORCE) # As we add extra features for gloo, we use the non-official repo -SET(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git) -SET(GLOO_TAG v0.0.2) -SET(GLOO_LIBRARIES "${GLOO_INSTALL_DIR}/lib/libgloo.a" CACHE FILEPATH "gloo library." FORCE) +set(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git) +set(GLOO_TAG v0.0.2) +set(GLOO_LIBRARIES + "${GLOO_INSTALL_DIR}/lib/libgloo.a" + CACHE FILEPATH "gloo library." FORCE) -INCLUDE_DIRECTORIES(${GLOO_INCLUDE_DIR}) +include_directories(${GLOO_INCLUDE_DIR}) if(WITH_ASCEND OR WITH_ASCEND_CL) ExternalProject_Add( - ${GLOO_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${GLOO_REPOSITORY} - GIT_TAG ${GLOO_TAG} - PREFIX "${GLOO_PREFIX_DIR}" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build - && cd ${GLOO_SOURCE_DIR}/build && cmake .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make - && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} - COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" - BUILD_BYPRODUCTS ${GLOO_LIBRARIES} - ) + ${GLOO_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${GLOO_REPOSITORY} + GIT_TAG ${GLOO_TAG} + PREFIX "${GLOO_PREFIX_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND + mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake + .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make && mkdir -p + ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy + ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" + "${GLOO_INCLUDE_DIR}/gloo" + BUILD_BYPRODUCTS ${GLOO_LIBRARIES}) else() ExternalProject_Add( - ${GLOO_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${GLOO_REPOSITORY} - GIT_TAG ${GLOO_TAG} - PREFIX "${GLOO_PREFIX_DIR}" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build - && cd ${GLOO_SOURCE_DIR}/build && cmake .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make - && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} - COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" - BUILD_BYPRODUCTS ${GLOO_LIBRARIES} - ) + ${GLOO_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${GLOO_REPOSITORY} + GIT_TAG ${GLOO_TAG} + PREFIX "${GLOO_PREFIX_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND + mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake + .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make && mkdir -p + ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy + ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" + "${GLOO_INCLUDE_DIR}/gloo" + BUILD_BYPRODUCTS ${GLOO_LIBRARIES}) endif() - -ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES}) -ADD_DEPENDENCIES(gloo ${GLOO_PROJECT}) +add_library(gloo STATIC IMPORTED GLOBAL) +set_property(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES}) +add_dependencies(gloo ${GLOO_PROJECT}) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 3c740af6e0b3f..00527ceecdc1f 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -14,79 +14,85 @@ #FIXME:(gongwb) Move brpc's gtest dependency. -IF(WITH_TESTING) - ENABLE_TESTING() -ENDIF() +if(WITH_TESTING) + enable_testing() +endif() -INCLUDE(GNUInstallDirs) -INCLUDE(ExternalProject) +include(GNUInstallDirs) +include(ExternalProject) -SET(GTEST_PREFIX_DIR ${THIRD_PARTY_PATH}/gtest) -SET(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest) -SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE) -set(GTEST_REPOSITORY ${GIT_URL}/google/googletest.git) -set(GTEST_TAG release-1.8.1) +set(GTEST_PREFIX_DIR ${THIRD_PARTY_PATH}/gtest) +set(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest) +set(GTEST_INCLUDE_DIR + "${GTEST_INSTALL_DIR}/include" + CACHE PATH "gtest include directory." FORCE) +set(GTEST_REPOSITORY ${GIT_URL}/google/googletest.git) +set(GTEST_TAG release-1.8.1) -INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR}) +include_directories(${GTEST_INCLUDE_DIR}) -IF(WIN32) - set(GTEST_LIBRARIES - "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE) - set(GTEST_MAIN_LIBRARIES - "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE) - string(REPLACE "/w " "" GTEST_CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - string(REPLACE "/w " "" GTEST_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - string(REPLACE "/W0 " "" GTEST_CMAKE_C_FLAGS "${GTEST_CMAKE_C_FLAGS}") - string(REPLACE "/W0 " "" GTEST_CMAKE_CXX_FLAGS "${GTEST_CMAKE_CXX_FLAGS}") -ELSE(WIN32) - set(GTEST_LIBRARIES - "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE) - set(GTEST_MAIN_LIBRARIES - "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE) - set(GTEST_CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - set(GTEST_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -ENDIF(WIN32) +if(WIN32) + set(GTEST_LIBRARIES + "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest.lib" + CACHE FILEPATH "gtest libraries." FORCE) + set(GTEST_MAIN_LIBRARIES + "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest_main.lib" + CACHE FILEPATH "gtest main libraries." FORCE) + string(REPLACE "/w " "" GTEST_CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") + string(REPLACE "/w " "" GTEST_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "/W0 " "" GTEST_CMAKE_C_FLAGS "${GTEST_CMAKE_C_FLAGS}") + string(REPLACE "/W0 " "" GTEST_CMAKE_CXX_FLAGS "${GTEST_CMAKE_CXX_FLAGS}") +else(WIN32) + set(GTEST_LIBRARIES + "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a" + CACHE FILEPATH "gtest libraries." FORCE) + set(GTEST_MAIN_LIBRARIES + "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" + CACHE FILEPATH "gtest main libraries." FORCE) + set(GTEST_CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") + set(GTEST_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +endif(WIN32) -IF(WITH_MKLML) - # wait for mklml downloading completed - SET(GTEST_DEPENDS ${MKLML_PROJECT}) -ENDIF() +if(WITH_MKLML) + # wait for mklml downloading completed + set(GTEST_DEPENDS ${MKLML_PROJECT}) +endif() ExternalProject_Add( - extern_gtest - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${GTEST_REPOSITORY} - GIT_TAG ${GTEST_TAG} - DEPENDS ${GTEST_DEPENDS} - PREFIX ${GTEST_PREFIX_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${GTEST_CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DBUILD_GMOCK=ON - -Dgtest_disable_pthreads=ON - -Dgtest_force_shared_crt=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${GTEST_LIBRARIES} - BUILD_BYPRODUCTS ${GTEST_MAIN_LIBRARIES} -) + extern_gtest + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${GTEST_REPOSITORY} + GIT_TAG ${GTEST_TAG} + DEPENDS ${GTEST_DEPENDS} + PREFIX ${GTEST_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${GTEST_CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_GMOCK=ON + -Dgtest_disable_pthreads=ON + -Dgtest_force_shared_crt=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GTEST_LIBRARIES} + BUILD_BYPRODUCTS ${GTEST_MAIN_LIBRARIES}) -ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES}) -ADD_DEPENDENCIES(gtest extern_gtest) +add_library(gtest STATIC IMPORTED GLOBAL) +set_property(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES}) +add_dependencies(gtest extern_gtest) -ADD_LIBRARY(gtest_main STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES}) -ADD_DEPENDENCIES(gtest_main extern_gtest) +add_library(gtest_main STATIC IMPORTED GLOBAL) +set_property(TARGET gtest_main PROPERTY IMPORTED_LOCATION + ${GTEST_MAIN_LIBRARIES}) +add_dependencies(gtest_main extern_gtest) diff --git a/cmake/external/lapack.cmake b/cmake/external/lapack.cmake index 4cca61681c66c..43305223fe280 100644 --- a/cmake/external/lapack.cmake +++ b/cmake/external/lapack.cmake @@ -12,56 +12,68 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE (ExternalProject) +include(ExternalProject) -SET(LAPACK_PREFIX_DIR ${THIRD_PARTY_PATH}/lapack) -SET(LAPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/lapack/src/extern_lapack) -SET(LAPACK_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lapack) -SET(LAPACK_LIB_DIR ${LAPACK_INSTALL_DIR}/lib) +set(LAPACK_PREFIX_DIR ${THIRD_PARTY_PATH}/lapack) +set(LAPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/lapack/src/extern_lapack) +set(LAPACK_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lapack) +set(LAPACK_LIB_DIR ${LAPACK_INSTALL_DIR}/lib) # Note(zhouwei): lapack need fortan compiler which many machines don't have, so use precompiled library. # use lapack tag v3.10.0 on 06/28/2021 https://github.com/Reference-LAPACK/lapack if(LINUX) - SET(LAPACK_VER "lapack_lnx_v3.10.0.20210628" CACHE STRING "" FORCE) - SET(LAPACK_URL "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.tar.gz" CACHE STRING "" FORCE) - SET(LAPACK_URL_MD5 71f8cc8237a8571692f3e07f9a4f25f6) - SET(GNU_RT_LIB_1 "${LAPACK_LIB_DIR}/libquadmath.so.0") - SET(GFORTRAN_LIB "${LAPACK_LIB_DIR}/libgfortran.so.3") - SET(BLAS_LIB "${LAPACK_LIB_DIR}/libblas.so.3") - SET(LAPACK_LIB "${LAPACK_LIB_DIR}/liblapack.so.3") + set(LAPACK_VER + "lapack_lnx_v3.10.0.20210628" + CACHE STRING "" FORCE) + set(LAPACK_URL + "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.tar.gz" + CACHE STRING "" FORCE) + set(LAPACK_URL_MD5 71f8cc8237a8571692f3e07f9a4f25f6) + set(GNU_RT_LIB_1 "${LAPACK_LIB_DIR}/libquadmath.so.0") + set(GFORTRAN_LIB "${LAPACK_LIB_DIR}/libgfortran.so.3") + set(BLAS_LIB "${LAPACK_LIB_DIR}/libblas.so.3") + set(LAPACK_LIB "${LAPACK_LIB_DIR}/liblapack.so.3") elseif(WIN32) - # Refer to [lapack-for-windows] http://icl.cs.utk.edu/lapack-for-windows/lapack/#lapacke - SET(LAPACK_VER "lapack_win_v3.10.0.20210628" CACHE STRING "" FORCE) - SET(LAPACK_URL "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.zip" CACHE STRING "" FORCE) - SET(LAPACK_URL_MD5 590d080392dcd5abbd5dca767a50b63a) - SET(GNU_RT_LIB_1 "${LAPACK_LIB_DIR}/libquadmath-0.dll") - SET(GNU_RT_LIB_2 "${LAPACK_LIB_DIR}/libgcc_s_seh-1.dll") - SET(GFORTRAN_LIB "${LAPACK_LIB_DIR}/libgfortran-3.dll") - SET(BLAS_LIB "${LAPACK_LIB_DIR}/libblas.dll") - SET(LAPACK_LIB "${LAPACK_LIB_DIR}/liblapack.dll") + # Refer to [lapack-for-windows] http://icl.cs.utk.edu/lapack-for-windows/lapack/#lapacke + set(LAPACK_VER + "lapack_win_v3.10.0.20210628" + CACHE STRING "" FORCE) + set(LAPACK_URL + "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.zip" + CACHE STRING "" FORCE) + set(LAPACK_URL_MD5 590d080392dcd5abbd5dca767a50b63a) + set(GNU_RT_LIB_1 "${LAPACK_LIB_DIR}/libquadmath-0.dll") + set(GNU_RT_LIB_2 "${LAPACK_LIB_DIR}/libgcc_s_seh-1.dll") + set(GFORTRAN_LIB "${LAPACK_LIB_DIR}/libgfortran-3.dll") + set(BLAS_LIB "${LAPACK_LIB_DIR}/libblas.dll") + set(LAPACK_LIB "${LAPACK_LIB_DIR}/liblapack.dll") else() - SET(LAPACK_VER "lapack_mac_v3.10.0.20210628" CACHE STRING "" FORCE) - SET(LAPACK_URL "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.tar.gz" CACHE STRING "" FORCE) - SET(LAPACK_URL_MD5 427aecf8dee8523de3566ca8e47944d7) - SET(GNU_RT_LIB_1 "${LAPACK_LIB_DIR}/libquadmath.0.dylib") - SET(GNU_RT_LIB_2 "${LAPACK_LIB_DIR}/libgcc_s.1.dylib") - SET(GFORTRAN_LIB "${LAPACK_LIB_DIR}/libgfortran.5.dylib") - SET(BLAS_LIB "${LAPACK_LIB_DIR}/libblas.3.dylib") - SET(LAPACK_LIB "${LAPACK_LIB_DIR}/liblapack.3.dylib") + set(LAPACK_VER + "lapack_mac_v3.10.0.20210628" + CACHE STRING "" FORCE) + set(LAPACK_URL + "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.tar.gz" + CACHE STRING "" FORCE) + set(LAPACK_URL_MD5 427aecf8dee8523de3566ca8e47944d7) + set(GNU_RT_LIB_1 "${LAPACK_LIB_DIR}/libquadmath.0.dylib") + set(GNU_RT_LIB_2 "${LAPACK_LIB_DIR}/libgcc_s.1.dylib") + set(GFORTRAN_LIB "${LAPACK_LIB_DIR}/libgfortran.5.dylib") + set(BLAS_LIB "${LAPACK_LIB_DIR}/libblas.3.dylib") + set(LAPACK_LIB "${LAPACK_LIB_DIR}/liblapack.3.dylib") endif() ExternalProject_Add( - extern_lapack - ${EXTERNAL_PROJECT_LOG_ARGS} - URL ${LAPACK_URL} - URL_MD5 ${LAPACK_URL_MD5} - PREFIX ${LAPACK_PREFIX_DIR} - DOWNLOAD_NO_PROGRESS 1 - PATCH_COMMAND "" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${LAPACK_SOURCE_DIR} ${LAPACK_LIB_DIR} - BUILD_BYPRODUCTS ${BLAS_LIB} - BUILD_BYPRODUCTS ${LAPACK_LIB} -) + extern_lapack + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${LAPACK_URL} + URL_MD5 ${LAPACK_URL_MD5} + PREFIX ${LAPACK_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + PATCH_COMMAND "" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${LAPACK_SOURCE_DIR} + ${LAPACK_LIB_DIR} + BUILD_BYPRODUCTS ${BLAS_LIB} + BUILD_BYPRODUCTS ${LAPACK_LIB}) diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake index 65a21a87dbde2..b1f2345794e15 100644 --- a/cmake/external/leveldb.cmake +++ b/cmake/external/leveldb.cmake @@ -12,35 +12,39 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(LEVELDB_PREFIX_DIR ${THIRD_PARTY_PATH}/leveldb) -SET(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb) -SET(LEVELDB_INCLUDE_DIR "${LEVELDB_INSTALL_DIR}/include" CACHE PATH "leveldb include directory." FORCE) -SET(LEVELDB_LIBRARIES "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a" CACHE FILEPATH "leveldb library." FORCE) -INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR}) +set(LEVELDB_PREFIX_DIR ${THIRD_PARTY_PATH}/leveldb) +set(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb) +set(LEVELDB_INCLUDE_DIR + "${LEVELDB_INSTALL_DIR}/include" + CACHE PATH "leveldb include directory." FORCE) +set(LEVELDB_LIBRARIES + "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a" + CACHE FILEPATH "leveldb library." FORCE) +include_directories(${LEVELDB_INCLUDE_DIR}) ExternalProject_Add( - extern_leveldb - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${LEVELDB_PREFIX_DIR} - GIT_REPOSITORY "https://github.com/google/leveldb" - GIT_TAG v1.18 - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a - INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ - && cp ${LEVELDB_PREFIX_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES} - && cp -r ${LEVELDB_PREFIX_DIR}/src/extern_leveldb/include ${LEVELDB_INSTALL_DIR}/ - BUILD_IN_SOURCE 1 - BUILD_BYPRODUCTS ${LEVELDB_LIBRARIES} -) + extern_leveldb + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${LEVELDB_PREFIX_DIR} + GIT_REPOSITORY "https://github.com/google/leveldb" + GIT_TAG v1.18 + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a + INSTALL_COMMAND + mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ && cp + ${LEVELDB_PREFIX_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES} + && cp -r ${LEVELDB_PREFIX_DIR}/src/extern_leveldb/include + ${LEVELDB_INSTALL_DIR}/ + BUILD_IN_SOURCE 1 + BUILD_BYPRODUCTS ${LEVELDB_LIBRARIES}) -ADD_DEPENDENCIES(extern_leveldb snappy) +add_dependencies(extern_leveldb snappy) -ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES}) -ADD_DEPENDENCIES(leveldb extern_leveldb) - -LIST(APPEND external_project_dependencies leveldb) +add_library(leveldb STATIC IMPORTED GLOBAL) +set_property(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES}) +add_dependencies(leveldb extern_leveldb) +list(APPEND external_project_dependencies leveldb) diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake index a166e43c7b95e..28bf083f7791e 100644 --- a/cmake/external/libmct.cmake +++ b/cmake/external/libmct.cmake @@ -12,48 +12,54 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(LIBMCT_PROJECT "extern_libmct") -IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE) - SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE) - SET(LIBMCT_URL "https://pslib.bj.bcebos.com/libmct/libmct.tar.gz" CACHE STRING "" FORCE) -ENDIF() -MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}") -SET(LIBMCT_PREFIX_DIR "${THIRD_PARTY_PATH}/libmct") -SET(LIBMCT_DOWNLOAD_DIR "${LIBMCT_PREFIX_DIR}/src/${LIBMCT_PROJECT}") -SET(LIBMCT_DST_DIR "libmct") -SET(LIBMCT_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") -SET(LIBMCT_INSTALL_DIR ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR}) -SET(LIBMCT_ROOT ${LIBMCT_INSTALL_DIR}) -SET(LIBMCT_INC_DIR ${LIBMCT_ROOT}/include) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib") +set(LIBMCT_PROJECT "extern_libmct") +if((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL)) + message(STATUS "use pre defined download url") + set(LIBMCT_VER + "0.1.0" + CACHE STRING "" FORCE) + set(LIBMCT_NAME + "libmct" + CACHE STRING "" FORCE) + set(LIBMCT_URL + "https://pslib.bj.bcebos.com/libmct/libmct.tar.gz" + CACHE STRING "" FORCE) +endif() +message(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}") +set(LIBMCT_PREFIX_DIR "${THIRD_PARTY_PATH}/libmct") +set(LIBMCT_DOWNLOAD_DIR "${LIBMCT_PREFIX_DIR}/src/${LIBMCT_PROJECT}") +set(LIBMCT_DST_DIR "libmct") +set(LIBMCT_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +set(LIBMCT_INSTALL_DIR ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR}) +set(LIBMCT_ROOT ${LIBMCT_INSTALL_DIR}) +set(LIBMCT_INC_DIR ${LIBMCT_ROOT}/include) +set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib") -INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR}) +include_directories(${LIBMCT_INC_DIR}) -FILE(WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(LIBMCT)\n" - "cmake_minimum_required(VERSION 3.0)\n" +file( + WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(LIBMCT)\n" "cmake_minimum_required(VERSION 3.0)\n" "install(DIRECTORY ${LIBMCT_NAME}/include ${LIBMCT_NAME}/lib \n" " DESTINATION ${LIBMCT_DST_DIR})\n") ExternalProject_Add( - ${LIBMCT_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${LIBMCT_PREFIX_DIR} - DOWNLOAD_DIR ${LIBMCT_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz - && tar --no-same-owner -zxvf ${LIBMCT_NAME}.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT} - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT} - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -) + ${LIBMCT_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${LIBMCT_PREFIX_DIR} + DOWNLOAD_DIR ${LIBMCT_DOWNLOAD_DIR} + DOWNLOAD_COMMAND + wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz && + tar --no-same-owner -zxvf ${LIBMCT_NAME}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}) add_library(libmct INTERFACE) -ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT}) +add_dependencies(libmct ${LIBMCT_PROJECT}) diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake index da7cb696ef8c7..1efb95cc0cfa9 100644 --- a/cmake/external/libxsmm.cmake +++ b/cmake/external/libxsmm.cmake @@ -12,34 +12,38 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE (ExternalProject) +include(ExternalProject) -SET(LIBXSMM_PREFIX_DIR ${THIRD_PARTY_PATH}/libxsmm) -SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm) -SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE) -SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE) -SET(LIBXSMM_LIB "${LIBXSMM_LIBRARY_DIR}/libxsmm.a") -SET(LIBXSMMNOBLAS_LIB "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") +set(LIBXSMM_PREFIX_DIR ${THIRD_PARTY_PATH}/libxsmm) +set(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm) +set(LIBXSMM_INCLUDE_DIR + "${LIBXSMM_INSTALL_DIR}/include" + CACHE PATH "LIBXSMM include directory." FORCE) +set(LIBXSMM_LIBRARY_DIR + "${LIBXSMM_INSTALL_DIR}/lib" + CACHE PATH "LIBXSMM library directory." FORCE) +set(LIBXSMM_LIB "${LIBXSMM_LIBRARY_DIR}/libxsmm.a") +set(LIBXSMMNOBLAS_LIB "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") ExternalProject_Add( - extern_libxsmm - ${SHALLOW_CLONE} - GIT_REPOSITORY "${GIT_URL}/hfp/libxsmm.git" - GIT_TAG "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2" - PREFIX ${LIBXSMM_PREFIX_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - BUILD_COMMAND $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install - INSTALL_COMMAND "" - BUILD_BYPRODUCTS ${LIBXSMM_LIB} - BUILD_BYPRODUCTS ${LIBXSMMNOBLAS_LIB} -) -ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIB}") -SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMMNOBLAS_LIB}") + extern_libxsmm + ${SHALLOW_CLONE} + GIT_REPOSITORY "${GIT_URL}/hfp/libxsmm.git" + GIT_TAG "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2" + PREFIX ${LIBXSMM_PREFIX_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + BUILD_COMMAND $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc + WARP=0 install + INSTALL_COMMAND "" + BUILD_BYPRODUCTS ${LIBXSMM_LIB} + BUILD_BYPRODUCTS ${LIBXSMMNOBLAS_LIB}) +add_library(libxsmm STATIC IMPORTED GLOBAL) +set_property(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIB}") +set_property(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMMNOBLAS_LIB}") -MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}") +message(STATUS "Libxsmm library: ${LIBXSMM_LIBS}") include_directories(${LIBXSMM_INCLUDE_DIR}) -ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM) -ADD_DEPENDENCIES(libxsmm extern_libxsmm) +add_definitions(-DPADDLE_WITH_LIBXSMM) +add_dependencies(libxsmm extern_libxsmm) diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index 0031757467f37..b994f407604b4 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -18,32 +18,34 @@ if(NOT LINUX) return() endif() -if (LITE_WITH_XPU) +if(LITE_WITH_XPU) add_definitions(-DLITE_SUBGRAPH_WITH_XPU) - IF(WITH_AARCH64) - SET(XPU_SDK_ENV "kylin_aarch64") - ELSEIF(WITH_SUNWAY) - SET(XPU_SDK_ENV "deepin_sw6_64") - ELSEIF(WITH_BDCENTOS) - SET(XPU_SDK_ENV "bdcentos_x86_64") - ELSEIF(WITH_UBUNTU) - SET(XPU_SDK_ENV "ubuntu_x86_64") - ELSEIF(WITH_CENTOS) - SET(XPU_SDK_ENV "centos7_x86_64") - ELSE () - SET(XPU_SDK_ENV "ubuntu_x86_64") - ENDIF() + if(WITH_AARCH64) + set(XPU_SDK_ENV "kylin_aarch64") + elseif(WITH_SUNWAY) + set(XPU_SDK_ENV "deepin_sw6_64") + elseif(WITH_BDCENTOS) + set(XPU_SDK_ENV "bdcentos_x86_64") + elseif(WITH_UBUNTU) + set(XPU_SDK_ENV "ubuntu_x86_64") + elseif(WITH_CENTOS) + set(XPU_SDK_ENV "centos7_x86_64") + else() + set(XPU_SDK_ENV "ubuntu_x86_64") + endif() endif() -if (LITE_WITH_NNADAPTER) - add_definitions(-DLITE_SUBGRAPH_WITH_NNADAPTER) - if (NNADAPTER_WITH_HUAWEI_ASCEND_NPU) +if(LITE_WITH_NNADAPTER) + add_definitions(-DLITE_SUBGRAPH_WITH_NNADAPTER) + if(NNADAPTER_WITH_HUAWEI_ASCEND_NPU) add_definitions(-DLITE_SUBGRAPH_WITH_NPU) - set(NPU_SDK_ROOT "/usr/local/Ascend/ascend-toolkit/latest" CACHE STRING "default NPU SDK ROOT") + set(NPU_SDK_ROOT + "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "default NPU SDK ROOT") endif() endif() -if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) +if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) include(ExternalProject) set(LITE_PROJECT extern_lite) set(LITE_PREFIX_DIR ${THIRD_PARTY_PATH}/lite) @@ -61,109 +63,118 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) if(WITH_ARM) set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j) message(WARNING "BUILD_COMMAND: ${LITE_BUILD_COMMAND}") - set(LITE_OPTIONAL_ARGS -DWITH_MKL=OFF - -DLITE_WITH_CUDA=OFF - -DWITH_MKLDNN=OFF - -DLITE_WITH_X86=OFF - -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON - -DLITE_WITH_PROFILE=OFF - -DARM_TARGET_OS=armlinux - -DWITH_LITE=ON - -DWITH_PYTHON=OFF - -DWITH_TESTING=OFF - -DLITE_BUILD_EXTRA=ON - -DLITE_WITH_XPU=${LITE_WITH_XPU} - -DXPU_SDK_URL=${XPU_BASE_URL} - -DXPU_SDK_ENV=${XPU_SDK_ENV} - -DLITE_WITH_NNADAPTER=${LITE_WITH_NNADAPTER} - -DNNADAPTER_WITH_HUAWEI_ASCEND_NPU=${NNADAPTER_WITH_HUAWEI_ASCEND_NPU} - -DNNADAPTER_HUAWEI_ASCEND_NPU_SDK_ROOT=${NPU_SDK_ROOT} - -DLITE_WITH_CODE_META_INFO=OFF - -DLITE_WITH_ARM=ON) + set(LITE_OPTIONAL_ARGS + -DWITH_MKL=OFF + -DLITE_WITH_CUDA=OFF + -DWITH_MKLDNN=OFF + -DLITE_WITH_X86=OFF + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON + -DLITE_WITH_PROFILE=OFF + -DARM_TARGET_OS=armlinux + -DWITH_LITE=ON + -DWITH_PYTHON=OFF + -DWITH_TESTING=OFF + -DLITE_BUILD_EXTRA=ON + -DLITE_WITH_XPU=${LITE_WITH_XPU} + -DXPU_SDK_URL=${XPU_BASE_URL} + -DXPU_SDK_ENV=${XPU_SDK_ENV} + -DLITE_WITH_NNADAPTER=${LITE_WITH_NNADAPTER} + -DNNADAPTER_WITH_HUAWEI_ASCEND_NPU=${NNADAPTER_WITH_HUAWEI_ASCEND_NPU} + -DNNADAPTER_HUAWEI_ASCEND_NPU_SDK_ROOT=${NPU_SDK_ROOT} + -DLITE_WITH_CODE_META_INFO=OFF + -DLITE_WITH_ARM=ON) ExternalProject_Add( ${LITE_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/Paddle-Lite.git" - GIT_TAG ${LITE_GIT_TAG} - PREFIX ${LITE_PREFIX_DIR} - PATCH_COMMAND mkdir -p ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code && touch ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code/__generated_code__.cc && sed -i "/aarch64-linux-gnu-gcc/d" ${LITE_PREFIX_DIR}/src/extern_lite/cmake/os/armlinux.cmake && sed -i "/aarch64-linux-gnu-g++/d" ${LITE_PREFIX_DIR}/src/extern_lite/cmake/os/armlinux.cmake - UPDATE_COMMAND "" - BUILD_COMMAND ${LITE_BUILD_COMMAND} - INSTALL_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - ${LITE_OPTIONAL_ARGS} - ) + GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/Paddle-Lite.git" + GIT_TAG ${LITE_GIT_TAG} + PREFIX ${LITE_PREFIX_DIR} + PATCH_COMMAND + mkdir -p ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code && touch + ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code/__generated_code__.cc + && sed -i "/aarch64-linux-gnu-gcc/d" + ${LITE_PREFIX_DIR}/src/extern_lite/cmake/os/armlinux.cmake && sed -i + "/aarch64-linux-gnu-g++/d" + ${LITE_PREFIX_DIR}/src/extern_lite/cmake/os/armlinux.cmake + UPDATE_COMMAND "" + BUILD_COMMAND ${LITE_BUILD_COMMAND} + INSTALL_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + ${LITE_OPTIONAL_ARGS}) else() set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j) - set(LITE_OPTIONAL_ARGS -DWITH_MKL=ON - -DLITE_WITH_CUDA=${WITH_GPU} - -DWITH_MKLDNN=OFF - -DLITE_WITH_X86=ON - -DLITE_WITH_PROFILE=OFF - -DWITH_LITE=OFF - -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF - -DWITH_PYTHON=OFF - -DWITH_TESTING=OFF - -DLITE_BUILD_EXTRA=ON - -DCUDNN_ROOT=${CUDNN_ROOT} - -DLITE_WITH_STATIC_CUDA=OFF - -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME} - -DLITE_WITH_XPU=${LITE_WITH_XPU} - -DXPU_SDK_URL=${XPU_BASE_URL} - -DXPU_SDK_ENV=${XPU_SDK_ENV} - -DLITE_WITH_NNADAPTER=${LITE_WITH_NNADAPTER} - -DNNADAPTER_WITH_HUAWEI_ASCEND_NPU=${NNADAPTER_WITH_HUAWEI_ASCEND_NPU} - -DNNADAPTER_HUAWEI_ASCEND_NPU_SDK_ROOT=${NPU_SDK_ROOT} - -DLITE_WITH_CODE_META_INFO=OFF - -DLITE_WITH_ARM=OFF) + set(LITE_OPTIONAL_ARGS + -DWITH_MKL=ON + -DLITE_WITH_CUDA=${WITH_GPU} + -DWITH_MKLDNN=OFF + -DLITE_WITH_X86=ON + -DLITE_WITH_PROFILE=OFF + -DWITH_LITE=OFF + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF + -DWITH_PYTHON=OFF + -DWITH_TESTING=OFF + -DLITE_BUILD_EXTRA=ON + -DCUDNN_ROOT=${CUDNN_ROOT} + -DLITE_WITH_STATIC_CUDA=OFF + -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME} + -DLITE_WITH_XPU=${LITE_WITH_XPU} + -DXPU_SDK_URL=${XPU_BASE_URL} + -DXPU_SDK_ENV=${XPU_SDK_ENV} + -DLITE_WITH_NNADAPTER=${LITE_WITH_NNADAPTER} + -DNNADAPTER_WITH_HUAWEI_ASCEND_NPU=${NNADAPTER_WITH_HUAWEI_ASCEND_NPU} + -DNNADAPTER_HUAWEI_ASCEND_NPU_SDK_ROOT=${NPU_SDK_ROOT} + -DLITE_WITH_CODE_META_INFO=OFF + -DLITE_WITH_ARM=OFF) ExternalProject_Add( - ${LITE_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/Paddle-Lite.git" - GIT_TAG ${LITE_GIT_TAG} - PREFIX ${LITE_PREFIX_DIR} - UPDATE_COMMAND "" - PATCH_COMMAND sed -i "s?NNadapter_bridges_path = os.path.abspath('..')+\"\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?NNadapter_bridges_path = os.path.abspath(\'..\')+\"\/extern_lite\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?" ${LITE_PREFIX_DIR}/src/extern_lite//lite/tools/cmake_tools/record_supported_kernel_op.py - BUILD_COMMAND ${LITE_BUILD_COMMAND} - INSTALL_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - ${LITE_OPTIONAL_ARGS} - ) + ${LITE_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/Paddle-Lite.git" + GIT_TAG ${LITE_GIT_TAG} + PREFIX ${LITE_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND + sed -i + "s?NNadapter_bridges_path = os.path.abspath('..')+\"\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?NNadapter_bridges_path = os.path.abspath(\'..\')+\"\/extern_lite\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?" + ${LITE_PREFIX_DIR}/src/extern_lite//lite/tools/cmake_tools/record_supported_kernel_op.py + BUILD_COMMAND ${LITE_BUILD_COMMAND} + INSTALL_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + ${LITE_OPTIONAL_ARGS}) endif() - ExternalProject_Get_property(${LITE_PROJECT} BINARY_DIR) - ExternalProject_Get_property(${LITE_PROJECT} SOURCE_DIR) + ExternalProject_Get_Property(${LITE_PROJECT} BINARY_DIR) + ExternalProject_Get_Property(${LITE_PROJECT} SOURCE_DIR) set(LITE_BINARY_DIR ${BINARY_DIR}) set(LITE_SOURCE_DIR ${SOURCE_DIR}) endif() -if (WITH_ARM) +if(WITH_ARM) if(LITE_WITH_XPU) set(LITE_OUTPUT_BIN_DIR inference_lite_lib.armlinux.armv8.xpu) elseif(LITE_WITH_NNADAPTER) message("Enable LITE_WITH_NNADAPTER") - if (NNADAPTER_WITH_HUAWEI_ASCEND_NPU) + if(NNADAPTER_WITH_HUAWEI_ASCEND_NPU) set(LITE_OUTPUT_BIN_DIR inference_lite_lib.armlinux.armv8.nnadapter) endif() else() @@ -184,22 +195,32 @@ endif() function(external_lite_libs alias path) add_library(${alias} SHARED IMPORTED GLOBAL) - SET_PROPERTY(TARGET ${alias} PROPERTY IMPORTED_LOCATION - ${path}) - if (LITE_PROJECT) + set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${path}) + if(LITE_PROJECT) add_dependencies(${alias} ${LITE_PROJECT}) endif() endfunction() -external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so) -set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so) +external_lite_libs( + lite_full_static + ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so +) +set(LITE_SHARED_LIB + ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so +) -if (LITE_WITH_NNADAPTER) - set(LITE_NNADAPTER_LIB ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libnnadapter.so) - if (NNADAPTER_WITH_HUAWEI_ASCEND_NPU) - external_lite_libs(lite_nnadapter ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libnnadapter.so ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libhuawei_ascend_npu.so) +if(LITE_WITH_NNADAPTER) + set(LITE_NNADAPTER_LIB + ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libnnadapter.so) + if(NNADAPTER_WITH_HUAWEI_ASCEND_NPU) + external_lite_libs( + lite_nnadapter + ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libnnadapter.so + ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libhuawei_ascend_npu.so) set(LITE_DEPS lite_full_static lite_nnadapter) - set(LITE_NNADAPTER_NPU_LIB ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libhuawei_ascend_npu.so) + set(LITE_NNADAPTER_NPU_LIB + ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libhuawei_ascend_npu.so + ) endif() else() set(LITE_DEPS lite_full_static) diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index 5c48afa2806aa..8b33a73e24c8d 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -1,31 +1,33 @@ include(FetchContent) -set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/infrt/llvm_b5149f4e66a49a98b67e8e2de4e24a4af8e2781b.tar.gz) +set(LLVM_DOWNLOAD_URL + https://paddle-inference-dist.bj.bcebos.com/infrt/llvm_b5149f4e66a49a98b67e8e2de4e24a4af8e2781b.tar.gz +) set(LLVM_MD5 022819bb5760817013cf4b8a37e97d5e) set(FETCHCONTENT_BASE_DIR ${THIRD_PARTY_PATH}/llvm) set(FETCHCONTENT_QUIET OFF) -FetchContent_Declare(external_llvm +FetchContent_Declare( + external_llvm URL ${LLVM_DOWNLOAD_URL} URL_MD5 ${LLVM_MD5} - PREFIX ${THIRD_PARTY_PATH}/llvm - SOURCE_DIR ${THIRD_PARTY_PATH}/install/llvm -) -if (NOT LLVM_PATH) + PREFIX ${THIRD_PARTY_PATH}/llvm SOURCE_DIR ${THIRD_PARTY_PATH}/install/llvm) +if(NOT LLVM_PATH) FetchContent_GetProperties(external_llvm) - if (NOT external_llvm_POPULATED) + if(NOT external_llvm_POPULATED) FetchContent_Populate(external_llvm) endif() set(LLVM_PATH ${THIRD_PARTY_PATH}/install/llvm) set(LLVM_DIR ${THIRD_PARTY_PATH}/install/llvm/lib/cmake/llvm) set(MLIR_DIR ${THIRD_PARTY_PATH}/install/llvm/lib/cmake/mlir) -else () +else() set(LLVM_DIR ${LLVM_PATH}/lib/cmake/llvm) set(MLIR_DIR ${LLVM_PATH}/lib/cmake/mlir) endif() -if (${CMAKE_CXX_COMPILER} STREQUAL "clang++") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi") +if(${CMAKE_CXX_COMPILER} STREQUAL "clang++") + set(CMAKE_EXE_LINKER_FLAGS + "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi") endif() message(STATUS "set LLVM_DIR: ${LLVM_DIR}") @@ -66,8 +68,17 @@ cmake ../llvm -G "Unix Makefiles" \ add_definitions(${LLVM_DEFINITIONS}) -llvm_map_components_to_libnames(llvm_libs Support Core irreader - X86 executionengine orcjit mcjit all codegen) +llvm_map_components_to_libnames( + llvm_libs + Support + Core + irreader + X86 + executionengine + orcjit + mcjit + all + codegen) message(STATUS "LLVM libs: ${llvm_libs}") @@ -75,23 +86,24 @@ get_property(mlir_libs GLOBAL PROPERTY MLIR_ALL_LIBS) message(STATUS "MLIR libs: ${mlir_libs}") add_definitions(${LLVM_DEFINITIONS}) - # The minimum needed libraries for MLIR IR parse and transform. set(MLIR_IR_LIBS MLIRAnalysis MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib) - # tb_base is the name of a xxx.td file (without the .td suffix) function(mlir_tablegen_on td_base) set(options) set(oneValueArgs DIALECT) - cmake_parse_arguments(mlir_tablegen_on "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(mlir_tablegen_on "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) mlir_tablegen(${td_base}.hpp.inc -gen-op-decls) mlir_tablegen(${td_base}.cpp.inc -gen-op-defs) - if (mlir_tablegen_on_DIALECT) - mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls -dialect=${mlir_tablegen_on_DIALECT}) - mlir_tablegen(${td_base}_dialect.cpp.inc --gen-dialect-defs -dialect=${mlir_tablegen_on_DIALECT}) + if(mlir_tablegen_on_DIALECT) + mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls + -dialect=${mlir_tablegen_on_DIALECT}) + mlir_tablegen(${td_base}_dialect.cpp.inc --gen-dialect-defs + -dialect=${mlir_tablegen_on_DIALECT}) endif() add_public_tablegen_target(${td_base}_IncGen) add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) @@ -99,7 +111,9 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) - set(LLVM_TARGET_DEPENDS ${LLVM_TARGET_DEPENDS} ${CMAKE_SOURCE_DIR}/paddle/infrt/dialect/infrt/ir/infrt_base.td) + set(LLVM_TARGET_DEPENDS + ${LLVM_TARGET_DEPENDS} + ${CMAKE_SOURCE_DIR}/paddle/infrt/dialect/infrt/ir/infrt_base.td) mlir_tablegen(${td_base}.cpp.inc -gen-rewriters) add_public_tablegen_target(MLIR${td_base}IncGen) add_dependencies(mlir-headers MLIR${td_base}IncGen) @@ -108,7 +122,11 @@ endfunction() # Execute the mlir script with infrt-exec program. # @name: name of the test # @script: path to the mlir script file -function (infrt_exec_check name script) - add_test(NAME ${name} - COMMAND sh -c "${CMAKE_BINARY_DIR}/paddle/infrt/host_context/infrt-exec -i ${CMAKE_CURRENT_SOURCE_DIR}/${script}| ${LLVM_PATH}/bin/FileCheck ${CMAKE_CURRENT_SOURCE_DIR}/${script}") +function(infrt_exec_check name script) + add_test( + NAME ${name} + COMMAND + sh -c + "${CMAKE_BINARY_DIR}/paddle/infrt/host_context/infrt-exec -i ${CMAKE_CURRENT_SOURCE_DIR}/${script}| ${LLVM_PATH}/bin/FileCheck ${CMAKE_CURRENT_SOURCE_DIR}/${script}" + ) endfunction() diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 8f955008fa079..dfa20dd631fc6 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -12,108 +12,131 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) - -SET(MKLDNN_PROJECT "extern_mkldnn") -SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn) -SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) -SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) -SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG 9b186765dded79066e0cd9c17eb70b680b76fb8e) +include(ExternalProject) +set(MKLDNN_PROJECT "extern_mkldnn") +set(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn) +set(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) +set(MKLDNN_INC_DIR + "${MKLDNN_INSTALL_DIR}/include" + CACHE PATH "mkldnn include directory." FORCE) +set(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) +set(MKLDNN_TAG 9b186765dded79066e0cd9c17eb70b680b76fb8e) # Introduce variables: # * CMAKE_INSTALL_LIBDIR -INCLUDE(GNUInstallDirs) -SET(LIBDIR "lib") +include(GNUInstallDirs) +set(LIBDIR "lib") if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$") - SET(LIBDIR "lib64") + set(LIBDIR "lib64") endif() -MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/${LIBDIR} to runtime path") -SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR}") - -INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers. +message(STATUS "Set ${MKLDNN_INSTALL_DIR}/${LIBDIR} to runtime path") +set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) +set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" + "${MKLDNN_INSTALL_DIR}/${LIBDIR}") +include_directories(${MKLDNN_INC_DIR} +)# For MKLDNN code to include internal headers. -IF(NOT WIN32) - SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds") - SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") - SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") - SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") - SET(MKLDNN_CXXFLAG_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") - SET(MKLDNN_CFLAG_RELEASE "${CMAKE_C_FLAGS_RELEASE}") - SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libdnnl.so" CACHE FILEPATH "mkldnn library." FORCE) -ELSE() - SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc") - SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS}") - string(REPLACE "/O2 " "" MKLDNN_CFLAG_RELEASE "${CMAKE_C_FLAGS_RELEASE}") - string(REPLACE "/O2 " "" MKLDNN_CXXFLAG_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") - SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE) -ENDIF(NOT WIN32) +if(NOT WIN32) + set(MKLDNN_FLAG + "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds" + ) + set(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") + set(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") + set(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") + set(MKLDNN_CXXFLAG_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") + set(MKLDNN_CFLAG_RELEASE "${CMAKE_C_FLAGS_RELEASE}") + set(MKLDNN_LIB + "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libdnnl.so" + CACHE FILEPATH "mkldnn library." FORCE) +else() + set(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc") + set(MKLDNN_CFLAG "${CMAKE_C_FLAGS}") + string(REPLACE "/O2 " "" MKLDNN_CFLAG_RELEASE "${CMAKE_C_FLAGS_RELEASE}") + string(REPLACE "/O2 " "" MKLDNN_CXXFLAG_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") + set(MKLDNN_LIB + "${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib" + CACHE FILEPATH "mkldnn library." FORCE) +endif(NOT WIN32) ExternalProject_Add( - ${MKLDNN_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${MKLDNN_REPOSITORY} - GIT_TAG ${MKLDNN_TAG} - DEPENDS ${MKLDNN_DEPENDS} - PREFIX ${MKLDNN_PREFIX_DIR} - UPDATE_COMMAND "" - #BUILD_ALWAYS 1 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} - -DCMAKE_CXX_FLAGS_RELEASE=${MKLDNN_CXXFLAG_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${MKLDNN_CFLAG_RELEASE} - -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DDNNL_BUILD_TESTS=OFF -DDNNL_BUILD_EXAMPLES=OFF - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} -) + ${MKLDNN_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${MKLDNN_REPOSITORY} + GIT_TAG ${MKLDNN_TAG} + DEPENDS ${MKLDNN_DEPENDS} + PREFIX ${MKLDNN_PREFIX_DIR} + UPDATE_COMMAND "" + #BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} + -DCMAKE_CXX_FLAGS_RELEASE=${MKLDNN_CXXFLAG_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${MKLDNN_CFLAG_RELEASE} + -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DDNNL_BUILD_TESTS=OFF + -DDNNL_BUILD_EXAMPLES=OFF + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}) -MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") +message(STATUS "MKLDNN library: ${MKLDNN_LIB}") add_definitions(-DPADDLE_WITH_MKLDNN) # copy the real so.0 lib to install dir # it can be directly contained in wheel or capi if(WIN32) - SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll) + set(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll) - file(TO_NATIVE_PATH ${MKLDNN_INSTALL_DIR} NATIVE_MKLDNN_INSTALL_DIR) - file(TO_NATIVE_PATH ${MKLDNN_SHARED_LIB} NATIVE_MKLDNN_SHARED_LIB) + file(TO_NATIVE_PATH ${MKLDNN_INSTALL_DIR} NATIVE_MKLDNN_INSTALL_DIR) + file(TO_NATIVE_PATH ${MKLDNN_SHARED_LIB} NATIVE_MKLDNN_SHARED_LIB) - ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_LIB} - COMMAND (copy ${NATIVE_MKLDNN_INSTALL_DIR}\\bin\\dnnl.dll ${NATIVE_MKLDNN_SHARED_LIB} /Y) - COMMAND dumpbin /exports ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll > ${MKLDNN_INSTALL_DIR}/bin/exports.txt - COMMAND echo LIBRARY mkldnn > ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def - COMMAND echo EXPORTS >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def - COMMAND echo off && (for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) && echo on - COMMAND lib /def:${MKLDNN_INSTALL_DIR}/bin/mkldnn.def /out:${MKLDNN_LIB} /machine:x64 - COMMENT "Generate mkldnn.lib manually--->" - DEPENDS ${MKLDNN_PROJECT} - VERBATIM) - ADD_CUSTOM_TARGET(mkldnn_cmd ALL DEPENDS ${MKLDNN_LIB}) + add_custom_command( + OUTPUT ${MKLDNN_LIB} + COMMAND (copy ${NATIVE_MKLDNN_INSTALL_DIR}\\bin\\dnnl.dll + ${NATIVE_MKLDNN_SHARED_LIB} /Y) + COMMAND dumpbin /exports ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll > + ${MKLDNN_INSTALL_DIR}/bin/exports.txt + COMMAND echo LIBRARY mkldnn > ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def + COMMAND echo EXPORTS >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def + COMMAND + echo off && (for + /f + "skip=19 tokens=4" + %A + in + (${MKLDNN_INSTALL_DIR}/bin/exports.txt) + do + echo + %A + >> + ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) && echo on + COMMAND lib /def:${MKLDNN_INSTALL_DIR}/bin/mkldnn.def /out:${MKLDNN_LIB} + /machine:x64 + COMMENT "Generate mkldnn.lib manually--->" + DEPENDS ${MKLDNN_PROJECT} + VERBATIM) + add_custom_target(mkldnn_cmd ALL DEPENDS ${MKLDNN_LIB}) else(WIN32) - SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) - SET(MKLDNN_SHARED_LIB_1 ${MKLDNN_INSTALL_DIR}/libdnnl.so.1) - SET(MKLDNN_SHARED_LIB_2 ${MKLDNN_INSTALL_DIR}/libdnnl.so.2) - ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB_2} - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_1} - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_2} - DEPENDS ${MKLDNN_PROJECT}) - ADD_CUSTOM_TARGET(mkldnn_cmd ALL DEPENDS ${MKLDNN_SHARED_LIB_2}) + set(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) + set(MKLDNN_SHARED_LIB_1 ${MKLDNN_INSTALL_DIR}/libdnnl.so.1) + set(MKLDNN_SHARED_LIB_2 ${MKLDNN_INSTALL_DIR}/libdnnl.so.2) + add_custom_command( + OUTPUT ${MKLDNN_SHARED_LIB_2} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_1} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_2} + DEPENDS ${MKLDNN_PROJECT}) + add_custom_target(mkldnn_cmd ALL DEPENDS ${MKLDNN_SHARED_LIB_2}) endif(WIN32) # generate a static dummy target to track mkldnn dependencies # for cc_library(xxx SRCS xxx.c DEPS mkldnn) generate_dummy_static_lib(LIB_NAME "mkldnn" GENERATOR "mkldnn.cmake") -TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_IOMP_LIB}) -ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT} mkldnn_cmd) +target_link_libraries(mkldnn ${MKLDNN_LIB} ${MKLML_IOMP_LIB}) +add_dependencies(mkldnn ${MKLDNN_PROJECT} mkldnn_cmd) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index a2fd2fe03c162..90d61f47a52e8 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -12,59 +12,68 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) -SET(MKLML_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mklml) -SET(MKLML_INC_DIR ${MKLML_INSTALL_DIR}/include) -SET(MKLML_LIB_DIR ${MKLML_INSTALL_DIR}/lib) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_LIB_DIR}") +include(ExternalProject) +set(MKLML_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mklml) +set(MKLML_INC_DIR ${MKLML_INSTALL_DIR}/include) +set(MKLML_LIB_DIR ${MKLML_INSTALL_DIR}/lib) +set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_LIB_DIR}") -IF(WIN32) - SET(MKLML_VER "mklml_win_2019.0.5.20190502" CACHE STRING "" FORCE) - SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) - SET(MKLML_URL_MD5 ff8c5237570f03eea37377ccfc95a08a) - SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) - SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) - SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) - SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) -ELSE() - #TODO(intel-huying): - # Now enable csrmm function in mklml library temporarily, it will be updated as offical version later. - SET(MKLML_VER "csrmm_mklml_lnx_2019.0.5" CACHE STRING "" FORCE) - SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) - SET(MKLML_URL_MD5 bc6a7faea6a2a9ad31752386f3ae87da) - SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) - SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) - SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) - SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) -ENDIF() +if(WIN32) + set(MKLML_VER + "mklml_win_2019.0.5.20190502" + CACHE STRING "" FORCE) + set(MKLML_URL + "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" + CACHE STRING "" FORCE) + set(MKLML_URL_MD5 ff8c5237570f03eea37377ccfc95a08a) + set(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) + set(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) + set(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) + set(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) +else() + #TODO(intel-huying): + # Now enable csrmm function in mklml library temporarily, it will be updated as offical version later. + set(MKLML_VER + "csrmm_mklml_lnx_2019.0.5" + CACHE STRING "" FORCE) + set(MKLML_URL + "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" + CACHE STRING "" FORCE) + set(MKLML_URL_MD5 bc6a7faea6a2a9ad31752386f3ae87da) + set(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) + set(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) + set(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) + set(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) +endif() -SET(MKLML_PROJECT "extern_mklml") -MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") -SET(MKLML_PREFIX_DIR ${THIRD_PARTY_PATH}/mklml) -SET(MKLML_SOURCE_DIR ${THIRD_PARTY_PATH}/mklml/src/extern_mklml) +set(MKLML_PROJECT "extern_mklml") +message(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") +set(MKLML_PREFIX_DIR ${THIRD_PARTY_PATH}/mklml) +set(MKLML_SOURCE_DIR ${THIRD_PARTY_PATH}/mklml/src/extern_mklml) -# Ninja Generator can not establish the correct dependency relationship between the imported library with target, +# Ninja Generator can not establish the correct dependency relationship between the imported library with target, # the product file in the ExternalProject need to be specified manually, please refer to # https://stackoverflow.com/questions/54866067/cmake-and-ninja-missing-and-no-known-rule-to-make-it # It is the same to all other ExternalProject. ExternalProject_Add( - ${MKLML_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - URL ${MKLML_URL} - URL_MD5 ${MKLML_URL_MD5} - PREFIX ${MKLML_PREFIX_DIR} - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${MKLML_SOURCE_DIR}/include ${MKLML_INC_DIR} && - ${CMAKE_COMMAND} -E copy_directory ${MKLML_SOURCE_DIR}/lib ${MKLML_LIB_DIR} - BUILD_BYPRODUCTS ${MKLML_LIB} - BUILD_BYPRODUCTS ${MKLML_IOMP_LIB} -) + ${MKLML_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${MKLML_URL} + URL_MD5 ${MKLML_URL_MD5} + PREFIX ${MKLML_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory ${MKLML_SOURCE_DIR}/include + ${MKLML_INC_DIR} && ${CMAKE_COMMAND} -E copy_directory + ${MKLML_SOURCE_DIR}/lib ${MKLML_LIB_DIR} + BUILD_BYPRODUCTS ${MKLML_LIB} + BUILD_BYPRODUCTS ${MKLML_IOMP_LIB}) -INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) +include_directories(${MKLML_INC_DIR}) -ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) -ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) +add_library(mklml SHARED IMPORTED GLOBAL) +set_property(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) +add_dependencies(mklml ${MKLML_PROJECT}) diff --git a/cmake/external/onnxruntime.cmake b/cmake/external/onnxruntime.cmake index 2162f87812d13..9ace4caafd12a 100644 --- a/cmake/external/onnxruntime.cmake +++ b/cmake/external/onnxruntime.cmake @@ -12,83 +12,114 @@ # See the License for the specific language governing permissions and # limitations under the License. -if (NOT WITH_ONNXRUNTIME) +if(NOT WITH_ONNXRUNTIME) return() -endif () +endif() -if (WITH_ARM) +if(WITH_ARM) message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu") return() -endif () +endif() -INCLUDE(ExternalProject) +include(ExternalProject) add_definitions(-DPADDLE_WITH_ONNXRUNTIME) -SET(ONNXRUNTIME_PROJECT "extern_onnxruntime") -SET(ONNXRUNTIME_PREFIX_DIR ${THIRD_PARTY_PATH}/onnxruntime) -SET(ONNXRUNTIME_SOURCE_DIR ${THIRD_PARTY_PATH}/onnxruntime/src/${ONNXRUNTIME_PROJECT}) -SET(ONNXRUNTIME_INSTALL_DIR ${THIRD_PARTY_PATH}/install/onnxruntime) -SET(ONNXRUNTIME_INC_DIR "${ONNXRUNTIME_INSTALL_DIR}/include" CACHE PATH "onnxruntime include directory." FORCE) -SET(ONNXRUNTIME_LIB_DIR "${ONNXRUNTIME_INSTALL_DIR}/lib" CACHE PATH "onnxruntime lib directory." FORCE) -SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${ONNXRUNTIME_LIB_DIR}") - +set(ONNXRUNTIME_PROJECT "extern_onnxruntime") +set(ONNXRUNTIME_PREFIX_DIR ${THIRD_PARTY_PATH}/onnxruntime) +set(ONNXRUNTIME_SOURCE_DIR + ${THIRD_PARTY_PATH}/onnxruntime/src/${ONNXRUNTIME_PROJECT}) +set(ONNXRUNTIME_INSTALL_DIR ${THIRD_PARTY_PATH}/install/onnxruntime) +set(ONNXRUNTIME_INC_DIR + "${ONNXRUNTIME_INSTALL_DIR}/include" + CACHE PATH "onnxruntime include directory." FORCE) +set(ONNXRUNTIME_LIB_DIR + "${ONNXRUNTIME_INSTALL_DIR}/lib" + CACHE PATH "onnxruntime lib directory." FORCE) +set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${ONNXRUNTIME_LIB_DIR}") -if (WIN32) - SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-win-x64-1.10.0.zip") -elseif (APPLE) - SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-osx-x86_64-1.10.0.tgz") -else () - SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-linux-x64-1.10.0.tgz") +if(WIN32) + set(ONNXRUNTIME_URL + "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-win-x64-1.10.0.zip" + ) +elseif(APPLE) + set(ONNXRUNTIME_URL + "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-osx-x86_64-1.10.0.tgz" + ) +else() + set(ONNXRUNTIME_URL + "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-linux-x64-1.10.0.tgz" + ) endif() +include_directories(${ONNXRUNTIME_INC_DIR} +)# For ONNXRUNTIME code to include internal headers. +if(WIN32) + set(ONNXRUNTIME_SOURCE_LIB + "${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll" + CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + set(ONNXRUNTIME_SHARED_LIB + "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.dll" + CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) + set(ONNXRUNTIME_LIB + "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.lib" + CACHE FILEPATH "ONNXRUNTIME static library." FORCE) +elseif(APPLE) + set(ONNXRUNTIME_SOURCE_LIB + "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.1.10.0.dylib" + CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + set(ONNXRUNTIME_LIB + "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.1.10.0.dylib" + CACHE FILEPATH "ONNXRUNTIME static library." FORCE) + set(ONNXRUNTIME_SHARED_LIB + ${ONNXRUNTIME_LIB} + CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) +else() + set(ONNXRUNTIME_SOURCE_LIB + "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.so.1.10.0" + CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + set(ONNXRUNTIME_LIB + "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.so.1.10.0" + CACHE FILEPATH "ONNXRUNTIME static library." FORCE) + set(ONNXRUNTIME_SHARED_LIB + ${ONNXRUNTIME_LIB} + CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) +endif() -INCLUDE_DIRECTORIES(${ONNXRUNTIME_INC_DIR}) # For ONNXRUNTIME code to include internal headers. -if (WIN32) - SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) - SET(ONNXRUNTIME_SHARED_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) - SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.lib" CACHE FILEPATH "ONNXRUNTIME static library." FORCE) -elseif (APPLE) - SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) - SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME static library." FORCE) - SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) -else () - SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) - SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME static library." FORCE) - SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) -endif () - -if (WIN32) +if(WIN32) ExternalProject_Add( - ${ONNXRUNTIME_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - URL ${ONNXRUNTIME_URL} - PREFIX ${ONNXRUNTIME_PREFIX_DIR} - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_SHARED_LIB} && - ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.lib ${ONNXRUNTIME_LIB} && - ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR} - BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB} - ) -else () + ${ONNXRUNTIME_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${ONNXRUNTIME_URL} + PREFIX ${ONNXRUNTIME_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} + ${ONNXRUNTIME_SHARED_LIB} && ${CMAKE_COMMAND} -E copy + ${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.lib ${ONNXRUNTIME_LIB} && + ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include + ${ONNXRUNTIME_INC_DIR} + BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB}) +else() ExternalProject_Add( ${ONNXRUNTIME_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} - URL ${ONNXRUNTIME_URL} - PREFIX ${ONNXRUNTIME_PREFIX_DIR} - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_LIB} && - ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR} - BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB} - ) + URL ${ONNXRUNTIME_URL} + PREFIX ${ONNXRUNTIME_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_LIB} && + ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include + ${ONNXRUNTIME_INC_DIR} + BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB}) endif() -ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${ONNXRUNTIME_LIB}) -ADD_DEPENDENCIES(onnxruntime ${ONNXRUNTIME_PROJECT}) +add_library(onnxruntime STATIC IMPORTED GLOBAL) +set_property(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${ONNXRUNTIME_LIB}) +add_dependencies(onnxruntime ${ONNXRUNTIME_PROJECT}) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index b099831738599..1cccfb86f4208 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -12,80 +12,84 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(CBLAS_PREFIX_DIR ${THIRD_PARTY_PATH}/openblas) -SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) -SET(CBLAS_REPOSITORY ${GIT_URL}/xianyi/OpenBLAS.git) -SET(CBLAS_TAG v0.3.7) +set(CBLAS_PREFIX_DIR ${THIRD_PARTY_PATH}/openblas) +set(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) +set(CBLAS_REPOSITORY ${GIT_URL}/xianyi/OpenBLAS.git) +set(CBLAS_TAG v0.3.7) if(APPLE AND WITH_ARM) - SET(CBLAS_TAG v0.3.13) + set(CBLAS_TAG v0.3.13) endif() if(WITH_MIPS) - SET(CBLAS_TAG v0.3.13) + set(CBLAS_TAG v0.3.13) endif() -IF(NOT WIN32) - SET(CBLAS_LIBRARIES - "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" - CACHE FILEPATH "openblas library." FORCE) - SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE) - SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") +if(NOT WIN32) + set(CBLAS_LIBRARIES + "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" + CACHE FILEPATH "openblas library." FORCE) + set(CBLAS_INC_DIR + "${CBLAS_INSTALL_DIR}/include" + CACHE PATH "openblas include directory." FORCE) + set(OPENBLAS_CC + "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") - IF(APPLE) - SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") - ENDIF() - SET(OPTIONAL_ARGS "") - IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") - SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) - ENDIF() + if(APPLE) + set(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") + endif() + set(OPTIONAL_ARGS "") + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") + set(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) + endif() - SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) - ExternalProject_Add( - extern_openblas - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${CBLAS_REPOSITORY} - GIT_TAG ${CBLAS_TAG} - PREFIX ${CBLAS_PREFIX_DIR} - INSTALL_DIR ${CBLAS_INSTALL_DIR} - BUILD_IN_SOURCE 1 - BUILD_COMMAND make -j$(nproc) ${COMMON_ARGS} ${OPTIONAL_ARGS} - INSTALL_COMMAND make install NO_SHARED=1 NO_LAPACK=1 PREFIX= - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_BYPRODUCTS ${CBLAS_LIBRARIES} - ) -ELSE(NOT WIN32) - SET(CBLAS_LIBRARIES - "${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" - CACHE FILEPATH "openblas library." FORCE) - SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include/openblas" CACHE PATH "openblas include directory." FORCE) - ExternalProject_Add( - extern_openblas - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY ${CBLAS_REPOSITORY} - GIT_TAG ${CBLAS_TAG} - PREFIX ${CBLAS_PREFIX_DIR} - INSTALL_DIR ${CBLAS_INSTALL_DIR} - BUILD_IN_SOURCE 0 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DBUILD_SHARED_LIBS=ON - -DMSVC_STATIC_CRT=${MSVC_STATIC_CRT} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - # ninja need to know where openblas.lib comes from - BUILD_BYPRODUCTS ${CBLAS_LIBRARIES} - ) - SET(OPENBLAS_SHARED_LIB ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX}) -ENDIF(NOT WIN32) + set(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) + ExternalProject_Add( + extern_openblas + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${CBLAS_REPOSITORY} + GIT_TAG ${CBLAS_TAG} + PREFIX ${CBLAS_PREFIX_DIR} + INSTALL_DIR ${CBLAS_INSTALL_DIR} + BUILD_IN_SOURCE 1 + BUILD_COMMAND make -j$(nproc) ${COMMON_ARGS} ${OPTIONAL_ARGS} + INSTALL_COMMAND make install NO_SHARED=1 NO_LAPACK=1 PREFIX= + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_BYPRODUCTS ${CBLAS_LIBRARIES}) +else(NOT WIN32) + set(CBLAS_LIBRARIES + "${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" + CACHE FILEPATH "openblas library." FORCE) + set(CBLAS_INC_DIR + "${CBLAS_INSTALL_DIR}/include/openblas" + CACHE PATH "openblas include directory." FORCE) + ExternalProject_Add( + extern_openblas + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY ${CBLAS_REPOSITORY} + GIT_TAG ${CBLAS_TAG} + PREFIX ${CBLAS_PREFIX_DIR} + INSTALL_DIR ${CBLAS_INSTALL_DIR} + BUILD_IN_SOURCE 0 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DBUILD_SHARED_LIBS=ON + -DMSVC_STATIC_CRT=${MSVC_STATIC_CRT} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + # ninja need to know where openblas.lib comes from + BUILD_BYPRODUCTS ${CBLAS_LIBRARIES}) + set(OPENBLAS_SHARED_LIB + ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif(NOT WIN32) diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake index 2fc22578cae9d..8252b2a73e943 100644 --- a/cmake/external/paddle2onnx.cmake +++ b/cmake/external/paddle2onnx.cmake @@ -16,84 +16,91 @@ if(NOT WITH_ONNXRUNTIME) return() endif() -if (WITH_ARM) +if(WITH_ARM) message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu") return() -endif () +endif() -INCLUDE(ExternalProject) +include(ExternalProject) -SET(PADDLE2ONNX_PROJECT "extern_paddle2onnx") -SET(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx) -SET(PADDLE2ONNX_INSTALL_DIR ${THIRD_PARTY_PATH}/install/paddle2onnx) -SET(PADDLE2ONNX_INC_DIR "${PADDLE2ONNX_INSTALL_DIR}/include" CACHE PATH "paddle2onnx include directory." FORCE) -SET(PADDLE2ONNX_REPOSITORY ${GIT_URL}/PaddlePaddle/Paddle2ONNX.git) -SET(PADDLE2ONNX_TAG cpp) -SET(LIBDIR "lib") -SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}") +set(PADDLE2ONNX_PROJECT "extern_paddle2onnx") +set(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx) +set(PADDLE2ONNX_INSTALL_DIR ${THIRD_PARTY_PATH}/install/paddle2onnx) +set(PADDLE2ONNX_INC_DIR + "${PADDLE2ONNX_INSTALL_DIR}/include" + CACHE PATH "paddle2onnx include directory." FORCE) +set(PADDLE2ONNX_REPOSITORY ${GIT_URL}/PaddlePaddle/Paddle2ONNX.git) +set(PADDLE2ONNX_TAG cpp) +set(LIBDIR "lib") +set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" + "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}") -INCLUDE_DIRECTORIES(${PADDLE2ONNX_INC_DIR}) # For PADDLE2ONNX code to include internal headers. +include_directories(${PADDLE2ONNX_INC_DIR} +)# For PADDLE2ONNX code to include internal headers. if(WIN32) - SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.lib" CACHE FILEPATH "paddle2onnx static library." FORCE) - SET(PADDLE2ONNX_SHARED_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.dll" CACHE FILEPATH "paddle2onnx shared library." FORCE) + set(PADDLE2ONNX_LIB + "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.lib" + CACHE FILEPATH "paddle2onnx static library." FORCE) + set(PADDLE2ONNX_SHARED_LIB + "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.dll" + CACHE FILEPATH "paddle2onnx shared library." FORCE) elseif(APPLE) - SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.dylib" CACHE FILEPATH "PADDLE2ONNX library." FORCE) + set(PADDLE2ONNX_LIB + "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.dylib" + CACHE FILEPATH "PADDLE2ONNX library." FORCE) else() - SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.so" CACHE FILEPATH "PADDLE2ONNX library." FORCE) + set(PADDLE2ONNX_LIB + "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.so" + CACHE FILEPATH "PADDLE2ONNX library." FORCE) endif(WIN32) - # The protoc path is required to compile onnx. string(REPLACE "/" ";" PROTOC_BIN_PATH ${PROTOBUF_PROTOC_EXECUTABLE}) list(POP_BACK PROTOC_BIN_PATH) list(JOIN PROTOC_BIN_PATH "/" PROTOC_BIN_PATH) - set(PADDLE2ONNX_OPTIONAL_ARGS - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_STANDARD=14 - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH} - -DWITH_STATIC=OFF - -DMSVC_STATIC_CRT=${MSVC_STATIC_CRT} - -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} -) - -if (WITH_PYTHON) - set(PADDLE2ONNX_OPTIONAL_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS} - -DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE} - -DPYTHON_INCLUDE_DIR:PATH=${PYTHON_INCLUDE_DIR} - -DPYTHON_LIBRARY:FILEPATH=${PYTHON_LIBRARY} - ) -endif () + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_STANDARD=14 + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH} + -DWITH_STATIC=OFF + -DMSVC_STATIC_CRT=${MSVC_STATIC_CRT} + -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS}) +if(WITH_PYTHON) + set(PADDLE2ONNX_OPTIONAL_ARGS + ${PADDLE2ONNX_OPTIONAL_ARGS} + -DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE} + -DPYTHON_INCLUDE_DIR:PATH=${PYTHON_INCLUDE_DIR} + -DPYTHON_LIBRARY:FILEPATH=${PYTHON_LIBRARY}) +endif() ExternalProject_Add( - ${PADDLE2ONNX_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${PADDLE2ONNX_REPOSITORY} - GIT_TAG ${PADDLE2ONNX_TAG} - DEPENDS protobuf - PREFIX ${PADDLE2ONNX_PREFIX_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE2ONNX_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${PADDLE2ONNX_LIB} -) + ${PADDLE2ONNX_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${PADDLE2ONNX_REPOSITORY} + GIT_TAG ${PADDLE2ONNX_TAG} + DEPENDS protobuf + PREFIX ${PADDLE2ONNX_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE2ONNX_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${PADDLE2ONNX_LIB}) -ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE2ONNX_LIB}) -ADD_DEPENDENCIES(paddle2onnx ${PADDLE2ONNX_PROJECT}) +add_library(paddle2onnx STATIC IMPORTED GLOBAL) +set_property(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE2ONNX_LIB}) +add_dependencies(paddle2onnx ${PADDLE2ONNX_PROJECT}) diff --git a/cmake/external/pocketfft.cmake b/cmake/external/pocketfft.cmake index 7323f67d115e1..2d809bbcf03ec 100644 --- a/cmake/external/pocketfft.cmake +++ b/cmake/external/pocketfft.cmake @@ -14,30 +14,29 @@ include(ExternalProject) +set(POCKETFFT_PATH + "${THIRD_PARTY_PATH}/pocketfft" + CACHE STRING "A path setting for external_pocketfft path.") +set(POCKETFFT_PREFIX_DIR ${POCKETFFT_PATH}) -set(POCKETFFT_PATH "${THIRD_PARTY_PATH}/pocketfft" CACHE STRING "A path setting for external_pocketfft path.") -set(POCKETFFT_PREFIX_DIR ${POCKETFFT_PATH}) +set(POCKETFFT_REPOSITORY https://gitlab.mpcdf.mpg.de/mtr/pocketfft.git) +set(POCKETFFT_TAG release_for_eigen) -set(POCKETFFT_REPOSITORY https://gitlab.mpcdf.mpg.de/mtr/pocketfft.git) -set(POCKETFFT_TAG release_for_eigen) - -SET(POCKETFFT_INCLUDE_DIR ${POCKETFFT_PREFIX_DIR}/src) +set(POCKETFFT_INCLUDE_DIR ${POCKETFFT_PREFIX_DIR}/src) message("POCKETFFT_INCLUDE_DIR is ${POCKETFFT_INCLUDE_DIR}") include_directories(${POCKETFFT_INCLUDE_DIR}) ExternalProject_Add( extern_pocketfft - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${POCKETFFT_REPOSITORY} - GIT_TAG ${POCKETFFT_TAG} - PREFIX ${POCKETFFT_PREFIX_DIR} - UPDATE_COMMAND "" + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${POCKETFFT_REPOSITORY} + GIT_TAG ${POCKETFFT_TAG} + PREFIX ${POCKETFFT_PREFIX_DIR} + UPDATE_COMMAND "" CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "") add_library(pocketfft INTERFACE) diff --git a/cmake/external/poplar.cmake b/cmake/external/poplar.cmake index 8b2de14e96620..7589059e7b3e7 100644 --- a/cmake/external/poplar.cmake +++ b/cmake/external/poplar.cmake @@ -14,7 +14,12 @@ macro(find_popart_version popart_version_file) file(READ ${popart_version_file} popart_version_file_content) - string(REGEX MATCH "(POPART_VERSION_STRING)[ \t\r\n](\")([0-9]+\.[0-9]+\.[0-9]+)(\\+)([A-Za-z0-9_]*)(\")" POPART_VERSION ${popart_version_file_content}) + string( + REGEX + MATCH + "(POPART_VERSION_STRING)[ \t\r\n](\")([0-9]+\.[0-9]+\.[0-9]+)(\\+)([A-Za-z0-9_]*)(\")" + POPART_VERSION + ${popart_version_file_content}) string(REPLACE "POPART_VERSION_STRING" "" POPART_VERSION "${POPART_VERSION}") string(REPLACE "\"" "" POPART_VERSION "${POPART_VERSION}") string(REPLACE " " "" POPART_VERSION "${POPART_VERSION}") @@ -28,7 +33,11 @@ endmacro() if(WITH_IPU) set(POPLAR_DIR CACHE PATH "Path to a Poplar install") set(POPART_DIR CACHE PATH "Path to a Popart install") - set(POPLAR_SDK_DIR CACHE PATH "Path to an extracted SDK archive or to a Poplar & Popart install directory (Will populate POPLAR_DIR and POPART_DIR)") + set(POPLAR_SDK_DIR + CACHE + PATH + "Path to an extracted SDK archive or to a Poplar & Popart install directory (Will populate POPLAR_DIR and POPART_DIR)" + ) # support setting SDK both from environment variable or command line arguments @@ -36,10 +45,15 @@ if(WITH_IPU) set(POPLAR_SDK_DIR $ENV{POPLAR_SDK_DIR}) endif() if(EXISTS ${POPLAR_SDK_DIR}) - execute_process(COMMAND find ${POPLAR_SDK_DIR}/ -maxdepth 1 -type d -name "popart*" - OUTPUT_VARIABLE POPART_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) - execute_process(COMMAND find ${POPLAR_SDK_DIR}/ -maxdepth 1 -type d -name "poplar-*" -o -name "poplar" - OUTPUT_VARIABLE POPLAR_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND find ${POPLAR_SDK_DIR}/ -maxdepth 1 -type d -name "popart*" + OUTPUT_VARIABLE POPART_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND find ${POPLAR_SDK_DIR}/ -maxdepth 1 -type d -name "poplar-*" -o + -name "poplar" + OUTPUT_VARIABLE POPLAR_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE) endif() if(DEFINED ENV{POPLAR_DIR}) set(POPLAR_DIR $ENV{POPLAR_DIR}) @@ -51,7 +65,10 @@ if(WITH_IPU) if(EXISTS ${POPLAR_DIR}) message("POPLAR_DIR is ${POPLAR_DIR}") if(NOT IS_DIRECTORY "${POPLAR_DIR}") - message(FATAL_ERROR "Couldn't find a \"poplar\" or \"poplar-*\" folder in '${POPLAR_SDK_DIR}'") + message( + FATAL_ERROR + "Couldn't find a \"poplar\" or \"poplar-*\" folder in '${POPLAR_SDK_DIR}'" + ) endif() list(APPEND CMAKE_PREFIX_PATH ${POPLAR_DIR}) set(ENABLE_POPLAR_CMD "source ${POPLAR_DIR}/enable.sh") @@ -60,12 +77,16 @@ if(WITH_IPU) link_directories("${POPLAR_DIR}/lib") endif() if(NOT poplar_FOUND) - message(FATAL_ERROR "You must provide a path to a Poplar install using -DPOPLAR_DIR=/path/to/popart/build/install") + message( + FATAL_ERROR + "You must provide a path to a Poplar install using -DPOPLAR_DIR=/path/to/popart/build/install" + ) endif() if(EXISTS ${POPART_DIR}) message("POPART_DIR is ${POPART_DIR}") if(NOT IS_DIRECTORY "${POPART_DIR}") - message(FATAL_ERROR "Couldn't find a \"popart*\" folder in '${POPLAR_SDK_DIR}'") + message( + FATAL_ERROR "Couldn't find a \"popart*\" folder in '${POPLAR_SDK_DIR}'") endif() list(APPEND CMAKE_PREFIX_PATH ${POPART_DIR}) set(ENABLE_POPART_CMD "source ${POPART_DIR}/enable.sh") @@ -74,7 +95,10 @@ if(WITH_IPU) link_directories("${POPART_DIR}/lib") endif() if(NOT popart_FOUND) - message(FATAL_ERROR "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build") + message( + FATAL_ERROR + "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build" + ) endif() find_popart_version("${POPART_DIR}/include/popart/version.hpp") diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 3a59ea6bc92a2..1368081b58fda 100755 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -12,304 +12,346 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp -IF(NOT WIN32) - FIND_PACKAGE(Protobuf QUIET) -ENDIF(NOT WIN32) +if(NOT WIN32) + find_package(Protobuf QUIET) +endif(NOT WIN32) -UNSET_VAR(PROTOBUF_INCLUDE_DIR) -UNSET_VAR(PROTOBUF_FOUND) -UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE) -UNSET_VAR(PROTOBUF_PROTOC_LIBRARY) -UNSET_VAR(PROTOBUF_LITE_LIBRARY) -UNSET_VAR(PROTOBUF_LIBRARY) -UNSET_VAR(PROTOBUF_INCLUDE_DIR) -UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) +unset_var(PROTOBUF_INCLUDE_DIR) +unset_var(PROTOBUF_FOUND) +unset_var(PROTOBUF_PROTOC_EXECUTABLE) +unset_var(PROTOBUF_PROTOC_LIBRARY) +unset_var(PROTOBUF_LITE_LIBRARY) +unset_var(PROTOBUF_LIBRARY) +unset_var(PROTOBUF_INCLUDE_DIR) +unset_var(Protobuf_PROTOC_EXECUTABLE) function(protobuf_generate_python SRCS) - # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake - if(NOT ARGN) - message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") - return() - endif() - - if(PROTOBUF_GENERATE_CPP_APPEND_PATH) - # Create an include path for each file specified - foreach(FIL ${ARGN}) - get_filename_component(ABS_FIL ${FIL} ABSOLUTE) - get_filename_component(ABS_PATH ${ABS_FIL} PATH) - list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) - if(${_contains_already} EQUAL -1) - list(APPEND _protobuf_include_path -I ${ABS_PATH}) - endif() - endforeach() - else() - set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) - endif() - if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) - set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") - endif() + # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake + if(NOT ARGN) + message( + SEND_ERROR + "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() - if(DEFINED Protobuf_IMPORT_DIRS) - foreach(DIR ${Protobuf_IMPORT_DIRS}) - get_filename_component(ABS_PATH ${DIR} ABSOLUTE) - list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) - if(${_contains_already} EQUAL -1) - list(APPEND _protobuf_include_path -I ${ABS_PATH}) - endif() - endforeach() - endif() - - set(${SRCS}) + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified foreach(FIL ${ARGN}) - get_filename_component(ABS_FIL ${FIL} ABSOLUTE) - get_filename_component(FIL_WE ${FIL} NAME_WE) - if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) - get_filename_component(FIL_DIR ${FIL} DIRECTORY) - if(FIL_DIR) - set(FIL_WE "${FIL_DIR}/${FIL_WE}") - endif() - endif() - list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") - add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} - DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE} - COMMENT "Running Python protocol buffer compiler on ${FIL}" - VERBATIM ) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() + + if(DEFINED Protobuf_IMPORT_DIRS) + foreach(DIR ${Protobuf_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() endforeach() + endif() - set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${SRCS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) + get_filename_component(FIL_DIR ${FIL} DIRECTORY) + if(FIL_DIR) + set(FIL_WE "${FIL_DIR}/${FIL_WE}") + endif() + endif() + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out + ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE} + COMMENT "Running Python protocol buffer compiler on ${FIL}" + VERBATIM) + endforeach() + + set(${SRCS} + ${${SRCS}} + PARENT_SCOPE) endfunction() # Print and set the protobuf library information, # finish this cmake process and exit from this file. macro(PROMPT_PROTOBUF_LIB) - SET(protobuf_DEPS ${ARGN}) + set(protobuf_DEPS ${ARGN}) - MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}") - MESSAGE(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}") - MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}") - MESSAGE(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}") - MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}") - INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) + message(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}") + message(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}") + message(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}") + message(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}") + message(STATUS "Protobuf version: ${PROTOBUF_VERSION}") + include_directories(${PROTOBUF_INCLUDE_DIR}) - # Assuming that all the protobuf libraries are of the same type. - IF(${PROTOBUF_LIBRARY} MATCHES ${CMAKE_STATIC_LIBRARY_SUFFIX}) - SET(protobuf_LIBTYPE STATIC) - ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$") - SET(protobuf_LIBTYPE SHARED) - ELSE() - MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}") - ENDIF() + # Assuming that all the protobuf libraries are of the same type. + if(${PROTOBUF_LIBRARY} MATCHES ${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(protobuf_LIBTYPE STATIC) + elseif(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$") + set(protobuf_LIBTYPE SHARED) + else() + message(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}") + endif() - ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL) - SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY}) + add_library(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL) + set_property(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY}) - ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL) - SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY}) + add_library(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL) + set_property(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION + ${PROTOBUF_LITE_LIBRARY}) - ADD_LIBRARY(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL) - SET_PROPERTY(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY}) + add_library(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL) + set_property(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY}) - ADD_EXECUTABLE(protoc IMPORTED GLOBAL) - SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE}) - # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. - # make `protobuf_generate_cpp` happy. - SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) + add_executable(protoc IMPORTED GLOBAL) + set_property(TARGET protoc PROPERTY IMPORTED_LOCATION + ${PROTOBUF_PROTOC_EXECUTABLE}) + # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. + # make `protobuf_generate_cpp` happy. + set(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) - FOREACH(dep ${protobuf_DEPS}) - ADD_DEPENDENCIES(protobuf ${dep}) - ADD_DEPENDENCIES(protobuf_lite ${dep}) - ADD_DEPENDENCIES(libprotoc ${dep}) - ADD_DEPENDENCIES(protoc ${dep}) - ENDFOREACH() + foreach(dep ${protobuf_DEPS}) + add_dependencies(protobuf ${dep}) + add_dependencies(protobuf_lite ${dep}) + add_dependencies(libprotoc ${dep}) + add_dependencies(protoc ${dep}) + endforeach() - RETURN() + return() endmacro() macro(SET_PROTOBUF_VERSION) - EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION) - STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}") + exec_program( + ${PROTOBUF_PROTOC_EXECUTABLE} ARGS + --version + OUTPUT_VARIABLE PROTOBUF_VERSION) + string(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}") endmacro() -set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") -IF (WIN32) - SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf) -ENDIF(WIN32) +set(PROTOBUF_ROOT + "" + CACHE PATH "Folder contains protobuf") +if(WIN32) + set(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf) +endif(WIN32) -if (NOT "${PROTOBUF_ROOT}" STREQUAL "") - find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH) - find_library(PROTOBUF_LIBRARY protobuf libprotobuf.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) - find_library(PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) - find_library(PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) - find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH) - if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) - SET(PROTOBUF_FOUND true) - message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") - SET_PROTOBUF_VERSION() - PROMPT_PROTOBUF_LIB() - endif() +if(NOT "${PROTOBUF_ROOT}" STREQUAL "") + find_path( + PROTOBUF_INCLUDE_DIR google/protobuf/message.h + PATHS ${PROTOBUF_ROOT}/include + NO_DEFAULT_PATH) + find_library( + PROTOBUF_LIBRARY protobuf libprotobuf.lib + PATHS ${PROTOBUF_ROOT}/lib + NO_DEFAULT_PATH) + find_library( + PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib + PATHS ${PROTOBUF_ROOT}/lib + NO_DEFAULT_PATH) + find_library( + PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib + PATHS ${PROTOBUF_ROOT}/lib + NO_DEFAULT_PATH) + find_program( + PROTOBUF_PROTOC_EXECUTABLE protoc + PATHS ${PROTOBUF_ROOT}/bin + NO_DEFAULT_PATH) + if(PROTOBUF_INCLUDE_DIR + AND PROTOBUF_LIBRARY + AND PROTOBUF_LITE_LIBRARY + AND PROTOBUF_PROTOC_LIBRARY + AND PROTOBUF_PROTOC_EXECUTABLE) + set(PROTOBUF_FOUND true) + message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") + set_protobuf_version() + prompt_protobuf_lib() + endif() endif() -FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) - STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}") - SET(PROTOBUF_PREFIX_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME}) - SET(PROTOBUF_SOURCE_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME}/src/${TARGET_NAME}) - SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME}) - - SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) - SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) - SET(${TARGET_NAME}_LITE_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}" - PARENT_SCOPE) - SET(${TARGET_NAME}_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}" - PARENT_SCOPE) - SET(${TARGET_NAME}_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}" - PARENT_SCOPE) - SET(${TARGET_NAME}_PROTOC_EXECUTABLE - "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}" - PARENT_SCOPE) +function(build_protobuf TARGET_NAME BUILD_FOR_HOST) + string(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}") + set(PROTOBUF_PREFIX_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME}) + set(PROTOBUF_SOURCE_DIR + ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME}/src/${TARGET_NAME}) + set(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME}) - SET(OPTIONAL_CACHE_ARGS "") - SET(OPTIONAL_ARGS "") - IF(BUILD_FOR_HOST) - SET(OPTIONAL_ARGS "-Dprotobuf_WITH_ZLIB=OFF") - ELSE() - SET(OPTIONAL_ARGS - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" - "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" - "-Dprotobuf_WITH_ZLIB=ON" - "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}" - ${EXTERNAL_OPTIONAL_ARGS}) - SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") - ENDIF() - IF(WIN32) - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} - "-DCMAKE_GENERATOR=${CMAKE_GENERATOR}" - "-DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM}" - "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") - ENDIF() + set(${TARGET_NAME}_INCLUDE_DIR + "${PROTOBUF_INSTALL_DIR}/include" + PARENT_SCOPE) + set(PROTOBUF_INCLUDE_DIR + "${PROTOBUF_INSTALL_DIR}/include" + PARENT_SCOPE) + set(${TARGET_NAME}_LITE_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}" + PARENT_SCOPE) + set(${TARGET_NAME}_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}" + PARENT_SCOPE) + set(${TARGET_NAME}_PROTOC_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}" + PARENT_SCOPE) + set(${TARGET_NAME}_PROTOC_EXECUTABLE + "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}" + PARENT_SCOPE) + set(OPTIONAL_CACHE_ARGS "") + set(OPTIONAL_ARGS "") + if(BUILD_FOR_HOST) + set(OPTIONAL_ARGS "-Dprotobuf_WITH_ZLIB=OFF") + else() + set(OPTIONAL_ARGS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" + "-Dprotobuf_WITH_ZLIB=ON" + "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}" + ${EXTERNAL_OPTIONAL_ARGS}) + set(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") + endif() + if(WIN32) + set(OPTIONAL_ARGS + ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR=${CMAKE_GENERATOR}" + "-DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM}" + "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") + endif() - if(WITH_ONNXRUNTIME) - SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) - SET(PROTOBUF_TAG v3.18.0) - elseif(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) - SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) - SET(PROTOBUF_TAG v3.8.0) - elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) - SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) - SET(PROTOBUF_TAG v3.8.0) - elseif(WITH_IPU) - SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) - SET(PROTOBUF_TAG d750fbf648256c7c631f51ffdbf67d7c18b0114e) - elseif(WIN32) - SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) - # Change the tag to support building with vs2019 - SET(PROTOBUF_TAG 01a05a53f40ca2ac5f0af10c6cc0810bee39b792) - else() - SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) - SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) - endif() - if(WITH_ARM_BRPC) - SET(ARM_PROTOBUF_URL "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_protobuf.tar.gz" CACHE STRING "" FORCE) - FILE(WRITE ${PROTOBUF_SOURCE_DIR}/CMakeLists.txt - "PROJECT(ARM_PROTOBUF)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY arm_protobuf/bin arm_protobuf/include arm_protobuf/lib \n" - " DESTINATION . USE_SOURCE_PERMISSIONS)\n") - ExternalProject_Add( - ${TARGET_NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - PREFIX ${PROTOBUF_PREFIX_DIR} - DOWNLOAD_DIR ${PROTOBUF_SOURCE_DIR} - DOWNLOAD_COMMAND rm -rf arm_protobuf.tar.gz - && wget --no-check-certificate ${ARM_PROTOBUF_URL} - && tar zxvf arm_protobuf.tar.gz - #DOWNLOAD_COMMAND cp /home/wangbin44/Paddle/build/arm_protobuf.tar.gz . - # && tar zxvf arm_protobuf.tar.gz - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX} - BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX} - BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX} - BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX} - ) - else() - ExternalProject_Add( - ${TARGET_NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${PROTOBUF_REPOSITORY} - GIT_TAG ${PROTOBUF_TAG} - PREFIX ${PROTOBUF_PREFIX_DIR} - UPDATE_COMMAND "" - DEPENDS zlib - CONFIGURE_COMMAND - ${CMAKE_COMMAND} ${PROTOBUF_SOURCE_DIR}/cmake - ${OPTIONAL_ARGS} - -Dprotobuf_BUILD_TESTS=OFF - -DCMAKE_SKIP_RPATH=ON - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=lib - -DBUILD_SHARED_LIBS=OFF - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - ${OPTIONAL_CACHE_ARGS} - BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX} - BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX} - BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX} - BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX} - ) - endif() -ENDFUNCTION() + if(WITH_ONNXRUNTIME) + set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + set(PROTOBUF_TAG v3.18.0) + elseif(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + set(PROTOBUF_TAG v3.8.0) + elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) + set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + set(PROTOBUF_TAG v3.8.0) + elseif(WITH_IPU) + set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + set(PROTOBUF_TAG d750fbf648256c7c631f51ffdbf67d7c18b0114e) + elseif(WIN32) + set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + # Change the tag to support building with vs2019 + set(PROTOBUF_TAG 01a05a53f40ca2ac5f0af10c6cc0810bee39b792) + else() + set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + set(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) + endif() + if(WITH_ARM_BRPC) + set(ARM_PROTOBUF_URL + "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_protobuf.tar.gz" + CACHE STRING "" FORCE) + file( + WRITE ${PROTOBUF_SOURCE_DIR}/CMakeLists.txt + "PROJECT(ARM_PROTOBUF)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY arm_protobuf/bin arm_protobuf/include arm_protobuf/lib \n" + " DESTINATION . USE_SOURCE_PERMISSIONS)\n") + ExternalProject_Add( + ${TARGET_NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + PREFIX ${PROTOBUF_PREFIX_DIR} + DOWNLOAD_DIR ${PROTOBUF_SOURCE_DIR} + DOWNLOAD_COMMAND rm -rf arm_protobuf.tar.gz && wget --no-check-certificate + ${ARM_PROTOBUF_URL} && tar zxvf arm_protobuf.tar.gz + #DOWNLOAD_COMMAND cp /home/wangbin44/Paddle/build/arm_protobuf.tar.gz . + # && tar zxvf arm_protobuf.tar.gz + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS + ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS + ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS + ${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS + ${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}) + else() + ExternalProject_Add( + ${TARGET_NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${PROTOBUF_REPOSITORY} + GIT_TAG ${PROTOBUF_TAG} + PREFIX ${PROTOBUF_PREFIX_DIR} + UPDATE_COMMAND "" + DEPENDS zlib + CONFIGURE_COMMAND + ${CMAKE_COMMAND} ${PROTOBUF_SOURCE_DIR}/cmake ${OPTIONAL_ARGS} + -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_SKIP_RPATH=ON + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=OFF + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + ${OPTIONAL_CACHE_ARGS} + BUILD_BYPRODUCTS + ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS + ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS + ${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS + ${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}) + endif() +endfunction() if(WITH_ONNXRUNTIME) - SET(PROTOBUF_VERSION 3.18.0) + set(PROTOBUF_VERSION 3.18.0) elseif(WITH_ASCEND OR WITH_ASCEND_CL) - SET(PROTOBUF_VERSION 3.8.0) + set(PROTOBUF_VERSION 3.8.0) elseif(WITH_IPU) - SET(PROTOBUF_VERSION 3.6.1) + set(PROTOBUF_VERSION 3.6.1) elseif(WITH_ARM_BRPC) - SET(PROTOBUF_VERSION 3.7.1-baidu-ee-common) + set(PROTOBUF_VERSION 3.7.1-baidu-ee-common) else() - SET(PROTOBUF_VERSION 3.1.0) + set(PROTOBUF_VERSION 3.1.0) endif() -IF(NOT PROTOBUF_FOUND) - build_protobuf(extern_protobuf FALSE) +if(NOT PROTOBUF_FOUND) + build_protobuf(extern_protobuf FALSE) - SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR} - CACHE PATH "protobuf include directory." FORCE) - SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY} - CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY} - CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} - CACHE FILEPATH "protoc library." FORCE) + set(PROTOBUF_INCLUDE_DIR + ${extern_protobuf_INCLUDE_DIR} + CACHE PATH "protobuf include directory." FORCE) + set(PROTOBUF_LITE_LIBRARY + ${extern_protobuf_LITE_LIBRARY} + CACHE FILEPATH "protobuf lite library." FORCE) + set(PROTOBUF_LIBRARY + ${extern_protobuf_LIBRARY} + CACHE FILEPATH "protobuf library." FORCE) + set(PROTOBUF_PROTOC_LIBRARY + ${extern_protobuf_PROTOC_LIBRARY} + CACHE FILEPATH "protoc library." FORCE) - SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} - CACHE FILEPATH "protobuf executable." FORCE) - # `EXTERN_PROTOBUF_DEPEND` used in cmake function `proto_library` to ensure - # `protoc.exe` existed before calling it. - set(EXTERN_PROTOBUF_DEPEND extern_protobuf) - PROMPT_PROTOBUF_LIB(extern_protobuf) -ENDIF(NOT PROTOBUF_FOUND) + set(PROTOBUF_PROTOC_EXECUTABLE + ${extern_protobuf_PROTOC_EXECUTABLE} + CACHE FILEPATH "protobuf executable." FORCE) + # `EXTERN_PROTOBUF_DEPEND` used in cmake function `proto_library` to ensure + # `protoc.exe` existed before calling it. + set(EXTERN_PROTOBUF_DEPEND extern_protobuf) + prompt_protobuf_lib(extern_protobuf) +endif(NOT PROTOBUF_FOUND) diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index 47a83d905e84f..1b1298d6c6c59 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -12,53 +12,58 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(PSLIB_PROJECT "extern_pslib") -IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(PSLIB_VER "0.1.1" CACHE STRING "" FORCE) - SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE) - SET(PSLIB_URL "https://pslib.bj.bcebos.com/pslib.tar.gz" CACHE STRING "" FORCE) -ENDIF() -MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}") -SET(PSLIB_PREFIX_DIR "${THIRD_PARTY_PATH}/pslib") -SET(PSLIB_DOWNLOAD_DIR "${PSLIB_PREFIX_DIR}/src/${PSLIB_PROJECT}") -SET(PSLIB_DST_DIR "pslib") -SET(PSLIB_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") -SET(PSLIB_INSTALL_DIR ${PSLIB_INSTALL_ROOT}/${PSLIB_DST_DIR}) -SET(PSLIB_ROOT ${PSLIB_INSTALL_DIR}) -SET(PSLIB_INC_DIR ${PSLIB_ROOT}/include) -SET(PSLIB_LIB_DIR ${PSLIB_ROOT}/lib) -SET(PSLIB_LIB ${PSLIB_LIB_DIR}/libps.so) -SET(PSLIB_VERSION_PY ${PSLIB_DOWNLOAD_DIR}/pslib/version.py) -SET(PSLIB_IOMP_LIB ${PSLIB_LIB_DIR}/libiomp5.so) #todo what is this -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_ROOT}/lib") +set(PSLIB_PROJECT "extern_pslib") +if((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL)) + message(STATUS "use pre defined download url") + set(PSLIB_VER + "0.1.1" + CACHE STRING "" FORCE) + set(PSLIB_NAME + "pslib" + CACHE STRING "" FORCE) + set(PSLIB_URL + "https://pslib.bj.bcebos.com/pslib.tar.gz" + CACHE STRING "" FORCE) +endif() +message(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}") +set(PSLIB_PREFIX_DIR "${THIRD_PARTY_PATH}/pslib") +set(PSLIB_DOWNLOAD_DIR "${PSLIB_PREFIX_DIR}/src/${PSLIB_PROJECT}") +set(PSLIB_DST_DIR "pslib") +set(PSLIB_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +set(PSLIB_INSTALL_DIR ${PSLIB_INSTALL_ROOT}/${PSLIB_DST_DIR}) +set(PSLIB_ROOT ${PSLIB_INSTALL_DIR}) +set(PSLIB_INC_DIR ${PSLIB_ROOT}/include) +set(PSLIB_LIB_DIR ${PSLIB_ROOT}/lib) +set(PSLIB_LIB ${PSLIB_LIB_DIR}/libps.so) +set(PSLIB_VERSION_PY ${PSLIB_DOWNLOAD_DIR}/pslib/version.py) +set(PSLIB_IOMP_LIB ${PSLIB_LIB_DIR}/libiomp5.so) #todo what is this +set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_ROOT}/lib") -INCLUDE_DIRECTORIES(${PSLIB_INC_DIR}) +include_directories(${PSLIB_INC_DIR}) -FILE(WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(PSLIB)\n" - "cmake_minimum_required(VERSION 3.0)\n" +file( + WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(PSLIB)\n" "cmake_minimum_required(VERSION 3.0)\n" "install(DIRECTORY ${PSLIB_NAME}/include ${PSLIB_NAME}/lib \n" " DESTINATION ${PSLIB_DST_DIR})\n") ExternalProject_Add( - ${PSLIB_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${PSLIB_PREFIX_DIR} - DOWNLOAD_DIR ${PSLIB_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_NAME}.tar.gz - && tar zxvf ${PSLIB_NAME}.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT} - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT} - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${PSLIB_LIB} -) + ${PSLIB_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PSLIB_PREFIX_DIR} + DOWNLOAD_DIR ${PSLIB_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O + ${PSLIB_NAME}.tar.gz && tar zxvf ${PSLIB_NAME}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${PSLIB_LIB}) -ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) -ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) +add_library(pslib SHARED IMPORTED GLOBAL) +set_property(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) +add_dependencies(pslib ${PSLIB_PROJECT}) diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake index 27e2788aa21fe..eef91052a400e 100644 --- a/cmake/external/pslib_brpc.cmake +++ b/cmake/external/pslib_brpc.cmake @@ -12,52 +12,61 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(PSLIB_BRPC_PROJECT "extern_pslib_brpc") -IF((NOT DEFINED PSLIB_BRPC_NAME) OR (NOT DEFINED PSLIB_BRPC_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(PSLIB_BRPC_VER "0.1.0" CACHE STRING "" FORCE) - SET(PSLIB_BRPC_NAME "pslib_brpc" CACHE STRING "" FORCE) - SET(PSLIB_BRPC_URL "https://pslib.bj.bcebos.com/pslib_brpc.tar.gz" CACHE STRING "" FORCE) -ENDIF() -MESSAGE(STATUS "PSLIB_BRPC_NAME: ${PSLIB_BRPC_NAME}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}") -SET(PSLIB_BRPC_PREFIX_DIR "${THIRD_PARTY_PATH}/pslib_brpc") -SET(PSLIB_BRPC_DOWNLOAD_DIR "${PSLIB_BRPC_PREFIX_DIR}/src/${PSLIB_BRPC_PROJECT}") -SET(PSLIB_BRPC_DST_DIR "pslib_brpc") -SET(PSLIB_BRPC_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") -SET(PSLIB_BRPC_INSTALL_DIR ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR}) -SET(PSLIB_BRPC_ROOT ${PSLIB_BRPC_INSTALL_DIR}) -SET(PSLIB_BRPC_INC_DIR ${PSLIB_BRPC_ROOT}/include) -SET(PSLIB_BRPC_LIB_DIR ${PSLIB_BRPC_ROOT}/lib) -SET(PSLIB_BRPC_LIB ${PSLIB_BRPC_LIB_DIR}/libbrpc.a) -SET(PSLIB_BRPC_IOMP_LIB ${PSLIB_BRPC_LIB_DIR}/libiomp5.so) #todo what is this -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib") +set(PSLIB_BRPC_PROJECT "extern_pslib_brpc") +if((NOT DEFINED PSLIB_BRPC_NAME) OR (NOT DEFINED PSLIB_BRPC_URL)) + message(STATUS "use pre defined download url") + set(PSLIB_BRPC_VER + "0.1.0" + CACHE STRING "" FORCE) + set(PSLIB_BRPC_NAME + "pslib_brpc" + CACHE STRING "" FORCE) + set(PSLIB_BRPC_URL + "https://pslib.bj.bcebos.com/pslib_brpc.tar.gz" + CACHE STRING "" FORCE) +endif() +message( + STATUS + "PSLIB_BRPC_NAME: ${PSLIB_BRPC_NAME}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}") +set(PSLIB_BRPC_PREFIX_DIR "${THIRD_PARTY_PATH}/pslib_brpc") +set(PSLIB_BRPC_DOWNLOAD_DIR + "${PSLIB_BRPC_PREFIX_DIR}/src/${PSLIB_BRPC_PROJECT}") +set(PSLIB_BRPC_DST_DIR "pslib_brpc") +set(PSLIB_BRPC_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +set(PSLIB_BRPC_INSTALL_DIR ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR}) +set(PSLIB_BRPC_ROOT ${PSLIB_BRPC_INSTALL_DIR}) +set(PSLIB_BRPC_INC_DIR ${PSLIB_BRPC_ROOT}/include) +set(PSLIB_BRPC_LIB_DIR ${PSLIB_BRPC_ROOT}/lib) +set(PSLIB_BRPC_LIB ${PSLIB_BRPC_LIB_DIR}/libbrpc.a) +set(PSLIB_BRPC_IOMP_LIB ${PSLIB_BRPC_LIB_DIR}/libiomp5.so) #todo what is this +set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib") -INCLUDE_DIRECTORIES(${PSLIB_BRPC_INC_DIR}) +include_directories(${PSLIB_BRPC_INC_DIR}) -FILE(WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(PSLIB_BRPC)\n" - "cmake_minimum_required(VERSION 3.0)\n" +file( + WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(PSLIB_BRPC)\n" "cmake_minimum_required(VERSION 3.0)\n" "install(DIRECTORY ${PSLIB_BRPC_NAME}/include ${PSLIB_BRPC_NAME}/lib \n" " DESTINATION ${PSLIB_BRPC_DST_DIR})\n") ExternalProject_Add( - ${PSLIB_BRPC_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${PSLIB_BRPC_PREFIX_DIR} - DOWNLOAD_DIR ${PSLIB_BRPC_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_NAME}.tar.gz - && tar zxvf ${PSLIB_BRPC_NAME}.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT} - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_BRPC_INSTALL_ROOT} - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${PSLIB_BRPC_LIB} -) + ${PSLIB_BRPC_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PSLIB_BRPC_PREFIX_DIR} + DOWNLOAD_DIR ${PSLIB_BRPC_DOWNLOAD_DIR} + DOWNLOAD_COMMAND + wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O + ${PSLIB_BRPC_NAME}.tar.gz && tar zxvf ${PSLIB_BRPC_NAME}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_BRPC_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${PSLIB_BRPC_LIB}) -ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB}) -ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT}) +add_library(pslib_brpc SHARED IMPORTED GLOBAL) +set_property(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB}) +add_dependencies(pslib_brpc ${PSLIB_BRPC_PROJECT}) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index f87e73081ffb7..e236767cec156 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -14,31 +14,29 @@ include(ExternalProject) -set(PYBIND_PREFIX_DIR ${THIRD_PARTY_PATH}/pybind) -SET(PYBIND_REPOSITORY ${GIT_URL}/pybind/pybind11.git) -SET(PYBIND_TAG v2.4.3) +set(PYBIND_PREFIX_DIR ${THIRD_PARTY_PATH}/pybind) +set(PYBIND_REPOSITORY ${GIT_URL}/pybind/pybind11.git) +set(PYBIND_TAG v2.4.3) set(PYBIND_INCLUDE_DIR ${THIRD_PARTY_PATH}/pybind/src/extern_pybind/include) include_directories(${PYBIND_INCLUDE_DIR}) ExternalProject_Add( - extern_pybind - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${PYBIND_REPOSITORY} - GIT_TAG ${PYBIND_TAG} - PREFIX ${PYBIND_PREFIX_DIR} - # If we explicitly leave the `UPDATE_COMMAND` of the ExternalProject_Add - # function in CMakeLists blank, it will cause another parameter GIT_TAG - # to be modified without triggering incremental compilation, and the - # third-party library version changes cannot be incorporated. - # reference: https://cmake.org/cmake/help/latest/module/ExternalProject.html - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) + extern_pybind + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${PYBIND_REPOSITORY} + GIT_TAG ${PYBIND_TAG} + PREFIX ${PYBIND_PREFIX_DIR} + # If we explicitly leave the `UPDATE_COMMAND` of the ExternalProject_Add + # function in CMakeLists blank, it will cause another parameter GIT_TAG + # to be modified without triggering incremental compilation, and the + # third-party library version changes cannot be incorporated. + # reference: https://cmake.org/cmake/help/latest/module/ExternalProject.html + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "") add_library(pybind INTERFACE) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index ab3776084136e..bc58c9d7b6c35 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -12,68 +12,72 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(python_module) +include(python_module) -FIND_PACKAGE(PythonInterp ${PY_VERSION} REQUIRED) -FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED) +find_package(PythonInterp ${PY_VERSION} REQUIRED) +find_package(PythonLibs ${PY_VERSION} REQUIRED) if(WIN32) - execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" -"from distutils import sysconfig as s;import sys;import struct; + execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" "-c" + "from distutils import sysconfig as s;import sys;import struct; print(sys.prefix); print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); " - RESULT_VARIABLE _PYTHON_SUCCESS - OUTPUT_VARIABLE _PYTHON_VALUES - ERROR_VARIABLE _PYTHON_ERROR_VALUE) + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE _PYTHON_VALUES + ERROR_VARIABLE _PYTHON_ERROR_VALUE) - if(NOT _PYTHON_SUCCESS EQUAL 0) - set(PYTHONLIBS_FOUND FALSE) - return() - endif() + if(NOT _PYTHON_SUCCESS EQUAL 0) + set(PYTHONLIBS_FOUND FALSE) + return() + endif() - # Convert the process output into a list - string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES}) - string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) - list(GET _PYTHON_VALUES 0 PYTHON_PREFIX) - list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX) + # Convert the process output into a list + string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES}) + string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) + list(GET _PYTHON_VALUES 0 PYTHON_PREFIX) + list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX) - # Make sure all directory separators are '/' - string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX}) + # Make sure all directory separators are '/' + string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX}) - set(PYTHON_LIBRARY - "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + set(PYTHON_LIBRARY "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") - # when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the - # original python installation. They may be found relative to PYTHON_INCLUDE_DIR. - if(NOT EXISTS "${PYTHON_LIBRARY}") - get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY) - set(PYTHON_LIBRARY - "${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") - endif() + # when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the + # original python installation. They may be found relative to PYTHON_INCLUDE_DIR. + if(NOT EXISTS "${PYTHON_LIBRARY}") + get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY) + set(PYTHON_LIBRARY + "${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + endif() - # raise an error if the python libs are still not found. - if(NOT EXISTS "${PYTHON_LIBRARY}") - message(FATAL_ERROR "Python libraries not found") - endif() - SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}") + # raise an error if the python libs are still not found. + if(NOT EXISTS "${PYTHON_LIBRARY}") + message(FATAL_ERROR "Python libraries not found") + endif() + set(PYTHON_LIBRARIES "${PYTHON_LIBRARY}") endif(WIN32) # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. -ADD_LIBRARY(python SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) +add_library(python SHARED IMPORTED GLOBAL) +set_property(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) -SET(py_env "") -IF(PYTHONINTERP_FOUND) - find_python_module(pip REQUIRED) - find_python_module(numpy REQUIRED) - find_python_module(wheel REQUIRED) - find_python_module(google.protobuf REQUIRED) - FIND_PACKAGE(NumPy REQUIRED) - IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0") - MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, " +set(py_env "") +if(PYTHONINTERP_FOUND) + find_python_module(pip REQUIRED) + find_python_module(numpy REQUIRED) + find_python_module(wheel REQUIRED) + find_python_module(google.protobuf REQUIRED) + find_package(NumPy REQUIRED) + if(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} + VERSION_LESS "3.0.0") + message( + FATAL_ERROR + "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, " "please use pip to upgrade protobuf. pip install -U protobuf") - ENDIF() -ENDIF(PYTHONINTERP_FOUND) -INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) -INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) + endif() +endif(PYTHONINTERP_FOUND) +include_directories(${PYTHON_INCLUDE_DIR}) +include_directories(${PYTHON_NUMPY_INCLUDE_DIR}) diff --git a/cmake/external/rocksdb.cmake b/cmake/external/rocksdb.cmake index befbc8138fc50..2e90f50e3cdf2 100644 --- a/cmake/external/rocksdb.cmake +++ b/cmake/external/rocksdb.cmake @@ -12,40 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(ROCKSDB_PREFIX_DIR ${THIRD_PARTY_PATH}/rocksdb) -SET(ROCKSDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocksdb) -SET(ROCKSDB_INCLUDE_DIR "${ROCKSDB_INSTALL_DIR}/include" CACHE PATH "rocksdb include directory." FORCE) -SET(ROCKSDB_LIBRARIES "${ROCKSDB_INSTALL_DIR}/lib/librocksdb.a" CACHE FILEPATH "rocksdb library." FORCE) -SET(ROCKSDB_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") -INCLUDE_DIRECTORIES(${ROCKSDB_INCLUDE_DIR}) +set(ROCKSDB_PREFIX_DIR ${THIRD_PARTY_PATH}/rocksdb) +set(ROCKSDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocksdb) +set(ROCKSDB_INCLUDE_DIR + "${ROCKSDB_INSTALL_DIR}/include" + CACHE PATH "rocksdb include directory." FORCE) +set(ROCKSDB_LIBRARIES + "${ROCKSDB_INSTALL_DIR}/lib/librocksdb.a" + CACHE FILEPATH "rocksdb library." FORCE) +set(ROCKSDB_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") +include_directories(${ROCKSDB_INCLUDE_DIR}) ExternalProject_Add( - extern_rocksdb - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${ROCKSDB_PREFIX_DIR} - GIT_REPOSITORY "https://github.com/facebook/rocksdb" - GIT_TAG v6.10.1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DWITH_BZ2=OFF - -DWITH_GFLAGS=OFF - -DCMAKE_CXX_FLAGS=${ROCKSDB_CMAKE_CXX_FLAGS} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -# BUILD_BYPRODUCTS ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/librocksdb.a - INSTALL_COMMAND mkdir -p ${ROCKSDB_INSTALL_DIR}/lib/ - && cp ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/librocksdb.a ${ROCKSDB_LIBRARIES} - && cp -r ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/include ${ROCKSDB_INSTALL_DIR}/ - BUILD_IN_SOURCE 1 -) + extern_rocksdb + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${ROCKSDB_PREFIX_DIR} + GIT_REPOSITORY "https://github.com/facebook/rocksdb" + GIT_TAG v6.10.1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DWITH_BZ2=OFF + -DWITH_GFLAGS=OFF + -DCMAKE_CXX_FLAGS=${ROCKSDB_CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + # BUILD_BYPRODUCTS ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/librocksdb.a + INSTALL_COMMAND + mkdir -p ${ROCKSDB_INSTALL_DIR}/lib/ && cp + ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/librocksdb.a ${ROCKSDB_LIBRARIES} + && cp -r ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/include + ${ROCKSDB_INSTALL_DIR}/ + BUILD_IN_SOURCE 1) -ADD_DEPENDENCIES(extern_rocksdb snappy) +add_dependencies(extern_rocksdb snappy) -ADD_LIBRARY(rocksdb STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET rocksdb PROPERTY IMPORTED_LOCATION ${ROCKSDB_LIBRARIES}) -ADD_DEPENDENCIES(rocksdb extern_rocksdb) - -LIST(APPEND external_project_dependencies rocksdb) +add_library(rocksdb STATIC IMPORTED GLOBAL) +set_property(TARGET rocksdb PROPERTY IMPORTED_LOCATION ${ROCKSDB_LIBRARIES}) +add_dependencies(rocksdb extern_rocksdb) +list(APPEND external_project_dependencies rocksdb) diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index 42320df13972a..dfb7192a71e66 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -12,58 +12,61 @@ # See the License for the specific language governing permissions and # limitations under the License. -include (ExternalProject) +include(ExternalProject) # NOTE: snappy is needed when linking with recordio set(SNAPPY_PREFIX_DIR ${THIRD_PARTY_PATH}/snappy) set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) -set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE) +set(SNAPPY_INCLUDE_DIR + "${SNAPPY_INSTALL_DIR}/include" + CACHE PATH "snappy include directory." FORCE) if(WIN32) - SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267") - IF(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") - add_custom_command(TARGET extern_snappy POST_BUILD - COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib - ) - ENDIF() - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") + set(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267") + if(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") + add_custom_command( + TARGET extern_snappy + POST_BUILD + COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib + ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib) + endif() + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") else() - SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") + set(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") endif() ExternalProject_Add( - extern_snappy - GIT_REPOSITORY "https://github.com/google/snappy" - GIT_TAG "1.1.7" - PREFIX ${SNAPPY_PREFIX_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DBUILD_TESTING=OFF - -DSNAPPY_BUILD_TESTS:BOOL=OFF - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${SNAPPY_LIBRARIES} -) + extern_snappy + GIT_REPOSITORY "https://github.com/google/snappy" + GIT_TAG "1.1.7" + PREFIX ${SNAPPY_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DSNAPPY_BUILD_TESTS:BOOL=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${SNAPPY_LIBRARIES}) add_library(snappy STATIC IMPORTED GLOBAL) set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES}) include_directories(${SNAPPY_INCLUDE_DIR}) add_dependencies(snappy extern_snappy) - diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index c4d978115bfb2..1047465095f42 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -12,32 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) +set(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) if(WITH_ASCEND OR WITH_ASCEND_CL) - SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git) + set(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git) else() - SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) + set(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) endif() -SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040) +set(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040) -SET(THREADPOOL_INCLUDE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) -INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR}) +set(THREADPOOL_INCLUDE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) +include_directories(${THREADPOOL_INCLUDE_DIR}) ExternalProject_Add( - extern_threadpool - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${THREADPOOL_REPOSITORY} - GIT_TAG ${THREADPOOL_TAG} - PREFIX ${THREADPOOL_PREFIX_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) + extern_threadpool + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${THREADPOOL_REPOSITORY} + GIT_TAG ${THREADPOOL_TAG} + PREFIX ${THREADPOOL_PREFIX_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "") add_library(simple_threadpool INTERFACE) diff --git a/cmake/external/utf8proc.cmake b/cmake/external/utf8proc.cmake index a5de5c15c3b51..13107c03cf171 100644 --- a/cmake/external/utf8proc.cmake +++ b/cmake/external/utf8proc.cmake @@ -12,40 +12,38 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(UTF8PROC_PREFIX_DIR ${THIRD_PARTY_PATH}/utf8proc) -SET(UTF8PROC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/utf8proc) +set(UTF8PROC_PREFIX_DIR ${THIRD_PARTY_PATH}/utf8proc) +set(UTF8PROC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/utf8proc) # As we add extra features for utf8proc, we use the non-official repo -SET(UTF8PROC_REPOSITORY ${GIT_URL}/JuliaStrings/utf8proc.git) -SET(UTF8PROC_TAG v2.6.1) +set(UTF8PROC_REPOSITORY ${GIT_URL}/JuliaStrings/utf8proc.git) +set(UTF8PROC_TAG v2.6.1) -IF(WIN32) - SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/utf8proc_static.lib") +if(WIN32) + set(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/utf8proc_static.lib") add_definitions(-DUTF8PROC_STATIC) -ELSE(WIN32) - SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/libutf8proc.a") -ENDIF(WIN32) +else(WIN32) + set(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/libutf8proc.a") +endif(WIN32) -INCLUDE_DIRECTORIES(${UTF8PROC_INSTALL_DIR}/include) +include_directories(${UTF8PROC_INSTALL_DIR}/include) ExternalProject_Add( extern_utf8proc - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${UTF8PROC_REPOSITORY} - GIT_TAG ${UTF8PROC_TAG} - PREFIX ${UTF8PROC_PREFIX_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DBUILD_SHARED=ON - -DBUILD_STATIC=ON - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_INSTALL_PREFIX:PATH=${UTF8PROC_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} - BUILD_BYPRODUCTS ${UTF8PROC_LIBRARIES} -) + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${UTF8PROC_REPOSITORY} + GIT_TAG ${UTF8PROC_TAG} + PREFIX ${UTF8PROC_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DBUILD_SHARED=ON + -DBUILD_STATIC=ON + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_INSTALL_PREFIX:PATH=${UTF8PROC_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + BUILD_BYPRODUCTS ${UTF8PROC_LIBRARIES}) -ADD_LIBRARY(utf8proc STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET utf8proc PROPERTY IMPORTED_LOCATION ${UTF8PROC_LIBRARIES}) -ADD_DEPENDENCIES(utf8proc extern_utf8proc) +add_library(utf8proc STATIC IMPORTED GLOBAL) +set_property(TARGET utf8proc PROPERTY IMPORTED_LOCATION ${UTF8PROC_LIBRARIES}) +add_dependencies(utf8proc extern_utf8proc) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index f0d16fc7978e8..d38636c9c23a8 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -12,130 +12,139 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -IF(WITH_ROCM) - add_definitions(-DWARPCTC_WITH_HIP) -ENDIF() +if(WITH_ROCM) + add_definitions(-DWARPCTC_WITH_HIP) +endif() -SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) -SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) -# in case of low internet speed +set(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) +set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) +# in case of low internet speed #set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git) -set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) -set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b) +set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) +set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b) -SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" +set(WARPCTC_INCLUDE_DIR + "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) # Used in unit test test_WarpCTCLayer -SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" +set(WARPCTC_LIB_DIR + "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE) -IF(WIN32) - SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" - CACHE FILEPATH "Warp-ctc Library" FORCE) +if(WIN32) + set(WARPCTC_LIBRARIES + "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) else(WIN32) - SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" - CACHE FILEPATH "Warp-ctc Library" FORCE) -ENDIF(WIN32) + set(WARPCTC_LIBRARIES + "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +endif(WIN32) -IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32) - SET(USE_OMP OFF) -ELSE() - SET(USE_OMP ON) -ENDIF() +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" + OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" + OR WIN32) + set(USE_OMP OFF) +else() + set(USE_OMP ON) +endif() if(WITH_ASCEND OR WITH_ASCEND_CL) - ExternalProject_Add( - extern_warpctc - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${WARPCTC_REPOSITORY} - GIT_TAG ${WARPCTC_TAG} - PREFIX ${WARPCTC_PREFIX_DIR} - #UPDATE_COMMAND "" - PATCH_COMMAND "" - BUILD_ALWAYS 1 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} - -DWITH_GPU=${WITH_GPU} - -DWITH_ROCM=${WITH_ROCM} - -DWITH_OMP=${USE_OMP} - -DWITH_TORCH=OFF - -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - -DBUILD_SHARED=ON - -DBUILD_TESTS=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} - BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES} - ) + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${WARPCTC_REPOSITORY} + GIT_TAG ${WARPCTC_TAG} + PREFIX ${WARPCTC_PREFIX_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES}) else() - if(WIN32) - set(WARPCTC_C_FLAGS $) - set(WARPCTC_C_FLAGS_DEBUG $) - set(WARPCTC_C_FLAGS_RELEASE $) - set(WARPCTC_CXX_FLAGS $) - set(WARPCTC_CXX_FLAGS_RELEASE $) - set(WARPCTC_CXX_FLAGS_DEBUG $) - else() - set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS}) - set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) - set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) - set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS}) - set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) - set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) - endif() - ExternalProject_Add( - extern_warpctc - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${WARPCTC_REPOSITORY} - GIT_TAG ${WARPCTC_TAG} - PREFIX ${WARPCTC_PREFIX_DIR} - UPDATE_COMMAND "" - PATCH_COMMAND "" - #BUILD_ALWAYS 1 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG} - -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} - -DWITH_GPU=${WITH_GPU} - -DWITH_ROCM=${WITH_ROCM} - -DWITH_OMP=${USE_OMP} - -DWITH_TORCH=OFF - -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - -DBUILD_SHARED=ON - -DBUILD_TESTS=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} - BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES} - ) + if(WIN32) + set(WARPCTC_C_FLAGS $) + set(WARPCTC_C_FLAGS_DEBUG + $) + set(WARPCTC_C_FLAGS_RELEASE + $) + set(WARPCTC_CXX_FLAGS $) + set(WARPCTC_CXX_FLAGS_RELEASE + $) + set(WARPCTC_CXX_FLAGS_DEBUG + $) + else() + set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) + endif() + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${WARPCTC_REPOSITORY} + GIT_TAG ${WARPCTC_TAG} + PREFIX ${WARPCTC_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND "" + #BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES}) endif() -MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") +message(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) -INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers. +include_directories(${WARPCTC_INCLUDE_DIR} +)# For warpctc code to include its headers. -ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) -ADD_DEPENDENCIES(warpctc extern_warpctc) +add_library(warpctc SHARED IMPORTED GLOBAL) +set_property(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) +add_dependencies(warpctc extern_warpctc) diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake index 6ad15b3730d1d..589056458c1f0 100644 --- a/cmake/external/xbyak.cmake +++ b/cmake/external/xbyak.cmake @@ -14,12 +14,12 @@ include(ExternalProject) -set(XBYAK_PROJECT extern_xbyak) -set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak) -set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak) -set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include) -set(XBYAK_REPOSITORY ${GIT_URL}/herumi/xbyak.git) -set(XBYAK_TAG v5.81) # Dec 19, 2019 +set(XBYAK_PROJECT extern_xbyak) +set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak) +set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak) +set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include) +set(XBYAK_REPOSITORY ${GIT_URL}/herumi/xbyak.git) +set(XBYAK_TAG v5.81) # Dec 19, 2019 include_directories(${XBYAK_INC_DIR}) include_directories(${XBYAK_INC_DIR}/xbyak) @@ -31,19 +31,17 @@ add_definitions(-DXBYAK64) add_definitions(-DXBYAK_NO_OP_NAMES) ExternalProject_Add( - ${XBYAK_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${XBYAK_REPOSITORY} - GIT_TAG ${XBYAK_TAG} - DEPENDS "" - PREFIX ${XBYAK_PREFIX_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT} - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT} - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -) + ${XBYAK_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${XBYAK_REPOSITORY} + GIT_TAG ${XBYAK_TAG} + DEPENDS "" + PREFIX ${XBYAK_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}) add_library(xbyak INTERFACE) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 43d5002fe3819..af27500398f57 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -1,127 +1,151 @@ -if (NOT WITH_XPU) - return() +if(NOT WITH_XPU) + return() endif() -INCLUDE(ExternalProject) -SET(XPU_PROJECT "extern_xpu") -SET(XPU_API_LIB_NAME "libxpuapi.so") -SET(XPU_RT_LIB_NAME "libxpurt.so") +include(ExternalProject) +set(XPU_PROJECT "extern_xpu") +set(XPU_API_LIB_NAME "libxpuapi.so") +set(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) - SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220520") + set(XPU_BASE_URL_WITHOUT_DATE + "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") + set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220601") else() - SET(XPU_BASE_URL "${XPU_BASE_URL}") + set(XPU_BASE_URL "${XPU_BASE_URL}") endif() # ubuntu and centos: use output by XDNN API team if(NOT DEFINED XPU_XDNN_BASE_URL) - SET(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220520") + set(XPU_XDNN_BASE_URL_WITHOUT_DATE + "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") + set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220601") else() - SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") + set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() -IF(WITH_AARCH64) - SET(XPU_XRE_DIR_NAME "xre-kylin_aarch64") - SET(XPU_XDNN_DIR_NAME "XDNN-kylin_aarch64") - SET(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64") - SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) -ELSEIF(WITH_SUNWAY) - SET(XPU_XRE_DIR_NAME "xre-deepin_sw6_64") - SET(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64") - SET(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64") - SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) -ELSEIF(WITH_BDCENTOS) - SET(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64") - SET(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") - SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") +if(WITH_AARCH64) + set(XPU_XRE_DIR_NAME "xre-kylin_aarch64") + set(XPU_XDNN_DIR_NAME "XDNN-kylin_aarch64") + set(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64") + set(XPU_XDNN_URL + "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" + CACHE STRING "" FORCE) +elseif(WITH_SUNWAY) + set(XPU_XRE_DIR_NAME "xre-deepin_sw6_64") + set(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64") + set(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64") + set(XPU_XDNN_URL + "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" + CACHE STRING "" FORCE) +elseif(WITH_BDCENTOS) + set(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64") + set(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") + set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") # ubuntu and centos: use output by XDNN API team - SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) -ELSEIF(WITH_UBUNTU) - SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") - SET(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64") - SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") + set(XPU_XDNN_URL + "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" + CACHE STRING "" FORCE) +elseif(WITH_UBUNTU) + set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") + set(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64") + set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") # ubuntu and centos: use output by XDNN API team - SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) -ELSEIF(WITH_CENTOS) - SET(XPU_XRE_DIR_NAME "xre-centos7_x86_64") - SET(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") - SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") + set(XPU_XDNN_URL + "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" + CACHE STRING "" FORCE) +elseif(WITH_CENTOS) + set(XPU_XRE_DIR_NAME "xre-centos7_x86_64") + set(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") + set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") # ubuntu and centos: use output by XDNN API team - SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) -ELSE() - SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") - SET(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64") - SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") + set(XPU_XDNN_URL + "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" + CACHE STRING "" FORCE) +else() + set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") + set(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64") + set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") # default: use output by XDNN API team - SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) -ENDIF() - -SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) -SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220411/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) -SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE) - -SET(SNAPPY_PREFIX_DIR "${THIRD_PARTY_PATH}/xpu") -SET(XPU_DOWNLOAD_DIR "${SNAPPY_PREFIX_DIR}/src/${XPU_PROJECT}") -SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") -SET(XPU_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") -SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib") - -SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}") -SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}") - -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") + set(XPU_XDNN_URL + "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" + CACHE STRING "" FORCE) +endif() -FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(XPU)\n" - "cmake_minimum_required(VERSION 3.0)\n" +set(XPU_XRE_URL + "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" + CACHE STRING "" FORCE) +set(XPU_XCCL_URL + "${XPU_BASE_URL_WITHOUT_DATE}/20220411/${XPU_XCCL_DIR_NAME}.tar.gz" + CACHE STRING "" FORCE) +set(XPU_PACK_DEPENCE_URL + "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" + CACHE STRING "" FORCE) + +set(SNAPPY_PREFIX_DIR "${THIRD_PARTY_PATH}/xpu") +set(XPU_DOWNLOAD_DIR "${SNAPPY_PREFIX_DIR}/src/${XPU_PROJECT}") +set(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") +set(XPU_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") +set(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib") + +set(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}") +set(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}") + +set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") + +file( + WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(XPU)\n" "cmake_minimum_required(VERSION 3.0)\n" "install(DIRECTORY xpu/include xpu/lib \n" " DESTINATION ${XPU_INSTALL_DIR})\n") ExternalProject_Add( - ${XPU_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${SNAPPY_PREFIX_DIR} - DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget ${XPU_PACK_DEPENCE_URL} - && bash pack_paddle_depence.sh ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} ${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} - - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT} - BUILD_BYPRODUCTS ${XPU_API_LIB} - BUILD_BYPRODUCTS ${XPU_RT_LIB} -) - -INCLUDE_DIRECTORIES(${XPU_INC_DIR}) -ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL) + ${XPU_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${SNAPPY_PREFIX_DIR} + DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} + DOWNLOAD_COMMAND + wget ${XPU_PACK_DEPENCE_URL} && bash pack_paddle_depence.sh ${XPU_XRE_URL} + ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} ${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL} + ${XPU_XCCL_DIR_NAME} + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT} + BUILD_BYPRODUCTS ${XPU_API_LIB} + BUILD_BYPRODUCTS ${XPU_RT_LIB}) + +include_directories(${XPU_INC_DIR}) +add_library(shared_xpuapi SHARED IMPORTED GLOBAL) set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}") # generate a static dummy target to track xpulib dependencies # for cc_library(xxx SRCS xxx.c DEPS xpulib) generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake") -TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB}) +target_link_libraries(xpulib ${XPU_API_LIB} ${XPU_RT_LIB}) -IF(WITH_XPU_BKCL) - MESSAGE(STATUS "Compile with XPU BKCL!") - ADD_DEFINITIONS(-DPADDLE_WITH_XPU_BKCL) +if(WITH_XPU_BKCL) + message(STATUS "Compile with XPU BKCL!") + add_definitions(-DPADDLE_WITH_XPU_BKCL) - SET(XPU_BKCL_LIB_NAME "libbkcl.so") - SET(XPU_BKCL_LIB "${XPU_LIB_DIR}/${XPU_BKCL_LIB_NAME}") - SET(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") - INCLUDE_DIRECTORIES(${XPU_BKCL_INC_DIR}) - TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB}) -ELSE(WITH_XPU_BKCL) - TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB}) -ENDIF(WITH_XPU_BKCL) + set(XPU_BKCL_LIB_NAME "libbkcl.so") + set(XPU_BKCL_LIB "${XPU_LIB_DIR}/${XPU_BKCL_LIB_NAME}") + set(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") + include_directories(${XPU_BKCL_INC_DIR}) + target_link_libraries(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB}) +else(WITH_XPU_BKCL) + target_link_libraries(xpulib ${XPU_API_LIB} ${XPU_RT_LIB}) +endif(WITH_XPU_BKCL) -ADD_DEPENDENCIES(xpulib ${XPU_PROJECT}) +add_dependencies(xpulib ${XPU_PROJECT}) # Ensure that xpu/api.h can be included without dependency errors. -file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "") -add_library(xpu_headers_dummy STATIC ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc) +file( + GENERATE + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc + CONTENT "") +add_library(xpu_headers_dummy STATIC + ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc) add_dependencies(xpu_headers_dummy extern_xpu) link_libraries(xpu_headers_dummy) diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index fe17806e36274..6e685bbde402e 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -12,24 +12,39 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) set(XXHASH_PREFIX_DIR ${THIRD_PARTY_PATH}/xxhash) set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash/src/extern_xxhash) set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash) set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include") -set(XXHASH_REPOSITORY ${GIT_URL}/Cyan4973/xxHash.git) -set(XXHASH_TAG v0.6.5) +set(XXHASH_REPOSITORY ${GIT_URL}/Cyan4973/xxHash.git) +set(XXHASH_TAG v0.6.5) -INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR}) +include_directories(${XXHASH_INCLUDE_DIR}) -IF(APPLE) - SET(BUILD_CMD sed -i \"\" "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/Makefile && make lib) -ELSEIF(UNIX) - SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/Makefile && make lib) -ENDIF() +if(APPLE) + set(BUILD_CMD + sed + -i + \"\" + "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" + ${XXHASH_SOURCE_DIR}/Makefile + && + make + lib) +elseif(UNIX) + set(BUILD_CMD + sed + -i + "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" + ${XXHASH_SOURCE_DIR}/Makefile + && + make + lib) +endif() -if (WIN32) +if(WIN32) set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib") set(XXHASH_CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4710 /wd4711") set(XXHASH_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4710 /wd4711") @@ -37,53 +52,47 @@ else() set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") set(XXHASH_CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) set(XXHASH_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -endif () +endif() if(WIN32) ExternalProject_Add( - extern_xxhash - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${XXHASH_REPOSITORY} - GIT_TAG ${XXHASH_TAG} - PREFIX ${XXHASH_PREFIX_DIR} - UPDATE_COMMAND "" - PATCH_COMMAND "" - CONFIGURE_COMMAND - ${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/cmake_unofficial - -DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} - -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DBUILD_XXHSUM=OFF - -DCMAKE_GENERATOR=${CMAKE_GENERATOR} - -DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM} - -DBUILD_SHARED_LIBS=OFF - -DCMAKE_CXX_FLAGS=${XXHASH_CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${XXHASH_CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - ${OPTIONAL_CACHE_ARGS} - TEST_COMMAND "" - BUILD_BYPRODUCTS ${XXHASH_LIBRARIES} - ) + extern_xxhash + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${XXHASH_REPOSITORY} + GIT_TAG ${XXHASH_TAG} + PREFIX ${XXHASH_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND "" + CONFIGURE_COMMAND + ${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/cmake_unofficial + -DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DBUILD_XXHSUM=OFF + -DCMAKE_GENERATOR=${CMAKE_GENERATOR} + -DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM} + -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_FLAGS=${XXHASH_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${XXHASH_CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} ${OPTIONAL_CACHE_ARGS} + TEST_COMMAND "" + BUILD_BYPRODUCTS ${XXHASH_LIBRARIES}) else() ExternalProject_Add( - extern_xxhash - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY ${XXHASH_REPOSITORY} - GIT_TAG ${XXHASH_TAG} - PREFIX ${XXHASH_PREFIX_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - BUILD_COMMAND ${BUILD_CMD} - INSTALL_COMMAND make PREFIX=${XXHASH_INSTALL_DIR} install - TEST_COMMAND "" - BUILD_BYPRODUCTS ${XXHASH_LIBRARIES} - ) + extern_xxhash + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY ${XXHASH_REPOSITORY} + GIT_TAG ${XXHASH_TAG} + PREFIX ${XXHASH_PREFIX_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + BUILD_COMMAND ${BUILD_CMD} + INSTALL_COMMAND make PREFIX=${XXHASH_INSTALL_DIR} install + TEST_COMMAND "" + BUILD_BYPRODUCTS ${XXHASH_LIBRARIES}) endif() add_library(xxhash STATIC IMPORTED GLOBAL) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 679e2064699e1..2cef053e32547 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -12,48 +12,57 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +include(ExternalProject) -SET(ZLIB_PREFIX_DIR ${THIRD_PARTY_PATH}/zlib) -SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib) -SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE) -SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE) +set(ZLIB_PREFIX_DIR ${THIRD_PARTY_PATH}/zlib) +set(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib) +set(ZLIB_ROOT + ${ZLIB_INSTALL_DIR} + CACHE FILEPATH "zlib root directory." FORCE) +set(ZLIB_INCLUDE_DIR + "${ZLIB_INSTALL_DIR}/include" + CACHE PATH "zlib include directory." FORCE) set(ZLIB_REPOSITORY ${GIT_URL}/madler/zlib.git) -set(ZLIB_TAG v1.2.8) +set(ZLIB_TAG v1.2.8) -INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers. -INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h. +include_directories(${ZLIB_INCLUDE_DIR} +)# For zlib code to include its own headers. +include_directories(${THIRD_PARTY_PATH}/install +)# For Paddle code to include zlib.h. -IF(WIN32) - SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE) -ELSE(WIN32) - SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) -ENDIF(WIN32) +if(WIN32) + set(ZLIB_LIBRARIES + "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" + CACHE FILEPATH "zlib library." FORCE) +else(WIN32) + set(ZLIB_LIBRARIES + "${ZLIB_INSTALL_DIR}/lib/libz.a" + CACHE FILEPATH "zlib library." FORCE) +endif(WIN32) ExternalProject_Add( - extern_zlib - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${ZLIB_REPOSITORY} - GIT_TAG ${ZLIB_TAG} - PREFIX ${ZLIB_PREFIX_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} - -DBUILD_SHARED_LIBS=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_MACOSX_RPATH=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${ZLIB_LIBRARIES} -) + extern_zlib + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${ZLIB_REPOSITORY} + GIT_TAG ${ZLIB_TAG} + PREFIX ${ZLIB_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_MACOSX_RPATH=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${ZLIB_LIBRARIES}) -ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) -ADD_DEPENDENCIES(zlib extern_zlib) +add_library(zlib STATIC IMPORTED GLOBAL) +set_property(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) +add_dependencies(zlib extern_zlib) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 0dbd3bc328314..e3c5545df8b27 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -5,33 +5,39 @@ include(CheckCXXSymbolExists) include(CheckTypeSize) function(CheckCompilerCXX14Flag) - if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4) - message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.") - elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2) - message(WARNING "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2") - endif() - elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" - # Apple Clang is a different compiler than upstream Clang which havs different version numbers. - # https://gist.github.com/yamaya/2924292 - if(APPLE) # cmake < 3.0 compiler id "Clang" on Mac OS X - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1) - message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.") - endif() - else() - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4) - message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.4 required.") - endif() - endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4) + message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.") + elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2) + message( + WARNING + "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2" + ) endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID + STREQUAL "Clang") + # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" + # Apple Clang is a different compiler than upstream Clang which havs different version numbers. + # https://gist.github.com/yamaya/2924292 + if(APPLE) # cmake < 3.0 compiler id "Clang" on Mac OS X + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1) + message( + FATAL_ERROR + "Unsupported AppleClang version. AppleClang >= 5.1 required.") + endif() + else() + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4) + message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.4 required.") + endif() + endif() + endif() endfunction() -CheckCompilerCXX14Flag() +checkcompilercxx14flag() if(NOT WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") else() - set(CMAKE_CXX_STANDARD 14) + set(CMAKE_CXX_STANDARD 14) endif() # safe_set_flag @@ -42,56 +48,58 @@ endif() # flag_name: the flag name for compiler, such as '-Werror' '-Wall' etc # rest arguments: not used. function(safe_set_flag is_c src_list flag_name) - string(REPLACE "-" "_" safe_name ${flag_name}) - string(REPLACE "=" "_" safe_name ${safe_name}) + string(REPLACE "-" "_" safe_name ${flag_name}) + string(REPLACE "=" "_" safe_name ${safe_name}) - if(${flag_name} MATCHES "fsanitize") - set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) - set(CMAKE_REQUIRED_FLAGS ${flag_name}) - endif() + if(${flag_name} MATCHES "fsanitize") + set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS ${flag_name}) + endif() - if(is_c) - CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name}) - set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name}) - else() - CHECK_CXX_COMPILER_FLAG(${flag_name} CXX_COMPILER_SUPPORT_FLAG_${safe_name}) - set(safe_name CXX_COMPILER_SUPPORT_FLAG_${safe_name}) - endif() - if(${safe_name}) - set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE) - endif() + if(is_c) + check_c_compiler_flag(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name}) + set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name}) + else() + check_cxx_compiler_flag(${flag_name} CXX_COMPILER_SUPPORT_FLAG_${safe_name}) + set(safe_name CXX_COMPILER_SUPPORT_FLAG_${safe_name}) + endif() + if(${safe_name}) + set(${src_list} + "${${src_list}} ${flag_name}" + PARENT_SCOPE) + endif() - if(${flag_name} MATCHES "fsanitize") - set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) - endif() + if(${flag_name} MATCHES "fsanitize") + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) + endif() endfunction() # helper macro to set cflag macro(safe_set_cflag src_list flag_name) - safe_set_flag(ON ${src_list} ${flag_name}) + safe_set_flag(ON ${src_list} ${flag_name}) endmacro() # helper macro to set cxxflag macro(safe_set_cxxflag src_list flag_name) - safe_set_flag(OFF ${src_list} ${flag_name}) + safe_set_flag(OFF ${src_list} ${flag_name}) endmacro() # helper macro to set nvcc flag macro(safe_set_nvflag flag_name) - string(REPLACE "-" "_" safe_name ${flag_name}) - string(REPLACE "=" "_" safe_name ${safe_name}) - CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name}) - set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name}) - if(${safe_name}) - set(SAFE_GPU_COMMON_FLAGS "${SAFE_GPU_COMMON_FLAGS} -Xcompiler=\"${flag_name}\"") - endif() + string(REPLACE "-" "_" safe_name ${flag_name}) + string(REPLACE "=" "_" safe_name ${safe_name}) + check_c_compiler_flag(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name}) + set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name}) + if(${safe_name}) + set(SAFE_GPU_COMMON_FLAGS + "${SAFE_GPU_COMMON_FLAGS} -Xcompiler=\"${flag_name}\"") + endif() endmacro() - -CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) +check_cxx_symbol_exists(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) if(NOT UINT64_MAX_EXISTS) set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS) - CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS_HERE) + check_cxx_symbol_exists(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS_HERE) if(UINT64_MAX_EXISTS_HERE) set(CMAKE_REQUIRED_DEFINITIONS) add_definitions(-D__STDC_LIMIT_MACROS) @@ -100,152 +108,151 @@ if(NOT UINT64_MAX_EXISTS) endif() endif() -SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h") -CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND) -CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND) +set(CMAKE_EXTRA_INCLUDE_FILES "pthread.h") +check_type_size(pthread_spinlock_t SPINLOCK_FOUND) +check_type_size(pthread_barrier_t BARRIER_FOUND) if(SPINLOCK_FOUND) add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK) endif(SPINLOCK_FOUND) if(BARRIER_FOUND) add_definitions(-DPADDLE_USE_PTHREAD_BARRIER) endif(BARRIER_FOUND) -SET(CMAKE_EXTRA_INCLUDE_FILES "") +set(CMAKE_EXTRA_INCLUDE_FILES "") # Only one sanitizer is allowed in compile time string(TOLOWER "${SANITIZER_TYPE}" sanitizer_type) if(sanitizer_type STREQUAL "address") - set(fsanitize "-fsanitize=address") + set(fsanitize "-fsanitize=address") elseif(sanitizer_type STREQUAL "leak") - set(fsanitize "-fsanitize=leak") + set(fsanitize "-fsanitize=leak") elseif(sanitizer_type STREQUAL "memory") - set(fsanitize "-fsanitize=memory") + set(fsanitize "-fsanitize=memory") elseif(sanitizer_type STREQUAL "thread") - set(fsanitize "-fsanitize=thread") + set(fsanitize "-fsanitize=thread") elseif(sanitizer_type STREQUAL "undefined") - set(fsanitize "-fsanitize=undefined") + set(fsanitize "-fsanitize=undefined") endif() # Common flags. the compiler flag used for C/C++ sources whenever release or debug # Do not care if this flag is support for gcc. # https://github.com/PaddlePaddle/Paddle/issues/12773 -if (NOT WIN32) -set(COMMON_FLAGS - -fPIC - -fno-omit-frame-pointer - -Werror - -Wall - -Wextra - -Wnon-virtual-dtor - -Wdelete-non-virtual-dtor - -Wno-unused-parameter - -Wno-unused-function - -Wno-error=literal-suffix - -Wno-error=unused-local-typedefs - -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 - -Wno-error=terminate # Warning in PADDLE_ENFORCE - -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 - -Wimplicit-fallthrough=0 # Warning in tinyformat.h - ${fsanitize} -) - -if(WITH_IPU) - set(COMMON_FLAGS ${COMMON_FLAGS} - -Wno-sign-compare # Warnings in Popart - -Wno-non-virtual-dtor # Warnings in Popart +if(NOT WIN32) + set(COMMON_FLAGS + -fPIC + -fno-omit-frame-pointer + -Werror + -Wall + -Wextra + -Wnon-virtual-dtor + -Wdelete-non-virtual-dtor + -Wno-unused-parameter + -Wno-unused-function + -Wno-error=literal-suffix + -Wno-error=unused-local-typedefs + -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 + -Wno-error=terminate # Warning in PADDLE_ENFORCE + -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 + -Wimplicit-fallthrough=0 # Warning in tinyformat.h + ${fsanitize}) + + if(WITH_IPU) + set(COMMON_FLAGS ${COMMON_FLAGS} -Wno-sign-compare # Warnings in Popart + -Wno-non-virtual-dtor # Warnings in Popart ) -endif() + endif() -if(WITH_ASCEND_CL AND WITH_ARM_BRPC) + if(WITH_ASCEND_CL AND WITH_ARM_BRPC) set(COMMON_FLAGS ${COMMON_FLAGS} -faligned-new) -endif() + endif() -if(NOT APPLE) + if(NOT APPLE) if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM)) - set(COMMON_FLAGS - ${COMMON_FLAGS} - -Wno-format-truncation # Warning in boost gcc 8.2 - -Wno-error=parentheses # Warning in boost gcc 8.2 - -Wno-error=catch-value # Warning in boost gcc 8.2 - -Wno-error=nonnull-compare # Warning in boost gcc 8.2 - -Wno-error=address # Warning in boost gcc 8.2 - -Wno-ignored-qualifiers # Warning in boost gcc 8.2 - -Wno-ignored-attributes # Warning in Eigen gcc 8.3 - -Wno-parentheses # Warning in Eigen gcc 8.3 - ) + set(COMMON_FLAGS + ${COMMON_FLAGS} + -Wno-format-truncation # Warning in boost gcc 8.2 + -Wno-error=parentheses # Warning in boost gcc 8.2 + -Wno-error=catch-value # Warning in boost gcc 8.2 + -Wno-error=nonnull-compare # Warning in boost gcc 8.2 + -Wno-error=address # Warning in boost gcc 8.2 + -Wno-ignored-qualifiers # Warning in boost gcc 8.2 + -Wno-ignored-attributes # Warning in Eigen gcc 8.3 + -Wno-parentheses # Warning in Eigen gcc 8.3 + ) endif() -endif(NOT APPLE) - -set(GPU_COMMON_FLAGS - -fPIC - -fno-omit-frame-pointer - -Wnon-virtual-dtor - -Wdelete-non-virtual-dtor - -Wno-unused-parameter - -Wno-unused-function - -Wno-error=literal-suffix - -Wno-error=unused-local-typedefs - -Wno-error=unused-function # Warnings in Numpy Header. - -Wno-error=array-bounds # Warnings in Eigen::array -) -if (NOT WITH_NV_JETSON AND NOT WITH_ARM AND NOT WITH_SW AND NOT WITH_MIPS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") -endif() + endif(NOT APPLE) + + set(GPU_COMMON_FLAGS + -fPIC + -fno-omit-frame-pointer + -Wnon-virtual-dtor + -Wdelete-non-virtual-dtor + -Wno-unused-parameter + -Wno-unused-function + -Wno-error=literal-suffix + -Wno-error=unused-local-typedefs + -Wno-error=unused-function # Warnings in Numpy Header. + -Wno-error=array-bounds # Warnings in Eigen::array + ) + if(NOT WITH_NV_JETSON + AND NOT WITH_ARM + AND NOT WITH_SW + AND NOT WITH_MIPS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") + endif() endif(NOT WIN32) -if (APPLE) - if(WITH_ARM) - set (CMAKE_OSX_ARCHITECTURES "arm64" CACHE STRING "Build architectures for OSX" FORCE) - else(WITH_ARM) - set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) - endif(WITH_ARM) - # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0 - set (COMMON_FLAGS -Wno-deprecated-register) +if(APPLE) + if(WITH_ARM) + set(CMAKE_OSX_ARCHITECTURES + "arm64" + CACHE STRING "Build architectures for OSX" FORCE) + else(WITH_ARM) + set(CMAKE_OSX_ARCHITECTURES + "x86_64" + CACHE STRING "Build architectures for OSX" FORCE) + endif(WITH_ARM) + # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0 + set(COMMON_FLAGS -Wno-deprecated-register) endif(APPLE) if(WITH_HETERPS AND WITH_PSLIB) - set(COMMON_FLAGS - -D_GLIBCXX_USE_CXX11_ABI=0 - ${COMMON_FLAGS}) + set(COMMON_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0 ${COMMON_FLAGS}) - set(GPU_COMMON_FLAGS - -D_GLIBCXX_USE_CXX11_ABI=0 - ${GPU_COMMON_FLAGS}) + set(GPU_COMMON_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0 ${GPU_COMMON_FLAGS}) endif() if(LINUX) - set(GPU_COMMON_FLAGS - -Wall - -Wextra - -Werror - ${GPU_COMMON_FLAGS}) + set(GPU_COMMON_FLAGS -Wall -Wextra -Werror ${GPU_COMMON_FLAGS}) endif(LINUX) foreach(flag ${COMMON_FLAGS}) - safe_set_cflag(CMAKE_C_FLAGS ${flag}) - safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) + safe_set_cflag(CMAKE_C_FLAGS ${flag}) + safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) endforeach() set(SAFE_GPU_COMMON_FLAGS "") foreach(flag ${GPU_COMMON_FLAGS}) - safe_set_nvflag(${flag}) + safe_set_nvflag(${flag}) endforeach() if(WITH_GPU) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}") endif() if(WITH_ROCM) - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${SAFE_GPU_COMMON_FLAGS}") + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${SAFE_GPU_COMMON_FLAGS}") endif() - # Disable -Werror, otherwise the compile will fail for rocblas_gemm_ex +# Disable -Werror, otherwise the compile will fail for rocblas_gemm_ex if(WITH_ROCM) - string (REPLACE "-Werror" "-Wno-error" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) - string (REPLACE "-Werror" "-Wno-error" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) + string(REPLACE "-Werror" "-Wno-error" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + string(REPLACE "-Werror" "-Wno-error" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) endif() if(WITH_PSCORE OR WITH_PSLIB) - string (REPLACE "-Wnon-virtual-dtor" "-Wno-non-virtual-dtor" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) - string (REPLACE "-Wnon-virtual-dtor" "-Wno-non-virtual-dtor" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) + string(REPLACE "-Wnon-virtual-dtor" "-Wno-non-virtual-dtor" CMAKE_CXX_FLAGS + ${CMAKE_CXX_FLAGS}) + string(REPLACE "-Wnon-virtual-dtor" "-Wno-non-virtual-dtor" CMAKE_C_FLAGS + ${CMAKE_C_FLAGS}) endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 35170b5198dc3..a6a7ab983b9f6 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -13,7 +13,6 @@ # limitations under the License. # - # generic.cmake defines CMakes functions that look like Bazel's # building rules (https://bazel.build/). # @@ -96,9 +95,11 @@ if(NOT APPLE AND NOT WIN32) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) if(WITH_PSLIB OR WITH_DISTRIBUTE) - set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt -lz -lssl") + set(CMAKE_CXX_LINK_EXECUTABLE + "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt -lz -lssl") else() - set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") + set(CMAKE_CXX_LINK_EXECUTABLE + "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") endif() endif() @@ -107,7 +108,8 @@ set_property(GLOBAL PROPERTY FLUID_MODULES "") # for building inference libs function(find_fluid_modules TARGET_NAME) get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) - string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) + string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path + ${__target_path}) string(FIND "${__target_path}" "fluid" pos) if(pos GREATER 1) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) @@ -121,7 +123,8 @@ set_property(GLOBAL PROPERTY PHI_MODULES "") # for building inference libs function(find_phi_modules TARGET_NAME) get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) - string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) + string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path + ${__target_path}) string(FIND "${__target_path}" "phi" pos) if(pos GREATER 1) get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) @@ -131,7 +134,7 @@ function(find_phi_modules TARGET_NAME) endfunction(find_phi_modules) function(common_link TARGET_NAME) - if (WITH_PROFILER) + if(WITH_PROFILER) target_link_libraries(${TARGET_NAME} gperftools::profiler) endif() endfunction() @@ -141,7 +144,8 @@ endfunction() set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY) function(find_fluid_thirdparties TARGET_NAME) get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) - string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) + string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path + ${__target_path}) string(FIND "${__target_path}" "third_party" pos) if(pos GREATER 1) get_property(fluid_ GLOBAL PROPERTY FLUID_THIRD_PARTY) @@ -162,13 +166,15 @@ function(create_static_lib TARGET_NAME) foreach(lib ${libs}) list(APPEND dummy_list ${lib}) list(LENGTH dummy_list listlen) - if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${libs_len})) + if((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL + ${libs_len})) merge_static_libs(${TARGET_NAME}_dummy_${dummy_index} ${dummy_list}) set(dummy_list) - list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_${dummy_index}) - MATH(EXPR dummy_index "${dummy_index}+1") + list(APPEND ${TARGET_NAME}_dummy_list + ${TARGET_NAME}_dummy_${dummy_index}) + math(EXPR dummy_index "${dummy_index}+1") endif() - MATH(EXPR dummy_offset "${dummy_offset}+1") + math(EXPR dummy_offset "${dummy_offset}+1") endforeach() merge_static_libs(${TARGET_NAME} ${${TARGET_NAME}_dummy_list}) else() @@ -180,7 +186,8 @@ function(create_dummy_static_lib TARGET_NAME) set(options "") set(oneValueArgs "") set(multiValueArgs LIBS DEPS LIMIT) - cmake_parse_arguments(merge "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(merge "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) list(REMOVE_DUPLICATES merge_LIBS) set(index 1) @@ -191,17 +198,18 @@ function(create_dummy_static_lib TARGET_NAME) foreach(lib ${merge_LIBS}) list(APPEND merge_list ${lib}) list(LENGTH merge_list listlen) - if ((${listlen} GREATER ${limit}) OR (${offset} EQUAL ${libs_len})) - message("Merge and generate static library: ${TARGET_NAME}_static_${index}") + if((${listlen} GREATER ${limit}) OR (${offset} EQUAL ${libs_len})) + message( + "Merge and generate static library: ${TARGET_NAME}_static_${index}") merge_static_libs(${TARGET_NAME}_static_${index} ${merge_list}) if(merge_DEPS) target_link_libraries(${TARGET_NAME}_static_${index} ${merge_DEPS}) endif() set(merge_list) list(APPEND ${TARGET_NAME}_list ${TARGET_NAME}_static_${index}) - MATH(EXPR index "${index}+1") + math(EXPR index "${index}+1") endif() - MATH(EXPR offset "${offset}+1") + math(EXPR offset "${offset}+1") endforeach() cc_library(${TARGET_NAME} DEPS ${${TARGET_NAME}_list}) endfunction() @@ -226,12 +234,14 @@ function(merge_static_libs TARGET_NAME) # Make the generated dummy source file depended on all static input # libs. If input lib changes,the source file is touched # which causes the desired effect (relink). - add_custom_command(OUTPUT ${target_SRCS} + add_custom_command( + OUTPUT ${target_SRCS} COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS} DEPENDS ${libs}) - - # Generate dummy staic lib - generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:merge_static_libs") + + # Generate dummy staic lib + generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} + GENERATOR "generic.cmake:merge_static_libs") target_link_libraries(${TARGET_NAME} ${libs_deps}) # OSX: use 'libtool' to merge archives @@ -240,29 +250,41 @@ function(merge_static_libs TARGET_NAME) # Get the file names of the libraries to be merged set(libfiles ${libfiles} $) endforeach() - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + add_custom_command( + TARGET ${TARGET_NAME} + POST_BUILD COMMENT "Merge and generate static lib: lib${TARGET_NAME}.a" COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" - COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles} - ) + COMMAND /usr/bin/libtool -static -o + "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}) endif() # LINUX: use "ar" to extract objects and re-add to a common lib if(LINUX) - set(mri_file ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.mri CACHE INTERNAL "phi_static.mri file") - get_property(ABS_MERGE_LIB_PATH TARGET ${TARGET_NAME} PROPERTY LOCATION) + set(mri_file + ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.mri + CACHE INTERNAL "phi_static.mri file") + get_property( + ABS_MERGE_LIB_PATH + TARGET ${TARGET_NAME} + PROPERTY LOCATION) file(WRITE ${mri_file} "create ${ABS_MERGE_LIB_PATH}\n") foreach(lib ${libs}) - get_property(ABS_LIB_PATH TARGET ${lib} PROPERTY LOCATION) + get_property( + ABS_LIB_PATH + TARGET ${lib} + PROPERTY LOCATION) file(APPEND ${mri_file} "addlib ${ABS_LIB_PATH}\n") endforeach() file(APPEND ${mri_file} "save\nend\n") - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMENT "Merge and generate static lib: lib${TARGET_NAME}.a" - COMMAND ${CMAKE_AR} -M < ${mri_file} - COMMAND ${CMAKE_RANLIB} "$") + add_custom_command( + TARGET ${TARGET_NAME} + POST_BUILD + COMMENT "Merge and generate static lib: lib${TARGET_NAME}.a" + COMMAND ${CMAKE_AR} -M < ${mri_file} + COMMAND ${CMAKE_RANLIB} "$") endif() # Windows do not support gcc/nvcc combined compiling. Use msvc 'lib.exe' to merge libs. @@ -271,60 +293,70 @@ function(merge_static_libs TARGET_NAME) set(libfiles ${libfiles} $) endforeach() # msvc compiler will put libarary in directory of "/Release/xxxlib" by default - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + add_custom_command( + TARGET ${TARGET_NAME} + POST_BUILD COMMENT "Merge and generate static lib: lib${TARGET_NAME}.lib" COMMAND cmake -E make_directory $ - COMMAND lib /OUT:$ ${libfiles} - ) + COMMAND lib /OUT:$ ${libfiles}) endif() endfunction() function(check_coverage_opt TARGET_NAME SRCS) if(WITH_COVERAGE AND WITH_INCREMENTAL_COVERAGE) # if pybind.cc add '-g -O0 -fprofile-arcs -ftest-coverage' only, some testcase will fail. - if ("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL "" AND (NOT ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" MATCHES "pybind.cc"))) - if (NOT ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" STREQUAL "")) + if("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL "" + AND (NOT ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" MATCHES "pybind.cc"))) + if(NOT ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" STREQUAL "")) string(REPLACE "," ";" CC_FILE_LIST $ENV{PADDLE_GIT_DIFF_CC_FILE}) set(use_coverage_opt FALSE) - FOREACH(cc_file ${CC_FILE_LIST}) + foreach(cc_file ${CC_FILE_LIST}) if("${SRCS};" MATCHES "${cc_file}") set(use_coverage_opt TRUE) break() endif() - ENDFOREACH(cc_file) + endforeach(cc_file) - if (use_coverage_opt) + if(use_coverage_opt) message(STATUS "cc changed, add coverage opt for ${TARGET_NAME}") - target_compile_options(${TARGET_NAME} PRIVATE -g -O0 -fprofile-arcs -ftest-coverage) + target_compile_options(${TARGET_NAME} PRIVATE -g -O0 -fprofile-arcs + -ftest-coverage) target_link_libraries(${TARGET_NAME} -fprofile-arcs) - get_target_property(WH_TARGET_COMPILE_OPTIONS ${TARGET_NAME} COMPILE_OPTIONS) - message(STATUS "property for ${TARGET_NAME} is ${WH_TARGET_COMPILE_OPTIONS}") + get_target_property(WH_TARGET_COMPILE_OPTIONS ${TARGET_NAME} + COMPILE_OPTIONS) + message( + STATUS "property for ${TARGET_NAME} is ${WH_TARGET_COMPILE_OPTIONS}" + ) endif() endif() endif() endif() endfunction(check_coverage_opt) - function(cc_library TARGET_NAME) set(options STATIC static SHARED shared INTERFACE interface) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) if(WIN32) - # add libxxx.lib prefix in windows - set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") + # add libxxx.lib prefix in windows + set(${TARGET_NAME}_LIB_NAME + "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" + CACHE STRING "output library name for target ${TARGET_NAME}") endif(WIN32) if(cc_library_SRCS) - if(cc_library_SHARED OR cc_library_shared) # build *.so - add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) - elseif(cc_library_INTERFACE OR cc_library_interface) - generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:cc_library") - else() - add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) - find_fluid_modules(${TARGET_NAME}) - find_phi_modules(${TARGET_NAME}) - endif() + if(cc_library_SHARED OR cc_library_shared) # build *.so + add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) + elseif(cc_library_INTERFACE OR cc_library_interface) + generate_dummy_static_lib( + LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR + "generic.cmake:cc_library") + else() + add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) + find_fluid_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) + endif() if(cc_library_DEPS) # Don't need link libwarpctc.so if("${cc_library_DEPS};" MATCHES "warpctc;") @@ -341,7 +373,8 @@ function(cc_library TARGET_NAME) if(WIN32) target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB}) else(WIN32) - target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") + target_link_libraries(${TARGET_NAME} + "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") endif(WIN32) endif() # remove link to python, see notes at: @@ -373,21 +406,26 @@ function(cc_library TARGET_NAME) if(cc_library_DEPS) list(REMOVE_DUPLICATES cc_library_DEPS) - generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:cc_library") + generate_dummy_static_lib( + LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR + "generic.cmake:cc_library") target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) else() - message(FATAL_ERROR "Please specify source files or libraries in cc_library(${TARGET_NAME} ...).") + message( + FATAL_ERROR + "Please specify source files or libraries in cc_library(${TARGET_NAME} ...)." + ) endif() endif(cc_library_SRCS) endfunction(cc_library) - function(cc_binary TARGET_NAME) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_binary_SRCS}) if(cc_binary_DEPS) target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS}) @@ -408,7 +446,8 @@ function(cc_test_build TARGET_NAME) if(WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) if(WIN32) if("${cc_test_DEPS};" MATCHES "python;") @@ -417,8 +456,25 @@ function(cc_test_build TARGET_NAME) endif() endif(WIN32) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main lod_tensor memory gtest gflags glog) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + target_link_libraries( + ${TARGET_NAME} + ${cc_test_DEPS} + ${os_dependency_modules} + paddle_gtest_main + lod_tensor + memory + gtest + gflags + glog) + add_dependencies( + ${TARGET_NAME} + ${cc_test_DEPS} + paddle_gtest_main + lod_tensor + memory + gtest + gflags + glog) common_link(${TARGET_NAME}) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) @@ -431,74 +487,80 @@ function(cc_test_run TARGET_NAME) if(WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set(oneValueArgs "") set(multiValueArgs COMMAND ARGS) - cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - add_test(NAME ${TARGET_NAME} - COMMAND ${cc_test_COMMAND} ${cc_test_ARGS} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) + cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + add_test( + NAME ${TARGET_NAME} + COMMAND ${cc_test_COMMAND} ${cc_test_ARGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_cpu_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_cudnn_deterministic=true) # No unit test should exceed 2 minutes. - if (WIN32) - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) + if(WIN32) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) endif() - if (APPLE) - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 20) + if(APPLE) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 20) endif() elseif(WITH_TESTING AND NOT TEST ${TARGET_NAME}) - add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E echo CI skip ${TARGET_NAME}.) + add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E echo CI skip + ${TARGET_NAME}.) endif() endfunction() function(cc_test TARGET_NAME) - # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation - # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files + # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation + # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files # other than *.py are modified. if(WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS) - cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cc_test_build(${TARGET_NAME} - SRCS ${cc_test_SRCS} - DEPS ${cc_test_DEPS}) + cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + cc_test_build(${TARGET_NAME} SRCS ${cc_test_SRCS} DEPS ${cc_test_DEPS}) # we dont test hcom op, because it need complex configuration # with more than one machine - if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR - "${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR - "${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR - "${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR - "${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test" OR - "${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR - "${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test" OR - "${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test")) - cc_test_run(${TARGET_NAME} - COMMAND ${TARGET_NAME} - ARGS ${cc_test_ARGS}) + if(NOT + ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" + OR "${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" + OR "${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" + OR "${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" + OR "${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test" + OR "${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" + OR "${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test" + OR "${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test")) + cc_test_run(${TARGET_NAME} COMMAND ${TARGET_NAME} ARGS ${cc_test_ARGS}) endif() elseif(WITH_TESTING AND NOT TEST ${TARGET_NAME}) - add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E echo CI skip ${TARGET_NAME}.) + add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E echo CI skip + ${TARGET_NAME}.) endif() endfunction(cc_test) function(nv_library TARGET_NAME) - if (WITH_GPU) + if(WITH_GPU) set(options STATIC static SHARED shared) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) if(nv_library_SRCS) # Attention: # 1. cuda_add_library is deprecated after cmake v3.10, use add_library for CUDA please. # 2. cuda_add_library does not support ccache. # Reference: https://cmake.org/cmake/help/v3.10/module/FindCUDA.html - if (nv_library_SHARED OR nv_library_shared) # build *.so + if(nv_library_SHARED OR nv_library_shared) # build *.so add_library(${TARGET_NAME} SHARED ${nv_library_SRCS}) else() add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) find_fluid_modules(${TARGET_NAME}) find_phi_modules(${TARGET_NAME}) endif() - if (nv_library_DEPS) + if(nv_library_DEPS) add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) target_link_libraries(${TARGET_NAME} ${nv_library_DEPS}) endif() @@ -506,13 +568,16 @@ function(nv_library TARGET_NAME) foreach(source_file ${nv_library_SRCS}) string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) - list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND nv_library_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) endif() endforeach() else(nv_library_SRCS) - if (nv_library_DEPS) + if(nv_library_DEPS) list(REMOVE_DUPLICATES nv_library_DEPS) - generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:nv_library") + generate_dummy_static_lib( + LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR + "generic.cmake:nv_library") target_link_libraries(${TARGET_NAME} ${nv_library_DEPS}) add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) @@ -520,76 +585,112 @@ function(nv_library TARGET_NAME) message(FATAL "Please specify source file or library in nv_library.") endif() endif(nv_library_SRCS) - if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910)) - set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) + if((CUDA_VERSION GREATER 9.2) + AND (CUDA_VERSION LESS 11.0) + AND (MSVC_VERSION LESS 1910)) + set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS + ${WIN_PROPS}) endif() endif() endfunction(nv_library) function(nv_binary TARGET_NAME) - if (WITH_GPU) + if(WITH_GPU) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${nv_binary_SRCS}) if(nv_binary_DEPS) target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) common_link(${TARGET_NAME}) endif() - if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910)) - set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) + if((CUDA_VERSION GREATER 9.2) + AND (CUDA_VERSION LESS 11.0) + AND (MSVC_VERSION LESS 1910)) + set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS + ${WIN_PROPS}) endif() endif() endfunction(nv_binary) function(nv_test TARGET_NAME) - # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation - # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files + # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation + # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files # other than *.py are modified. - if (WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + if(WITH_GPU + AND WITH_TESTING + AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) # Attention: # 1. cuda_add_executable is deprecated after cmake v3.10, use cuda_add_executable for CUDA please. # 2. cuda_add_executable does not support ccache. # Reference: https://cmake.org/cmake/help/v3.10/module/FindCUDA.html add_executable(${TARGET_NAME} ${nv_test_SRCS}) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules}) - add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + target_link_libraries( + ${TARGET_NAME} + ${nv_test_DEPS} + paddle_gtest_main + lod_tensor + memory + gtest + gflags + glog + ${os_dependency_modules}) + add_dependencies( + ${TARGET_NAME} + ${nv_test_DEPS} + paddle_gtest_main + lod_tensor + memory + gtest + gflags + glog) common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) - if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910)) - set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_cpu_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_cudnn_deterministic=true) + if((CUDA_VERSION GREATER 9.2) + AND (CUDA_VERSION LESS 11.0) + AND (MSVC_VERSION LESS 1910)) + set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS + ${WIN_PROPS}) endif() endif() endfunction(nv_test) function(hip_library TARGET_NAME) - if (WITH_ROCM) + if(WITH_ROCM) set(options STATIC static SHARED shared) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) if(hip_library_SRCS) # FindHIP.cmake defined hip_add_library, HIP_SOURCE_PROPERTY_FORMAT is requried if no .cu files found - if(NOT (${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators" OR ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/phi/kernels")) - set_source_files_properties(${hip_library_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + if(NOT (${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators" + OR ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/phi/kernels")) + set_source_files_properties(${hip_library_SRCS} + PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) endif() - if (hip_library_SHARED OR hip_library_shared) # build *.so + if(hip_library_SHARED OR hip_library_shared) # build *.so hip_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS}) else() hip_add_library(${TARGET_NAME} STATIC ${hip_library_SRCS}) find_fluid_modules(${TARGET_NAME}) find_phi_modules(${TARGET_NAME}) endif() - if (hip_library_DEPS) + if(hip_library_DEPS) add_dependencies(${TARGET_NAME} ${hip_library_DEPS}) target_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) endif() @@ -597,13 +698,16 @@ function(hip_library TARGET_NAME) foreach(source_file ${hip_library_SRCS}) string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) - list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND hip_library_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) endif() endforeach() else(hip_library_SRCS) - if (hip_library_DEPS) + if(hip_library_DEPS) list(REMOVE_DUPLICATES hip_library_DEPS) - generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:hip_library") + generate_dummy_static_lib( + LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR + "generic.cmake:hip_library") target_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) add_dependencies(${TARGET_NAME} ${hip_library_DEPS}) @@ -615,11 +719,12 @@ function(hip_library TARGET_NAME) endfunction(hip_library) function(hip_binary TARGET_NAME) - if (WITH_ROCM) + if(WITH_ROCM) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) # FindHIP.cmake defined hip_add_executable, HIP_SOURCE_PROPERTY_FORMAT is requried for .cc files hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS}) if(hip_binary_DEPS) @@ -634,42 +739,73 @@ function(hip_test TARGET_NAME) # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files # other than *.py are modified. - if (WITH_ROCM AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + if(WITH_ROCM + AND WITH_TESTING + AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) # FindHIP.cmake defined hip_add_executable, HIP_SOURCE_PROPERTY_FORMAT is requried for .cc files hip_add_executable(${TARGET_NAME} ${hip_test_SRCS}) # "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules}) - add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + target_link_libraries( + ${TARGET_NAME} + ${hip_test_DEPS} + paddle_gtest_main + lod_tensor + memory + gtest + gflags + glog + ${os_dependency_modules}) + add_dependencies( + ${TARGET_NAME} + ${hip_test_DEPS} + paddle_gtest_main + lod_tensor + memory + gtest + gflags + glog) common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH") + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_cpu_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_cudnn_deterministic=true) + set_property( + TEST ${TARGET_NAME} + PROPERTY + ENVIRONMENT + "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH" + ) endif() endfunction(hip_test) function(xpu_library TARGET_NAME) - if (WITH_XPU_KP) + if(WITH_XPU_KP) set(options STATIC static SHARED shared) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(xpu_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(xpu_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) if(xpu_library_SRCS) - if (xpu_library_SHARED OR xpu_library_shared) # build *.so - message(FATAL_ERROR "XPU kernel currently does not support dynamic links") + if(xpu_library_SHARED OR xpu_library_shared) # build *.so + message( + FATAL_ERROR "XPU kernel currently does not support dynamic links") else() - xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS}) + xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS + ${xpu_library_DEPS}) find_fluid_modules(${TARGET_NAME}) find_phi_modules(${TARGET_NAME}) endif() - if (xpu_library_DEPS) + if(xpu_library_DEPS) add_dependencies(${TARGET_NAME} ${xpu_library_DEPS}) target_link_libraries(${TARGET_NAME} ${xpu_library_DEPS}) endif() @@ -677,13 +813,16 @@ function(xpu_library TARGET_NAME) foreach(source_file ${xpu_library_SRCS}) string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) - list(APPEND xpu_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND xpu_library_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) endif() endforeach() else(xpu_library_SRCS) - if (xpu_library_DEPS) + if(xpu_library_DEPS) list(REMOVE_DUPLICATES xpu_library_DEPS) - generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:xpu_library") + generate_dummy_static_lib( + LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR + "generic.cmake:xpu_library") target_link_libraries(${TARGET_NAME} ${xpu_library_DEPS}) add_dependencies(${TARGET_NAME} ${xpu_library_DEPS}) else() @@ -694,11 +833,12 @@ function(xpu_library TARGET_NAME) endfunction(xpu_library) function(xpu_binary TARGET_NAME) - if (WITH_XPU_KP) + if(WITH_XPU_KP) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(xpu_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(xpu_binary "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${xpu_binary_SRCS}) if(xpu_binary_DEPS) target_link_libraries(${TARGET_NAME} ${xpu_binary_DEPS}) @@ -712,21 +852,44 @@ function(xpu_test TARGET_NAME) # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files # other than *.py are modified. - if (WITH_XPU_KP AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + if(WITH_XPU_KP + AND WITH_TESTING + AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(xpu_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(xpu_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${xpu_test_SRCS}) # "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - target_link_libraries(${TARGET_NAME} ${xpu_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules}) - add_dependencies(${TARGET_NAME} ${xpu_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + target_link_libraries( + ${TARGET_NAME} + ${xpu_test_DEPS} + paddle_gtest_main + lod_tensor + memory + gtest + gflags + glog + ${os_dependency_modules}) + add_dependencies( + ${TARGET_NAME} + ${xpu_test_DEPS} + paddle_gtest_main + lod_tensor + memory + gtest + gflags + glog) common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_cpu_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_cudnn_deterministic=true) endif() endfunction(xpu_test) @@ -734,34 +897,36 @@ function(go_library TARGET_NAME) set(options STATIC static SHARED shared) set(oneValueArgs "") set(multiValueArgs DEPS) - cmake_parse_arguments(go_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(go_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) - if (go_library_SHARED OR go_library_shared) + if(go_library_SHARED OR go_library_shared) set(BUILD_MODE "-buildmode=c-shared") - set(${TARGET_NAME}_LIB_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") + set(${TARGET_NAME}_LIB_NAME + "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE STRING "output library name for target ${TARGET_NAME}") else() set(BUILD_MODE "-buildmode=c-archive") - set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") + set(${TARGET_NAME}_LIB_NAME + "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" + CACHE STRING "output library name for target ${TARGET_NAME}") endif() set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) # This custom command will always run since it depends on a not # existing file. - add_custom_command( - OUTPUT dummy_rebulid_${TARGET_NAME} - COMMAND cmake -E touch ${dummyfile} - ) + add_custom_command(OUTPUT dummy_rebulid_${TARGET_NAME} COMMAND cmake -E touch + ${dummyfile}) # Create a custom target that depends on the custom command output # file, so the custom command can be referenced as a dependency by # `add_dependencies`. - add_custom_target(rebuild_${TARGET_NAME} - DEPENDS dummy_rebulid_${TARGET_NAME} - ) + add_custom_target(rebuild_${TARGET_NAME} DEPENDS dummy_rebulid_${TARGET_NAME}) # Add dummy code to support `make target_name` under Terminal Command - file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME} = \"${dummyfile}\";") - if (go_library_SHARED OR go_library_shared) + file(WRITE ${dummyfile} + "const char *dummy_${TARGET_NAME} = \"${dummyfile}\";") + if(go_library_SHARED OR go_library_shared) add_library(${TARGET_NAME} SHARED ${dummyfile}) else() add_library(${TARGET_NAME} STATIC ${dummyfile}) @@ -777,17 +942,26 @@ function(go_library TARGET_NAME) # rebuild will always happen. add_dependencies(${TARGET_NAME} rebuild_${TARGET_NAME}) - set(${TARGET_NAME}_LIB_PATH "${CMAKE_CURRENT_BINARY_DIR}/${${TARGET_NAME}_LIB_NAME}" CACHE STRING "output library path for target ${TARGET_NAME}") + set(${TARGET_NAME}_LIB_PATH + "${CMAKE_CURRENT_BINARY_DIR}/${${TARGET_NAME}_LIB_NAME}" + CACHE STRING "output library path for target ${TARGET_NAME}") - file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") - string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + file( + GLOB GO_SOURCE + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "*.go") + string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR + ${CMAKE_CURRENT_SOURCE_DIR}) - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + add_custom_command( + TARGET ${TARGET_NAME} + POST_BUILD COMMAND rm "${${TARGET_NAME}_LIB_PATH}" # Golang build source code - COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} - -o "${${TARGET_NAME}_LIB_PATH}" - "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}" + COMMAND + GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} -o + "${${TARGET_NAME}_LIB_PATH}" + "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}" # must run under GOPATH WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") add_dependencies(${TARGET_NAME} go_vendor) @@ -797,15 +971,21 @@ function(go_binary TARGET_NAME) set(options OPTIONAL) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR + ${CMAKE_CURRENT_SOURCE_DIR}) - add_custom_command(OUTPUT ${TARGET_NAME}_timestamp - COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build - -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" - "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}" + add_custom_command( + OUTPUT ${TARGET_NAME}_timestamp + COMMAND + env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build -o + "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" + "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}" WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") - add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS}) + add_custom_target( + ${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp + ${go_binary_DEPS}) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin) check_coverage_opt(${TARGET_NAME} ${go_binary_SRCS}) @@ -816,15 +996,21 @@ function(go_test TARGET_NAME) set(options OPTIONAL) set(oneValueArgs "") set(multiValueArgs DEPS) - cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR + ${CMAKE_CURRENT_SOURCE_DIR}) add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS}) - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race - -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" - ".${CMAKE_CURRENT_SOURCE_REL_DIR}" + add_custom_command( + TARGET ${TARGET_NAME} + POST_BUILD + COMMAND + env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race -c -o + "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" + ".${CMAKE_CURRENT_SOURCE_REL_DIR}" WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") - add_test(NAME ${TARGET_NAME} + add_test( + NAME ${TARGET_NAME} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endfunction(go_test) @@ -835,7 +1021,9 @@ endfunction(go_test) function(paddle_protobuf_generate_cpp SRCS HDRS) if(NOT ARGN) - message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files") + message( + SEND_ERROR + "Error: paddle_protobuf_generate_cpp() called without any proto files") return() endif() @@ -852,40 +1040,45 @@ function(paddle_protobuf_generate_cpp SRCS HDRS) list(APPEND ${HDRS} "${_protobuf_protoc_hdr}") add_custom_command( - OUTPUT "${_protobuf_protoc_src}" - "${_protobuf_protoc_hdr}" - + OUTPUT "${_protobuf_protoc_src}" "${_protobuf_protoc_hdr}" COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} - -I${CMAKE_CURRENT_SOURCE_DIR} - --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} -I${CMAKE_CURRENT_SOURCE_DIR} + --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} # Set `EXTERN_PROTOBUF_DEPEND` only if need to compile `protoc.exe`. DEPENDS ${ABS_FIL} ${EXTERN_PROTOBUF_DEPEND} COMMENT "Running C++ protocol buffer compiler on ${FIL}" - VERBATIM ) + VERBATIM) endforeach() set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) - set(${SRCS} ${${SRCS}} PARENT_SCOPE) - set(${HDRS} ${${HDRS}} PARENT_SCOPE) + set(${SRCS} + ${${SRCS}} + PARENT_SCOPE) + set(${HDRS} + ${${HDRS}} + PARENT_SCOPE) endfunction() - function(proto_library TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) set(proto_srcs) set(proto_hdrs) paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) - cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf) + cc_library( + ${TARGET_NAME} + SRCS ${proto_srcs} + DEPS ${proto_library_DEPS} protobuf) add_dependencies(extern_xxhash ${TARGET_NAME}) endfunction() function(py_proto_compile TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS) - cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) set(py_srcs) protobuf_generate_python(py_srcs ${py_proto_compile_SRCS}) add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs} protobuf) @@ -896,29 +1089,37 @@ function(py_test TARGET_NAME) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS ENVS) - cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - if(WITH_COVERAGE AND NOT (WITH_INCREMENTAL_COVERAGE AND "$ENV{PADDLE_GIT_DIFF_PY_FILE}" STREQUAL "")) - add_test(NAME ${TARGET_NAME} - COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true - FLAGS_cpu_deterministic=true - PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} - COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data - ${PYTHON_EXECUTABLE} -m coverage run --branch -p ${py_test_SRCS} ${py_test_ARGS} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + if(WITH_COVERAGE AND NOT (WITH_INCREMENTAL_COVERAGE + AND "$ENV{PADDLE_GIT_DIFF_PY_FILE}" STREQUAL "")) + add_test( + NAME ${TARGET_NAME} + COMMAND + ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true + FLAGS_cudnn_deterministic=true FLAGS_cpu_deterministic=true + PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} + COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data + ${PYTHON_EXECUTABLE} -m coverage run --branch -p ${py_test_SRCS} + ${py_test_ARGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) else() - add_test(NAME ${TARGET_NAME} - COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true - FLAGS_cpu_deterministic=true ${py_test_ENVS} - ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + add_test( + NAME ${TARGET_NAME} + COMMAND + ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true + FLAGS_cudnn_deterministic=true FLAGS_cpu_deterministic=true + ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} + ${py_test_ARGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() - if (WIN32) - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) + if(WIN32) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) endif() - if (APPLE) - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 20) + if(APPLE) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 20) endif() endif() @@ -936,7 +1137,8 @@ function(grpc_library TARGET_NAME) set(oneValueArgs PROTO) set(multiValueArgs SRCS DEPS) set(options "") - cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) message(STATUS "generating grpc ${grpc_library_PROTO}") @@ -953,36 +1155,43 @@ function(grpc_library TARGET_NAME) cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}") add_custom_command( - OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}" - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} - ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" - --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}" - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} - ARGS --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" - "${ABS_PROTO}" - DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc) + OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}" + COMMAND + ${PROTOBUF_PROTOC_EXECUTABLE} ARGS --grpc_out + "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" + --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ARGS --cpp_out + "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" "${ABS_PROTO}" + DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc) # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it # as compiler warnings instead of error. Should try remove the warnings also. set_source_files_properties( ${grpc_grpc_srcs} PROPERTIES - COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + COMPILE_FLAGS + "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor" + ) cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}") set_source_files_properties( ${grpc_library_SRCS} PROPERTIES - COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") + COMPILE_FLAGS + "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor" + ) + cc_library( + "${TARGET_NAME}" + SRCS "${grpc_library_SRCS}" + DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") endfunction() - function(brpc_library TARGET_NAME) set(oneValueArgs PROTO) set(multiValueArgs SRCS DEPS) set(options "") - cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) message(STATUS "generating brpc ${brpc_library_PROTO}") @@ -992,7 +1201,10 @@ function(brpc_library TARGET_NAME) paddle_protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}") cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}") - cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}") + cc_library( + "${TARGET_NAME}" + SRCS "${brpc_library_SRCS}" + DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}") endfunction() # copy_if_different from src_file to dst_file At the beginning of the build. @@ -1000,11 +1212,11 @@ function(copy_if_different src_file dst_file) get_filename_component(FILE_NAME ${dst_file} NAME_WE) # this is a dummy target for custom command, should always be run firstly to update ${dst_file} - add_custom_target(copy_${FILE_NAME}_command ALL - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src_file} ${dst_file} - COMMENT "copy_if_different ${dst_file}" - VERBATIM - ) + add_custom_target( + copy_${FILE_NAME}_command ALL + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src_file} ${dst_file} + COMMENT "copy_if_different ${dst_file}" + VERBATIM) add_dependencies(extern_glog copy_${FILE_NAME}_command) endfunction() @@ -1019,7 +1231,8 @@ function(generate_dummy_static_lib) set(options "") set(oneValueArgs LIB_NAME FILE_PATH GENERATOR CONTENT) set(multiValueArgs "") - cmake_parse_arguments(dummy "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(dummy "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) if(NOT dummy_LIB_NAME) message(FATAL_ERROR "You must provide a static lib name.") endif() @@ -1033,45 +1246,55 @@ function(generate_dummy_static_lib) set(dummy_CONTENT "${dummy_LIB_NAME}_dummy.c for lib ${dummy_LIB_NAME}") endif() - configure_file(${PROJECT_SOURCE_DIR}/cmake/dummy.c.in ${dummy_FILE_PATH} @ONLY) + configure_file(${PROJECT_SOURCE_DIR}/cmake/dummy.c.in ${dummy_FILE_PATH} + @ONLY) add_library(${dummy_LIB_NAME} STATIC ${dummy_FILE_PATH}) endfunction() function(math_library TARGET) - # math_library is a function to create math library. - # The interface is the same as cc_library. - # But it handle split GPU/CPU code and link some common library. - set(cc_srcs) - set(cu_srcs) - set(hip_srcs) - set(math_common_deps device_context framework_proto enforce) - if (WITH_GPU) - if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) - list(APPEND math_common_deps cub) - else() - list(APPEND math_common_deps) - endif() + # math_library is a function to create math library. + # The interface is the same as cc_library. + # But it handle split GPU/CPU code and link some common library. + set(cc_srcs) + set(cu_srcs) + set(hip_srcs) + set(math_common_deps device_context framework_proto enforce) + if(WITH_GPU) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + list(APPEND math_common_deps cub) + else() + list(APPEND math_common_deps) endif() - set(multiValueArgs DEPS) - cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) + endif() + set(multiValueArgs DEPS) + cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) - list(APPEND cc_srcs ${TARGET}.cc) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) - list(APPEND cu_srcs ${TARGET}.cu) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) - list(APPEND cu_srcs ${TARGET}.cu.cc) - endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + list(APPEND cc_srcs ${TARGET}.cc) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${TARGET}.cu) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) + list(APPEND cu_srcs ${TARGET}.cu.cc) + endif() - list(LENGTH cc_srcs cc_srcs_len) - if (WITH_GPU) - nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) - elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) - elseif(${cc_srcs_len} GREATER 0) - cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) - endif() + list(LENGTH cc_srcs cc_srcs_len) + if(WITH_GPU) + nv_library( + ${TARGET} + SRCS ${cc_srcs} ${cu_srcs} + DEPS ${math_library_DEPS} ${math_common_deps}) + elseif(WITH_ROCM) + hip_library( + ${TARGET} + SRCS ${cc_srcs} ${cu_srcs} + DEPS ${math_library_DEPS} ${math_common_deps}) + elseif(${cc_srcs_len} GREATER 0) + cc_library( + ${TARGET} + SRCS ${cc_srcs} + DEPS ${math_library_DEPS} ${math_common_deps}) + endif() endfunction() diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 14cb9e6f6be5a..3514882c944de 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -1,15 +1,27 @@ if(NOT WITH_ROCM) - return() + return() endif() if(NOT DEFINED ENV{ROCM_PATH}) - set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed") - set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed") - set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE PATH "Path to which clang has been installed") + set(ROCM_PATH + "/opt/rocm" + CACHE PATH "Path to which ROCm has been installed") + set(HIP_PATH + ${ROCM_PATH}/hip + CACHE PATH "Path to which HIP has been installed") + set(HIP_CLANG_PATH + ${ROCM_PATH}/llvm/bin + CACHE PATH "Path to which clang has been installed") else() - set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed") - set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed") - set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE PATH "Path to which clang has been installed") + set(ROCM_PATH + $ENV{ROCM_PATH} + CACHE PATH "Path to which ROCm has been installed") + set(HIP_PATH + ${ROCM_PATH}/hip + CACHE PATH "Path to which HIP has been installed") + set(HIP_CLANG_PATH + ${ROCM_PATH}/llvm/bin + CACHE PATH "Path to which clang has been installed") endif() set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) @@ -18,30 +30,39 @@ include_directories(${ROCM_PATH}/include) message(STATUS "HIP version: ${HIP_VERSION}") message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}") -macro(find_hip_version hip_header_file) - file(READ ${hip_header_file} HIP_VERSION_FILE_CONTENTS) +macro(find_hip_version hip_header_file) + file(READ ${hip_header_file} HIP_VERSION_FILE_CONTENTS) - string(REGEX MATCH "define HIP_VERSION_MAJOR +([0-9]+)" HIP_MAJOR_VERSION - "${HIP_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define HIP_VERSION_MAJOR +([0-9]+)" "\\1" - HIP_MAJOR_VERSION "${HIP_MAJOR_VERSION}") - string(REGEX MATCH "define HIP_VERSION_MINOR +([0-9]+)" HIP_MINOR_VERSION - "${HIP_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define HIP_VERSION_MINOR +([0-9]+)" "\\1" - HIP_MINOR_VERSION "${HIP_MINOR_VERSION}") - string(REGEX MATCH "define HIP_VERSION_PATCH +([0-9]+)" HIP_PATCH_VERSION - "${HIP_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define HIP_VERSION_PATCH +([0-9]+)" "\\1" - HIP_PATCH_VERSION "${HIP_PATCH_VERSION}") + string(REGEX MATCH "define HIP_VERSION_MAJOR +([0-9]+)" HIP_MAJOR_VERSION + "${HIP_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define HIP_VERSION_MAJOR +([0-9]+)" "\\1" + HIP_MAJOR_VERSION "${HIP_MAJOR_VERSION}") + string(REGEX MATCH "define HIP_VERSION_MINOR +([0-9]+)" HIP_MINOR_VERSION + "${HIP_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define HIP_VERSION_MINOR +([0-9]+)" "\\1" + HIP_MINOR_VERSION "${HIP_MINOR_VERSION}") + string(REGEX MATCH "define HIP_VERSION_PATCH +([0-9]+)" HIP_PATCH_VERSION + "${HIP_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define HIP_VERSION_PATCH +([0-9]+)" "\\1" + HIP_PATCH_VERSION "${HIP_PATCH_VERSION}") - if(NOT HIP_MAJOR_VERSION) - set(HIP_VERSION "???") - message(WARNING "Cannot find HIP version in ${HIP_PATH}/include/hip/hip_version.h") - else() - math(EXPR HIP_VERSION "${HIP_MAJOR_VERSION} * 10000000 + ${HIP_MINOR_VERSION} * 100000 + ${HIP_PATCH_VERSION}") - message(STATUS "Current HIP header is ${HIP_PATH}/include/hip/hip_version.h " - "Current HIP version is v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}.${HIP_PATCH_VERSION}. ") - endif() + if(NOT HIP_MAJOR_VERSION) + set(HIP_VERSION "???") + message( + WARNING "Cannot find HIP version in ${HIP_PATH}/include/hip/hip_version.h" + ) + else() + math( + EXPR + HIP_VERSION + "${HIP_MAJOR_VERSION} * 10000000 + ${HIP_MINOR_VERSION} * 100000 + ${HIP_PATCH_VERSION}" + ) + message( + STATUS + "Current HIP header is ${HIP_PATH}/include/hip/hip_version.h " + "Current HIP version is v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}.${HIP_PATCH_VERSION}. " + ) + endif() endmacro() find_hip_version(${HIP_PATH}/include/hip/hip_version.h) @@ -66,7 +87,8 @@ find_package_and_include(rocfft) # set CXX flags for HIP set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP") +set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP") set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP) # define HIP_CXX_FLAGS @@ -103,7 +125,6 @@ list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc) list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906) list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908) - if(HIP_COMPILER STREQUAL clang) set(hip_library_name amdhip64) else() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index a52047e16167d..bf69ddc8fb49a 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -13,290 +13,366 @@ # limitations under the License. # make package for paddle fluid shared and static library -set(PADDLE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_install_dir" CACHE STRING - "A path setting paddle shared and static libraries") +set(PADDLE_INSTALL_DIR + "${CMAKE_BINARY_DIR}/paddle_install_dir" + CACHE STRING "A path setting paddle shared and static libraries") + +set(PADDLE_INFERENCE_INSTALL_DIR + "${CMAKE_BINARY_DIR}/paddle_inference_install_dir" + CACHE STRING "A path setting paddle inference shared and static libraries") -set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir" CACHE STRING - "A path setting paddle inference shared and static libraries") - # At present, the size of static lib in Windows is very large, # so we need to crop the library size. if(WIN32) - #todo: remove the option - option(WITH_STATIC_LIB "Compile demo with static/shared library, default use dynamic." OFF) - if(NOT PYTHON_EXECUTABLE) - FIND_PACKAGE(PythonInterp REQUIRED) - endif() + #todo: remove the option + option(WITH_STATIC_LIB + "Compile demo with static/shared library, default use dynamic." OFF) + if(NOT PYTHON_EXECUTABLE) + find_package(PythonInterp REQUIRED) + endif() endif() set(COPY_SCRIPT_DIR ${PADDLE_SOURCE_DIR}/cmake) function(copy TARGET) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DSTS) - cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) - list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) - if (NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len}) - message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers") - endif () - math(EXPR len "${copy_lib_SRCS_len} - 1") - foreach (index RANGE ${len}) - list(GET copy_lib_SRCS ${index} src) - list(GET copy_lib_DSTS ${index} dst) - if (WIN32) #windows - file(TO_NATIVE_PATH ${src} native_src) - file(TO_NATIVE_PATH ${dst} native_dst) - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py ${native_src} ${native_dst}) - else (WIN32) #not windows - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND mkdir -p "${dst}" - COMMAND cp -r "${src}" "${dst}" - COMMENT "copying ${src} -> ${dst}") - endif (WIN32) # not windows - endforeach () + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DSTS) + cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) + list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) + if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len}) + message( + FATAL_ERROR + "${TARGET} source numbers are not equal to destination numbers") + endif() + math(EXPR len "${copy_lib_SRCS_len} - 1") + foreach(index RANGE ${len}) + list(GET copy_lib_SRCS ${index} src) + list(GET copy_lib_DSTS ${index} dst) + if(WIN32) #windows + file(TO_NATIVE_PATH ${src} native_src) + file(TO_NATIVE_PATH ${dst} native_dst) + add_custom_command( + TARGET ${TARGET} + POST_BUILD + COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py + ${native_src} ${native_dst}) + else(WIN32) #not windows + add_custom_command( + TARGET ${TARGET} + POST_BUILD + COMMAND mkdir -p "${dst}" + COMMAND cp -r "${src}" "${dst}" + COMMENT "copying ${src} -> ${dst}") + endif(WIN32) # not windows + endforeach() endfunction() -function(copy_part_of_thrid_party TARGET DST) - if(${CBLAS_PROVIDER} STREQUAL MKLML) - set(dst_dir "${DST}/third_party/install/mklml") - if(WIN32) - copy(${TARGET} - SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_SHARED_LIB} - ${MKLML_SHARED_IOMP_LIB} ${MKLML_INC_DIR} - DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib - ${dst_dir}/lib ${dst_dir}) - else() - copy(${TARGET} - SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} - DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}) - if(WITH_STRIP) - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND strip -s ${dst_dir}/lib/libiomp5.so - COMMAND strip -s ${dst_dir}/lib/libmklml_intel.so - COMMENT "striping libiomp5.so\nstriping libmklml_intel.so") - endif() - endif() - elseif(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) - set(dst_dir "${DST}/third_party/install/openblas") - if(WIN32) - copy(${TARGET} - SRCS ${CBLAS_INSTALL_DIR}/lib ${OPENBLAS_SHARED_LIB} ${CBLAS_INSTALL_DIR}/include - DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}) - else() - copy(${TARGET} - SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include - DSTS ${dst_dir} ${dst_dir}) - endif() - endif() - - if(WITH_MKLDNN) - set(dst_dir "${DST}/third_party/install/mkldnn") - if(WIN32) - copy(${TARGET} - SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB} - DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib) - else() - copy(${TARGET} - SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} - DSTS ${dst_dir} ${dst_dir}/lib) - if(WITH_STRIP) - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND strip -s ${dst_dir}/lib/libmkldnn.so.0 - COMMENT "striping libmkldnn.so.0") - endif() - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E create_symlink libmkldnn.so.0 ${dst_dir}/lib/libdnnl.so.1 - COMMAND ${CMAKE_COMMAND} -E create_symlink libmkldnn.so.0 ${dst_dir}/lib/libdnnl.so.2 - COMMENT "Make a symbol link of libmkldnn.so.0") - endif() +function(copy_part_of_thrid_party TARGET DST) + if(${CBLAS_PROVIDER} STREQUAL MKLML) + set(dst_dir "${DST}/third_party/install/mklml") + if(WIN32) + copy( + ${TARGET} + SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_SHARED_LIB} + ${MKLML_SHARED_IOMP_LIB} ${MKLML_INC_DIR} + DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib + ${dst_dir}) + else() + copy( + ${TARGET} + SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} + DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}) + if(WITH_STRIP) + add_custom_command( + TARGET ${TARGET} + POST_BUILD + COMMAND strip -s ${dst_dir}/lib/libiomp5.so + COMMAND strip -s ${dst_dir}/lib/libmklml_intel.so + COMMENT "striping libiomp5.so\nstriping libmklml_intel.so") + endif() endif() - - if (WITH_ONNXRUNTIME) - set(dst_dir "${DST}/third_party/install/onnxruntime") - copy(${TARGET} - SRCS ${ONNXRUNTIME_INC_DIR} ${ONNXRUNTIME_LIB_DIR} - DSTS ${dst_dir} ${dst_dir}) - - set(dst_dir "${DST}/third_party/install/paddle2onnx") - if(WIN32) - copy(${TARGET} - SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_SHARED_LIB} ${PADDLE2ONNX_LIB} - DSTS ${dst_dir}/include ${dst_dir}/lib ${dst_dir}/lib) - else() - copy(${TARGET} - SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB} - DSTS ${dst_dir}/include ${dst_dir}/lib) - endif() + elseif(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) + set(dst_dir "${DST}/third_party/install/openblas") + if(WIN32) + copy( + ${TARGET} + SRCS ${CBLAS_INSTALL_DIR}/lib ${OPENBLAS_SHARED_LIB} + ${CBLAS_INSTALL_DIR}/include + DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}) + else() + copy( + ${TARGET} + SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include + DSTS ${dst_dir} ${dst_dir}) endif() - - set(dst_dir "${DST}/third_party/install/gflags") - copy(${TARGET} - SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib) - - set(dst_dir "${DST}/third_party/install/glog") - copy(${TARGET} - SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib) - - set(dst_dir "${DST}/third_party/install/utf8proc") - copy(${TARGET} - SRCS ${UTF8PROC_INSTALL_DIR}/include ${UTF8PROC_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib) - - if (WITH_CRYPTO) - set(dst_dir "${DST}/third_party/install/cryptopp") - copy(${TARGET} - SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib) + endif() + + if(WITH_MKLDNN) + set(dst_dir "${DST}/third_party/install/mkldnn") + if(WIN32) + copy( + ${TARGET} + SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB} + DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib) + else() + copy( + ${TARGET} + SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} + DSTS ${dst_dir} ${dst_dir}/lib) + if(WITH_STRIP) + add_custom_command( + TARGET ${TARGET} + POST_BUILD + COMMAND strip -s ${dst_dir}/lib/libmkldnn.so.0 + COMMENT "striping libmkldnn.so.0") + endif() + add_custom_command( + TARGET ${TARGET} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink libmkldnn.so.0 + ${dst_dir}/lib/libdnnl.so.1 + COMMAND ${CMAKE_COMMAND} -E create_symlink libmkldnn.so.0 + ${dst_dir}/lib/libdnnl.so.2 + COMMENT "Make a symbol link of libmkldnn.so.0") endif() - - set(dst_dir "${DST}/third_party/install/xxhash") - copy(${TARGET} - SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib) - - if (NOT PROTOBUF_FOUND OR WIN32) - set(dst_dir "${DST}/third_party/install/protobuf") - copy(${TARGET} - SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} - DSTS ${dst_dir} ${dst_dir}/lib) - endif () - - if (LITE_BINARY_DIR) - set(dst_dir "${DST}/third_party/install/lite") - copy(${TARGET} - SRCS ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/* - DSTS ${dst_dir}) + endif() + + if(WITH_ONNXRUNTIME) + set(dst_dir "${DST}/third_party/install/onnxruntime") + copy( + ${TARGET} + SRCS ${ONNXRUNTIME_INC_DIR} ${ONNXRUNTIME_LIB_DIR} + DSTS ${dst_dir} ${dst_dir}) + + set(dst_dir "${DST}/third_party/install/paddle2onnx") + if(WIN32) + copy( + ${TARGET} + SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_SHARED_LIB} + ${PADDLE2ONNX_LIB} + DSTS ${dst_dir}/include ${dst_dir}/lib ${dst_dir}/lib) + else() + copy( + ${TARGET} + SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB} + DSTS ${dst_dir}/include ${dst_dir}/lib) endif() + endif() + + set(dst_dir "${DST}/third_party/install/gflags") + copy( + ${TARGET} + SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + + set(dst_dir "${DST}/third_party/install/glog") + copy( + ${TARGET} + SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + + set(dst_dir "${DST}/third_party/install/utf8proc") + copy( + ${TARGET} + SRCS ${UTF8PROC_INSTALL_DIR}/include ${UTF8PROC_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + + if(WITH_CRYPTO) + set(dst_dir "${DST}/third_party/install/cryptopp") + copy( + ${TARGET} + SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + endif() + + set(dst_dir "${DST}/third_party/install/xxhash") + copy( + ${TARGET} + SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + + if(NOT PROTOBUF_FOUND OR WIN32) + set(dst_dir "${DST}/third_party/install/protobuf") + copy( + ${TARGET} + SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} + DSTS ${dst_dir} ${dst_dir}/lib) + endif() + + if(LITE_BINARY_DIR) + set(dst_dir "${DST}/third_party/install/lite") + copy( + ${TARGET} + SRCS ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/* + DSTS ${dst_dir}) + endif() endfunction() # inference library for only inference -set(inference_lib_deps third_party paddle_inference paddle_inference_c paddle_inference_shared paddle_inference_c_shared) +set(inference_lib_deps third_party paddle_inference paddle_inference_c + paddle_inference_shared paddle_inference_c_shared) add_custom_target(inference_lib_dist DEPENDS ${inference_lib_deps}) - set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/threadpool") -copy(inference_lib_dist - SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h - DSTS ${dst_dir}) +copy( + inference_lib_dist + SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h + DSTS ${dst_dir}) # GPU must copy externalErrorMsg.pb -IF(WITH_GPU) - set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/externalError/data") - copy(inference_lib_dist - SRCS ${externalError_INCLUDE_DIR} - DSTS ${dst_dir}) -ENDIF() - -IF(WITH_XPU) - set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/install/xpu") - copy(inference_lib_dist - SRCS ${XPU_INC_DIR} ${XPU_LIB_DIR} - DSTS ${dst_dir} ${dst_dir}) -ENDIF() +if(WITH_GPU) + set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/externalError/data") + copy( + inference_lib_dist + SRCS ${externalError_INCLUDE_DIR} + DSTS ${dst_dir}) +endif() + +if(WITH_XPU) + set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/install/xpu") + copy( + inference_lib_dist + SRCS ${XPU_INC_DIR} ${XPU_LIB_DIR} + DSTS ${dst_dir} ${dst_dir}) +endif() # CMakeCache Info -copy(inference_lib_dist - SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}) +copy( + inference_lib_dist + SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}) copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR}) set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") if(WIN32) - if(WITH_STATIC_LIB) - set(paddle_inference_lib $/libpaddle_inference.lib - $/paddle_inference.*) - else() - set(paddle_inference_lib $/paddle_inference.dll - $/paddle_inference.lib) - endif() - copy(inference_lib_dist - SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib} - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib - ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) + if(WITH_STATIC_LIB) + set(paddle_inference_lib + $/libpaddle_inference.lib + $/paddle_inference.*) + else() + set(paddle_inference_lib + $/paddle_inference.dll + $/paddle_inference.lib) + endif() + copy( + inference_lib_dist + SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib} + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) else(WIN32) - set(paddle_inference_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_inference.*) - copy(inference_lib_dist - SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib} - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) + set(paddle_inference_lib + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_inference.*) + copy( + inference_lib_dist + SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib} + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) endif(WIN32) -copy(inference_lib_dist - SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/internal) -copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io/crypto/cipher.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/) +copy( + inference_lib_dist + SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/internal) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io/crypto/cipher.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/) include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io) # copy api headers for phi & custom op -copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/api/ext/*.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext/) -copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/api/include/*.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include/) -copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/api/all.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/) -copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/common/*.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common/) -copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/core/macros.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core/) -copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/core/visit_type.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core/) -copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/) -copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/optional.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/) -copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/none.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/) -copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/flat_hash_map.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/) -copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/api/ext/*.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/api/include/*.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include/ +) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/api/all.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/common/*.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/core/macros.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/core/visit_type.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/optional.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/none.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/flat_hash_map.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) # the header file of phi is copied to the experimental directory, # the include path of phi needs to be changed to adapt to inference api path -add_custom_command(TARGET inference_lib_dist POST_BUILD - COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/phi_header.cmake" - COMMENT "Change phi header include path to adapt to inference api path") +add_custom_command( + TARGET inference_lib_dist + POST_BUILD + COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/phi_header.cmake" + COMMENT "Change phi header include path to adapt to inference api path") # CAPI inference library for only inference -set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING -"A path setting CAPI paddle inference shared") +set(PADDLE_INFERENCE_C_INSTALL_DIR + "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" + CACHE STRING "A path setting CAPI paddle inference shared") copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_C_INSTALL_DIR}) set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") if(WIN32) - set(paddle_inference_c_lib $/paddle_inference_c.*) + set(paddle_inference_c_lib + $/paddle_inference_c.*) else(WIN32) - set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.*) + set(paddle_inference_c_lib + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.* + ) endif(WIN32) -copy(inference_lib_dist - SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib} - DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib) +copy( + inference_lib_dist + SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib} + DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include + ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib) if(WITH_STRIP AND NOT WIN32) - add_custom_command(TARGET inference_lib_dist POST_BUILD - COMMAND strip -s ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib/libpaddle_inference_c.so - COMMAND strip -s ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib/libpaddle_inference.so - COMMENT "striping libpaddle_inference_c.so\nstriping libpaddle_inference.so") + add_custom_command( + TARGET inference_lib_dist + POST_BUILD + COMMAND + strip -s + ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib/libpaddle_inference_c.so + COMMAND strip -s + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib/libpaddle_inference.so + COMMENT "striping libpaddle_inference_c.so\nstriping libpaddle_inference.so" + ) endif() # fluid library for both train and inference @@ -306,36 +382,55 @@ add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps}) set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid") set(module "inference") if(WIN32) - copy(fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_inference_lib} - DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} - ) - else() - copy(fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_inference_lib} - DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} - ) + copy( + fluid_lib_dist + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h + ${paddle_inference_lib} + DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} + ${dst_dir}/${module}) +else() + copy( + fluid_lib_dist + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h + ${paddle_inference_lib} + DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}) endif() set(module "framework") set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto) add_dependencies(fluid_lib_dist ${framework_lib_deps}) -copy(fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h - ${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}/ir/memory_optimize_pass ${dst_dir}/${module}/ir ${dst_dir}/${module}/fleet) +copy( + fluid_lib_dist + SRCS ${src_dir}/${module}/*.h + ${src_dir}/${module}/details/*.h + ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h + ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h + ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h + ${src_dir}/${module}/ir/memory_optimize_pass/*.h + ${src_dir}/${module}/ir/*.h + ${src_dir}/${module}/fleet/*.h + DSTS ${dst_dir}/${module} + ${dst_dir}/${module}/details + ${dst_dir}/${module} + ${dst_dir}/${module} + ${dst_dir}/${module} + ${dst_dir}/${module}/ir/memory_optimize_pass + ${dst_dir}/${module}/ir + ${dst_dir}/${module}/fleet) set(module "operators") -copy(fluid_lib_dist - SRCS ${src_dir}/${module}/reader/blocking_queue.h - DSTS ${dst_dir}/${module}/reader/ - ) +copy( + fluid_lib_dist + SRCS ${src_dir}/${module}/reader/blocking_queue.h + DSTS ${dst_dir}/${module}/reader/) set(module "memory") -copy(fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation - ) +copy( + fluid_lib_dist + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h + ${src_dir}/${module}/allocation/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail + ${dst_dir}/${module}/allocation) set(module "platform") set(platform_lib_deps profiler_proto errors) @@ -344,99 +439,113 @@ if(WITH_GPU) endif(WITH_GPU) add_dependencies(fluid_lib_dist ${platform_lib_deps}) -copy(fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} - ) +copy( + fluid_lib_dist + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h + ${src_dir}/${module}/details/*.h + ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload + ${dst_dir}/${module}/details ${dst_dir}/${module}) set(module "string") -copy(fluid_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/*.h ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/tinyformat/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat - ) +copy( + fluid_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/*.h + ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/tinyformat/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat) set(module "imperative") -copy(fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/jit/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/jit - ) +copy( + fluid_lib_dist + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/jit/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/jit) set(module "pybind") -copy(fluid_lib_dist - SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h - DSTS ${dst_dir}/${module} - ) +copy( + fluid_lib_dist + SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h + DSTS ${dst_dir}/${module}) set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/eigen3") -copy(inference_lib_dist - SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen - DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported) +copy( + inference_lib_dist + SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src + ${EIGEN_INCLUDE_DIR}/unsupported/Eigen + DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported) set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/boost") -copy(inference_lib_dist - SRCS ${BOOST_INCLUDE_DIR}/boost - DSTS ${dst_dir}) +copy( + inference_lib_dist + SRCS ${BOOST_INCLUDE_DIR}/boost + DSTS ${dst_dir}) set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/dlpack") -copy(inference_lib_dist - SRCS ${DLPACK_INCLUDE_DIR}/dlpack - DSTS ${dst_dir}) +copy( + inference_lib_dist + SRCS ${DLPACK_INCLUDE_DIR}/dlpack + DSTS ${dst_dir}) set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/install/zlib") -copy(inference_lib_dist - SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib) - +copy( + inference_lib_dist + SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) # CMakeCache Info -copy(fluid_lib_dist - SRCS ${PADDLE_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt - DSTS ${PADDLE_INSTALL_DIR} ${PADDLE_INSTALL_DIR} - ) +copy( + fluid_lib_dist + SRCS ${PADDLE_INFERENCE_INSTALL_DIR}/third_party + ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt + DSTS ${PADDLE_INSTALL_DIR} ${PADDLE_INSTALL_DIR}) # paddle fluid version function(version version_file) - execute_process( - COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} - OUTPUT_VARIABLE PADDLE_GIT_COMMIT) - file(WRITE ${version_file} - "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" - "WITH_MKL: ${WITH_MKL}\n" - "WITH_MKLDNN: ${WITH_MKLDNN}\n" - "WITH_GPU: ${WITH_GPU}\n" - "WITH_ROCM: ${WITH_ROCM}\n" - "WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n" - "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n" - "WITH_IPU: ${WITH_IPU}\n") - if(WITH_GPU) - file(APPEND ${version_file} - "CUDA version: ${CUDA_VERSION}\n" - "CUDNN version: v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}\n") - endif() - if(WITH_ROCM) - file(APPEND ${version_file} - "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n" - "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n") - endif() - if(WITH_ASCEND_CL) - file(APPEND ${version_file} - "Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n" - "Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n") - endif() - if(WITH_IPU) - file(APPEND ${version_file} - "PopART version: ${POPART_VERSION}\n") - endif() - file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") - if(TENSORRT_FOUND) - file(APPEND ${version_file} - "WITH_TENSORRT: ${TENSORRT_FOUND}\n" "TensorRT version: v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION}\n") - endif() - if(WITH_LITE) - file(APPEND ${version_file} "WITH_LITE: ${WITH_LITE}\n" "LITE_GIT_TAG: ${LITE_GIT_TAG}\n") - endif() - + execute_process( + COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_GIT_COMMIT) + file( + WRITE ${version_file} + "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" + "WITH_MKL: ${WITH_MKL}\n" + "WITH_MKLDNN: ${WITH_MKLDNN}\n" + "WITH_GPU: ${WITH_GPU}\n" + "WITH_ROCM: ${WITH_ROCM}\n" + "WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n" + "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n" + "WITH_IPU: ${WITH_IPU}\n") + if(WITH_GPU) + file(APPEND ${version_file} + "CUDA version: ${CUDA_VERSION}\n" + "CUDNN version: v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}\n") + endif() + if(WITH_ROCM) + file(APPEND ${version_file} + "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n" + "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n") + endif() + if(WITH_ASCEND_CL) + file(APPEND ${version_file} + "Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n" + "Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n") + endif() + if(WITH_IPU) + file(APPEND ${version_file} "PopART version: ${POPART_VERSION}\n") + endif() + file(APPEND ${version_file} + "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") + if(TENSORRT_FOUND) + file( + APPEND ${version_file} + "WITH_TENSORRT: ${TENSORRT_FOUND}\n" + "TensorRT version: v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION}\n" + ) + endif() + if(WITH_LITE) + file(APPEND ${version_file} "WITH_LITE: ${WITH_LITE}\n" + "LITE_GIT_TAG: ${LITE_GIT_TAG}\n") + endif() + endfunction() version(${PADDLE_INSTALL_DIR}/version.txt) version(${PADDLE_INFERENCE_INSTALL_DIR}/version.txt) diff --git a/cmake/infrt_lib.cmake b/cmake/infrt_lib.cmake index 5b27c9d8400cc..21dcd0ef36d16 100644 --- a/cmake/infrt_lib.cmake +++ b/cmake/infrt_lib.cmake @@ -12,65 +12,74 @@ # See the License for the specific language governing permissions and # limitations under the License. -set(INFRT_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_infrt_install_dir" CACHE STRING - "A path setting paddle infrt shared and static libraries") - +set(INFRT_INSTALL_DIR + "${CMAKE_BINARY_DIR}/paddle_infrt_install_dir" + CACHE STRING "A path setting paddle infrt shared and static libraries") + function(copy TARGET) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DSTS) - cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DSTS) + cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) - list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) - list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) - if (NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len}) - message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers") - endif () - math(EXPR len "${copy_lib_SRCS_len} - 1") - foreach (index RANGE ${len}) - list(GET copy_lib_SRCS ${index} src) - list(GET copy_lib_DSTS ${index} dst) - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND mkdir -p "${dst}" - COMMAND cp -r "${src}" "${dst}" - COMMENT "copying ${src} -> ${dst}") - endforeach () + list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) + list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) + if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len}) + message( + FATAL_ERROR + "${TARGET} source numbers are not equal to destination numbers") + endif() + math(EXPR len "${copy_lib_SRCS_len} - 1") + foreach(index RANGE ${len}) + list(GET copy_lib_SRCS ${index} src) + list(GET copy_lib_DSTS ${index} dst) + add_custom_command( + TARGET ${TARGET} + POST_BUILD + COMMAND mkdir -p "${dst}" + COMMAND cp -r "${src}" "${dst}" + COMMENT "copying ${src} -> ${dst}") + endforeach() endfunction() -function(copy_part_of_thrid_party TARGET DST) - set(dst_dir "${DST}/third_party/install/glog") - copy(${TARGET} - SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib) +function(copy_part_of_thrid_party TARGET DST) + set(dst_dir "${DST}/third_party/install/glog") + copy( + ${TARGET} + SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) endfunction() # inference library for only inference set(infrt_lib_deps third_party infrt infrt_static) add_custom_target(infrt_lib_dist DEPENDS ${infrt_lib_deps}) - # CMakeCache Info -copy(infrt_lib_dist - SRCS ${CMAKE_BINARY_DIR}/CMakeCache.txt - DSTS ${INFRT_INSTALL_DIR}) +copy( + infrt_lib_dist + SRCS ${CMAKE_BINARY_DIR}/CMakeCache.txt + DSTS ${INFRT_INSTALL_DIR}) set(infrt_lib ${INFRT_BINARY_DIR}/libinfrt.*) -copy(infrt_lib_dist - SRCS ${INFRT_SOURCE_DIR}/api/infrt_api.h ${infrt_lib} - DSTS ${INFRT_INSTALL_DIR}/infrt/include ${INFRT_INSTALL_DIR}/infrt/lib) - +copy( + infrt_lib_dist + SRCS ${INFRT_SOURCE_DIR}/api/infrt_api.h ${infrt_lib} + DSTS ${INFRT_INSTALL_DIR}/infrt/include ${INFRT_INSTALL_DIR}/infrt/lib) -copy(infrt_lib_dist - SRCS ${INFRT_BINARY_DIR}/paddle/framework.pb.h - DSTS ${INFRT_INSTALL_DIR}/infrt/include/internal) +copy( + infrt_lib_dist + SRCS ${INFRT_BINARY_DIR}/paddle/framework.pb.h + DSTS ${INFRT_INSTALL_DIR}/infrt/include/internal) # paddle fluid version function(version version_file) - execute_process( - COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} - OUTPUT_VARIABLE PADDLE_GIT_COMMIT) - file(WRITE ${version_file} "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n") - file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") + execute_process( + COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_GIT_COMMIT) + file(WRITE ${version_file} "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n") + file(APPEND ${version_file} + "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") endfunction() version(${INFRT_INSTALL_DIR}/version.txt) diff --git a/cmake/init.cmake b/cmake/init.cmake index 0ebcdc8ceeebc..86c43cb233bfc 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -8,43 +8,44 @@ # MINSIZEREL: default: "-O2 -g -DNDEBUG" if(NOT WIN32) - set(CMAKE_C_FLAGS_DEBUG "-g") - set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG") - set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") - set(CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG") + set(CMAKE_C_FLAGS_DEBUG "-g") + set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") + set(CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG") - set(CMAKE_CXX_FLAGS_DEBUG "-g") - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") - set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") - - if(WITH_GPU) - set(CMAKE_CUDA_FLAGS_DEBUG "-g") - set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG") - set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") - set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG") - endif() + set(CMAKE_CXX_FLAGS_DEBUG "-g") + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") + set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") + + if(WITH_GPU) + set(CMAKE_CUDA_FLAGS_DEBUG "-g") + set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG") + set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") + set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG") + endif() else() - set(CMAKE_C_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1") - set(CMAKE_C_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG") - set(CMAKE_C_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG") - set(CMAKE_C_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG") + set(CMAKE_C_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1") + set(CMAKE_C_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG") + set(CMAKE_C_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG") - set(CMAKE_CXX_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1") - set(CMAKE_CXX_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG") - set(CMAKE_CXX_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG") + set(CMAKE_CXX_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1") + set(CMAKE_CXX_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG") + set(CMAKE_CXX_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG") - if(WITH_GPU) - set(CMAKE_CUDA_FLAGS_DEBUG "-Xcompiler=\"-MDd -Zi -Ob0 -Od /RTC1\"") - set(CMAKE_CUDA_FLAGS_RELEASE "-Xcompiler=\"-MD -O2 -Ob2\" -DNDEBUG") - set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-Xcompiler=\"-MD -Zi -O2 -Ob1\" -DNDEBUG") - set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Xcompiler=\"-MD -O1 -Ob1\" -DNDEBUG") - endif() + if(WITH_GPU) + set(CMAKE_CUDA_FLAGS_DEBUG "-Xcompiler=\"-MDd -Zi -Ob0 -Od /RTC1\"") + set(CMAKE_CUDA_FLAGS_RELEASE "-Xcompiler=\"-MD -O2 -Ob2\" -DNDEBUG") + set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO + "-Xcompiler=\"-MD -Zi -O2 -Ob1\" -DNDEBUG") + set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Xcompiler=\"-MD -O1 -Ob1\" -DNDEBUG") + endif() - # It can specify CUDA compile flag manualy, - # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous - # because CUDA will update by nvidia, then error will occur. - # Now, it's only used in VS2015 + CUDA:[10.0, 10.2] - set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) + # It can specify CUDA compile flag manualy, + # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous + # because CUDA will update by nvidia, then error will occur. + # Now, it's only used in VS2015 + CUDA:[10.0, 10.2] + set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) endif() diff --git a/cmake/miopen.cmake b/cmake/miopen.cmake index 493c37955f725..392ff0401eaef 100644 --- a/cmake/miopen.cmake +++ b/cmake/miopen.cmake @@ -1,65 +1,77 @@ if(NOT WITH_ROCM) - return() + return() endif() # Now we don't support ROCm on windows if(WIN32) - return() + return() endif() -set(MIOPEN_ROOT ${ROCM_PATH}/miopen CACHE PATH "MIOPEN ROOT") +set(MIOPEN_ROOT + ${ROCM_PATH}/miopen + CACHE PATH "MIOPEN ROOT") -find_path(MIOPEN_INCLUDE_DIR "miopen/miopen.h" - PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/include ${MIOPEN_ROOT}/local/include - $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/include $ENV{MIOPEN_ROOT}/local/include - NO_DEFAULT_PATH -) +find_path( + MIOPEN_INCLUDE_DIR "miopen/miopen.h" + PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/include ${MIOPEN_ROOT}/local/include + $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/include + $ENV{MIOPEN_ROOT}/local/include + NO_DEFAULT_PATH) -find_library(MIOPEN_LIBRARY NAMES "libMIOpen.so" - PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/lib ${MIOPEN_ROOT}/lib64 ${__libpath_hist} - $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/lib $ENV{MIOPEN_ROOT}/lib64 - NO_DEFAULT_PATH - DOC "Path to MIOpen library.") +find_library( + MIOPEN_LIBRARY + NAMES "libMIOpen.so" + PATHS ${MIOPEN_ROOT} + ${MIOPEN_ROOT}/lib + ${MIOPEN_ROOT}/lib64 + ${__libpath_hist} + $ENV{MIOPEN_ROOT} + $ENV{MIOPEN_ROOT}/lib + $ENV{MIOPEN_ROOT}/lib64 + NO_DEFAULT_PATH + DOC "Path to MIOpen library.") if(MIOPEN_INCLUDE_DIR AND MIOPEN_LIBRARY) - set(MIOPEN_FOUND ON) + set(MIOPEN_FOUND ON) else() - set(MIOPEN_FOUND OFF) + set(MIOPEN_FOUND OFF) endif() -macro(find_miopen_version miopen_header_file) - file(READ ${miopen_header_file} MIOPEN_VERSION_FILE_CONTENTS) - get_filename_component(MIOPEN_LIB_PATH ${MIOPEN_LIBRARY} DIRECTORY) +macro(find_miopen_version miopen_header_file) + file(READ ${miopen_header_file} MIOPEN_VERSION_FILE_CONTENTS) + get_filename_component(MIOPEN_LIB_PATH ${MIOPEN_LIBRARY} DIRECTORY) - string(REGEX MATCH "define MIOPEN_VERSION_MAJOR +([0-9]+)" MIOPEN_MAJOR_VERSION - "${MIOPEN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR +([0-9]+)" "\\1" - MIOPEN_MAJOR_VERSION "${MIOPEN_MAJOR_VERSION}") - string(REGEX MATCH "define MIOPEN_VERSION_MINOR +([0-9]+)" MIOPEN_MINOR_VERSION - "${MIOPEN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MIOPEN_VERSION_MINOR +([0-9]+)" "\\1" - MIOPEN_MINOR_VERSION "${MIOPEN_MINOR_VERSION}") - string(REGEX MATCH "define MIOPEN_VERSION_PATCH +([0-9]+)" MIOPEN_PATCH_VERSION - "${MIOPEN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MIOPEN_VERSION_PATCH +([0-9]+)" "\\1" - MIOPEN_PATCH_VERSION "${MIOPEN_PATCH_VERSION}") - string(REGEX MATCH "define MIOPEN_VERSION_TWEAK +([0-9]+)" MIOPEN_TWEAK_VERSION - "${MIOPEN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MIOPEN_VERSION_TWEAK +([0-9]+)" "\\1" - MIOPEN_TWEAK_VERSION "${MIOPEN_TWEAK_VERSION}") + string(REGEX MATCH "define MIOPEN_VERSION_MAJOR +([0-9]+)" + MIOPEN_MAJOR_VERSION "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR +([0-9]+)" "\\1" + MIOPEN_MAJOR_VERSION "${MIOPEN_MAJOR_VERSION}") + string(REGEX MATCH "define MIOPEN_VERSION_MINOR +([0-9]+)" + MIOPEN_MINOR_VERSION "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_MINOR +([0-9]+)" "\\1" + MIOPEN_MINOR_VERSION "${MIOPEN_MINOR_VERSION}") + string(REGEX MATCH "define MIOPEN_VERSION_PATCH +([0-9]+)" + MIOPEN_PATCH_VERSION "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_PATCH +([0-9]+)" "\\1" + MIOPEN_PATCH_VERSION "${MIOPEN_PATCH_VERSION}") + string(REGEX MATCH "define MIOPEN_VERSION_TWEAK +([0-9]+)" + MIOPEN_TWEAK_VERSION "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_TWEAK +([0-9]+)" "\\1" + MIOPEN_TWEAK_VERSION "${MIOPEN_TWEAK_VERSION}") - if(NOT MIOPEN_MAJOR_VERSION) - set(MIOPEN_VERSION "???") - else() - add_definitions("-DMIOPEN_MAJOR_VERSION=\"${MIOPEN_MAJOR_VERSION}\"") - math(EXPR MIOPEN_VERSION - "${MIOPEN_MAJOR_VERSION} * 1000 + + if(NOT MIOPEN_MAJOR_VERSION) + set(MIOPEN_VERSION "???") + else() + add_definitions("-DMIOPEN_MAJOR_VERSION=\"${MIOPEN_MAJOR_VERSION}\"") + math(EXPR MIOPEN_VERSION "${MIOPEN_MAJOR_VERSION} * 1000 + ${MIOPEN_MINOR_VERSION} * 10 + ${MIOPEN_PATCH_VERSION}") - message(STATUS "Current MIOpen header is ${MIOPEN_INCLUDE_DIR}/miopen/miopen.h " - "Current MIOpen version is v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}.${MIOPEN_PATCH_VERSION}. ") - endif() + message( + STATUS + "Current MIOpen header is ${MIOPEN_INCLUDE_DIR}/miopen/miopen.h " + "Current MIOpen version is v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}.${MIOPEN_PATCH_VERSION}. " + ) + endif() endmacro() if(MIOPEN_FOUND) - find_miopen_version(${MIOPEN_INCLUDE_DIR}/miopen/version.h) + find_miopen_version(${MIOPEN_INCLUDE_DIR}/miopen/version.h) endif() diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake index 9124fec0b856a..8ce3cd91ac82a 100644 --- a/cmake/nccl.cmake +++ b/cmake/nccl.cmake @@ -1,55 +1,59 @@ if(NOT WITH_GPU) - return() + return() endif() # Now we don't support NCCL on windows if(WIN32) - return() + return() endif() if(WITH_NCCL) - set(NCCL_ROOT "/usr" CACHE PATH "NCCL ROOT") - find_path(NCCL_INCLUDE_DIR nccl.h - PATHS ${NCCL_ROOT} ${NCCL_ROOT}/include ${NCCL_ROOT}/local/include - $ENV{NCCL_ROOT} $ENV{NCCL_ROOT}/include $ENV{NCCL_ROOT}/local/include - NO_DEFAULT_PATH - ) + set(NCCL_ROOT + "/usr" + CACHE PATH "NCCL ROOT") + find_path( + NCCL_INCLUDE_DIR nccl.h + PATHS ${NCCL_ROOT} ${NCCL_ROOT}/include ${NCCL_ROOT}/local/include + $ENV{NCCL_ROOT} $ENV{NCCL_ROOT}/include $ENV{NCCL_ROOT}/local/include + NO_DEFAULT_PATH) - file(READ ${NCCL_INCLUDE_DIR}/nccl.h NCCL_VERSION_FILE_CONTENTS) + file(READ ${NCCL_INCLUDE_DIR}/nccl.h NCCL_VERSION_FILE_CONTENTS) - string(REGEX MATCH "define NCCL_VERSION_CODE +([0-9]+)" - NCCL_VERSION "${NCCL_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define NCCL_VERSION_CODE +([0-9]+)" "\\1" - NCCL_VERSION "${NCCL_VERSION}") + string(REGEX MATCH "define NCCL_VERSION_CODE +([0-9]+)" NCCL_VERSION + "${NCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define NCCL_VERSION_CODE +([0-9]+)" "\\1" NCCL_VERSION + "${NCCL_VERSION}") - if("${NCCL_VERSION}" GREATER "2000") - message(STATUS "Current NCCL header is ${NCCL_INCLUDE_DIR}/nccl.h. " - "Current NCCL version is v${NCCL_VERSION}. ") - else() - # in old version nccl, it may not define NCCL_VERSION_CODE - string(REGEX MATCH "define NCCL_MAJOR +([0-9]+)" NCCL_MAJOR_VERSION - "${NCCL_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define NCCL_MAJOR +([0-9]+)" "\\1" - NCCL_MAJOR_VERSION "${NCCL_MAJOR_VERSION}") - string(REGEX MATCH "define NCCL_MINOR +([0-9]+)" NCCL_MINOR_VERSION - "${NCCL_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define NCCL_MINOR +([0-9]+)" "\\1" - NCCL_MINOR_VERSION "${NCCL_MINOR_VERSION}") - string(REGEX MATCH "define NCCL_PATCH +([0-9]+)" - NCCL_PATCH_VERSION "${NCCL_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define NCCL_PATCH +([0-9]+)" "\\1" - NCCL_PATCH_VERSION "${NCCL_PATCH_VERSION}") + if("${NCCL_VERSION}" GREATER "2000") + message(STATUS "Current NCCL header is ${NCCL_INCLUDE_DIR}/nccl.h. " + "Current NCCL version is v${NCCL_VERSION}. ") + else() + # in old version nccl, it may not define NCCL_VERSION_CODE + string(REGEX MATCH "define NCCL_MAJOR +([0-9]+)" NCCL_MAJOR_VERSION + "${NCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define NCCL_MAJOR +([0-9]+)" "\\1" NCCL_MAJOR_VERSION + "${NCCL_MAJOR_VERSION}") + string(REGEX MATCH "define NCCL_MINOR +([0-9]+)" NCCL_MINOR_VERSION + "${NCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define NCCL_MINOR +([0-9]+)" "\\1" NCCL_MINOR_VERSION + "${NCCL_MINOR_VERSION}") + string(REGEX MATCH "define NCCL_PATCH +([0-9]+)" NCCL_PATCH_VERSION + "${NCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define NCCL_PATCH +([0-9]+)" "\\1" NCCL_PATCH_VERSION + "${NCCL_PATCH_VERSION}") - if(NOT NCCL_MAJOR_VERSION) - set(NCCL_VERSION "0") - else() - math(EXPR NCCL_VERSION - "${NCCL_MAJOR_VERSION} * 1000 + + if(NOT NCCL_MAJOR_VERSION) + set(NCCL_VERSION "0") + else() + math(EXPR NCCL_VERSION "${NCCL_MAJOR_VERSION} * 1000 + ${NCCL_MINOR_VERSION} * 100 + ${NCCL_PATCH_VERSION}") - endif() - add_definitions("-DNCCL_VERSION_CODE=$NCCL_VERSION") - - message(STATUS "Current NCCL header is ${NCCL_INCLUDE_DIR}/nccl.h. " - "Current NCCL version is v${NCCL_MAJOR_VERSION}.${NCCL_MINOR_VERSION}.${NCCL_PATCH_VERSION} ") endif() + add_definitions("-DNCCL_VERSION_CODE=$NCCL_VERSION") + + message( + STATUS + "Current NCCL header is ${NCCL_INCLUDE_DIR}/nccl.h. " + "Current NCCL version is v${NCCL_MAJOR_VERSION}.${NCCL_MINOR_VERSION}.${NCCL_PATCH_VERSION} " + ) + endif() endif() diff --git a/cmake/neuware.cmake b/cmake/neuware.cmake index a371a0032d991..16dbf16899b5d 100644 --- a/cmake/neuware.cmake +++ b/cmake/neuware.cmake @@ -1,18 +1,18 @@ if(NOT WITH_MLU) - return() + return() endif() if(NOT ENV{NEUWARE_HOME}) - set(NEUWARE_HOME "/usr/local/neuware") + set(NEUWARE_HOME "/usr/local/neuware") else() - set(NEUWARE_HOME $ENV{NEUWARE_HOME}) + set(NEUWARE_HOME $ENV{NEUWARE_HOME}) endif() message(STATUS "NEUWARE_HOME: " ${NEUWARE_HOME}) set(NEUWARE_INCLUDE_DIR ${NEUWARE_HOME}/include) set(NEUWARE_LIB_DIR ${NEUWARE_HOME}/lib64) -INCLUDE_DIRECTORIES(${NEUWARE_INCLUDE_DIR}) +include_directories(${NEUWARE_INCLUDE_DIR}) set(CNNL_LIB ${NEUWARE_LIB_DIR}/libcnnl.so) set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so) @@ -23,10 +23,10 @@ generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake") set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB} ${CNPAPI_LIB}) if(WITH_CNCL) - MESSAGE(STATUS "Compile with CNCL!") - ADD_DEFINITIONS(-DPADDLE_WITH_CNCL) - set(CNCL_LIB ${NEUWARE_LIB_DIR}/libcncl.so) - list(APPEND NEUWARE_LIB_DEPS ${CNCL_LIB}) + message(STATUS "Compile with CNCL!") + add_definitions(-DPADDLE_WITH_CNCL) + set(CNCL_LIB ${NEUWARE_LIB_DIR}/libcncl.so) + list(APPEND NEUWARE_LIB_DEPS ${CNCL_LIB}) endif() -TARGET_LINK_LIBRARIES(neuware_lib ${NEUWARE_LIB_DEPS}) +target_link_libraries(neuware_lib ${NEUWARE_LIB_DEPS}) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 51e4bd3ac41c9..4e0cc1027eff0 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -3,538 +3,611 @@ include(unity_build) set(PART_CUDA_KERNEL_FILES) function(find_register FILENAME PATTERN OUTPUT) -# find the op_name of REGISTER_OPERATOR(op_name, ...), REGISTER_OP_CPU_KERNEL(op_name, ...) , etc. -# set op_name to OUTPUT - set(options "") - set(oneValueArgs "") - set(multiValueArgs "") - file(READ ${FILENAME} CONTENT) - # message ("number of arguments sent to function: ${ARGC}") - # message ("all function arguments: ${ARGV}") - # message("PATTERN ${PATTERN}") - string(REGEX MATCH "${PATTERN}\\([ \t\r\n]*[a-z0-9_]*," register "${CONTENT}") - if (NOT register STREQUAL "") - string(REPLACE "${PATTERN}(" "" register "${register}") - string(REPLACE "," "" register "${register}") - # [ \t\r\n]+ is used for blank characters. - # Here we use '+' instead of '*' since it is a REPLACE operation. - string(REGEX REPLACE "[ \t\r\n]+" "" register "${register}") - endif() - - set(${OUTPUT} ${register} PARENT_SCOPE) + # find the op_name of REGISTER_OPERATOR(op_name, ...), REGISTER_OP_CPU_KERNEL(op_name, ...) , etc. + # set op_name to OUTPUT + set(options "") + set(oneValueArgs "") + set(multiValueArgs "") + file(READ ${FILENAME} CONTENT) + # message ("number of arguments sent to function: ${ARGC}") + # message ("all function arguments: ${ARGV}") + # message("PATTERN ${PATTERN}") + string(REGEX MATCH "${PATTERN}\\([ \t\r\n]*[a-z0-9_]*," register "${CONTENT}") + if(NOT register STREQUAL "") + string(REPLACE "${PATTERN}(" "" register "${register}") + string(REPLACE "," "" register "${register}") + # [ \t\r\n]+ is used for blank characters. + # Here we use '+' instead of '*' since it is a REPLACE operation. + string(REGEX REPLACE "[ \t\r\n]+" "" register "${register}") + endif() + + set(${OUTPUT} + ${register} + PARENT_SCOPE) endfunction() function(op_library TARGET) - # op_library is a function to create op library. The interface is same as - # cc_library. But it handle split GPU/CPU code and link some common library - # for ops. - set(cc_srcs) - set(cu_srcs) - set(hip_srcs) - set(cu_cc_srcs) - set(hip_cc_srcs) - set(xpu_cc_srcs) - set(xpu_kp_cc_srcs) - set(npu_cc_srcs) - set(mlu_cc_srcs) - set(cudnn_cu_cc_srcs) - set(miopen_cu_cc_srcs) - set(cudnn_cu_srcs) - set(miopen_cu_srcs) - set(CUDNN_FILE) - set(MIOPEN_FILE) - set(mkldnn_cc_srcs) - set(MKLDNN_FILE) - set(op_common_deps operator op_registry math_function layer common_infer_shape_functions) - if (WITH_ASCEND_CL) - set(op_common_deps ${op_common_deps} npu_op_runner) - endif() - if (WITH_MLU) - set(op_common_deps ${op_common_deps} mlu_baseop) - endif() + # op_library is a function to create op library. The interface is same as + # cc_library. But it handle split GPU/CPU code and link some common library + # for ops. + set(cc_srcs) + set(cu_srcs) + set(hip_srcs) + set(cu_cc_srcs) + set(hip_cc_srcs) + set(xpu_cc_srcs) + set(xpu_kp_cc_srcs) + set(npu_cc_srcs) + set(mlu_cc_srcs) + set(cudnn_cu_cc_srcs) + set(miopen_cu_cc_srcs) + set(cudnn_cu_srcs) + set(miopen_cu_srcs) + set(CUDNN_FILE) + set(MIOPEN_FILE) + set(mkldnn_cc_srcs) + set(MKLDNN_FILE) + set(op_common_deps operator op_registry math_function layer + common_infer_shape_functions) + if(WITH_ASCEND_CL) + set(op_common_deps ${op_common_deps} npu_op_runner) + endif() + if(WITH_MLU) + set(op_common_deps ${op_common_deps} mlu_baseop) + endif() - # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build. - set(options UNITY) - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - set(pybind_flag 0) - cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) + # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build. + set(options UNITY) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + set(pybind_flag 0) + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) - list(LENGTH op_library_SRCS op_library_SRCS_len) - if (${op_library_SRCS_len} EQUAL 0) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) - list(APPEND cc_srcs ${TARGET}.cc) - endif() - if(WITH_GPU) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) - list(APPEND cu_cc_srcs ${TARGET}.cu.cc) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) - list(APPEND cu_srcs ${TARGET}.cu) - endif() - # rename in KP: .kps -> .cu - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) - file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) - file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) - list(APPEND cu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) - endif() - if (WITH_NV_JETSON) - list(REMOVE_ITEM cu_srcs "decode_jpeg_op.cu") - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) - set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu - ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) - list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) - endif() - string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc) - list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu) - list(APPEND cudnn_cu_srcs ${CUDNN_FILE}.cu) - endif() - endif() - if(WITH_ROCM) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) - list(APPEND hip_cc_srcs ${TARGET}.cu.cc) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) - list(APPEND hip_srcs ${TARGET}.cu) - endif() - # rename in KP: .kps -> .cu - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) - file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) - file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) - list(APPEND hip_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) - set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu - ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) - list(APPEND hip_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) - endif() - string(REPLACE "_op" "_cudnn_op" MIOPEN_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.cu.cc) - list(APPEND miopen_cu_cc_srcs ${MIOPEN_FILE}.cu.cc) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.cu) - list(APPEND miopen_cu_srcs ${MIOPEN_FILE}.cu) - endif() - endif() - if(WITH_MKLDNN) - string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc) - list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc) - endif() - endif() - if(WITH_XPU) - string(REPLACE "_op" "_op_xpu" XPU_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_FILE}.cc) - list(APPEND xpu_cc_srcs ${XPU_FILE}.cc) - endif() - endif() - if(WITH_XPU_KP) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu) - list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) - list(APPEND xpu_kp_cc_srcs ${TARGET}.kps) - endif() - endif() - if(WITH_ASCEND_CL) - string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc) - list(APPEND npu_cc_srcs ${NPU_FILE}.cc) - endif() - endif() - if(WITH_MLU) - string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc) - list(APPEND mlu_cc_srcs ${MLU_FILE}.cc) - endif() - endif() - else() - foreach(src ${op_library_SRCS}) - if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$") - list(APPEND miopen_cu_srcs ${src}) - elseif(WITH_ROCM AND ${src} MATCHES ".*\\.cu$") - list(APPEND hip_srcs ${src}) - elseif(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu.cc$") - list(APPEND miopen_cu_cc_srcs ${src}) - elseif(WITH_ROCM AND ${src} MATCHES ".*\\.cu.cc$") - list(APPEND hip_cc_srcs ${src}) - elseif(WITH_GPU AND ${src} MATCHES ".*_cudnn_op.cu$") - list(APPEND cudnn_cu_srcs ${src}) - elseif (WITH_GPU AND ${src} MATCHES ".*\\.cu$") - list(APPEND cu_srcs ${src}) - elseif(WITH_GPU AND ${src} MATCHES ".*_cudnn_op.cu.cc$") - list(APPEND cudnn_cu_cc_srcs ${src}) - elseif(WITH_GPU AND ${src} MATCHES ".*\\.cu.cc$") - list(APPEND cu_cc_srcs ${src}) - elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$") - list(APPEND mkldnn_cc_srcs ${src}) - elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$") - list(APPEND xpu_cc_srcs ${src}) - elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$") - list(APPEND xpu_kp_cc_srcs ${src}) - elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$") - list(APPEND xpu_kp_cc_srcs ${src}) - elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$") - list(APPEND npu_cc_srcs ${src}) - elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$") - list(APPEND mlu_cc_srcs ${src}) - elseif(${src} MATCHES ".*\\.cc$") - list(APPEND cc_srcs ${src}) - else() - message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu or .xpu") - endif() - endforeach() + list(LENGTH op_library_SRCS op_library_SRCS_len) + if(${op_library_SRCS_len} EQUAL 0) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + list(APPEND cc_srcs ${TARGET}.cc) + endif() + if(WITH_GPU) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) + list(APPEND cu_cc_srcs ${TARGET}.cu.cc) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${TARGET}.cu) + endif() + # rename in KP: .kps -> .cu + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) + file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps + ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + endif() + if(WITH_NV_JETSON) + list(REMOVE_ITEM cu_srcs "decode_jpeg_op.cu") + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + set(PART_CUDA_KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu + ${PART_CUDA_KERNEL_FILES} + PARENT_SCOPE) + list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + endif() + string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}") + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc) + list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu) + list(APPEND cudnn_cu_srcs ${CUDNN_FILE}.cu) + endif() + endif() + if(WITH_ROCM) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) + list(APPEND hip_cc_srcs ${TARGET}.cu.cc) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) + list(APPEND hip_srcs ${TARGET}.cu) + endif() + # rename in KP: .kps -> .cu + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) + file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps + ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + list(APPEND hip_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + set(PART_CUDA_KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu + ${PART_CUDA_KERNEL_FILES} + PARENT_SCOPE) + list(APPEND hip_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + endif() + string(REPLACE "_op" "_cudnn_op" MIOPEN_FILE "${TARGET}") + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.cu.cc) + list(APPEND miopen_cu_cc_srcs ${MIOPEN_FILE}.cu.cc) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.cu) + list(APPEND miopen_cu_srcs ${MIOPEN_FILE}.cu) + endif() + endif() + if(WITH_MKLDNN) + string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc) + list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc) + endif() + endif() + if(WITH_XPU) + string(REPLACE "_op" "_op_xpu" XPU_FILE "${TARGET}") + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_FILE}.cc) + list(APPEND xpu_cc_srcs ${XPU_FILE}.cc) + endif() + endif() + if(WITH_XPU_KP) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu) + list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) + list(APPEND xpu_kp_cc_srcs ${TARGET}.kps) + endif() + endif() + if(WITH_ASCEND_CL) + string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}") + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc) + list(APPEND npu_cc_srcs ${NPU_FILE}.cc) + endif() endif() - - list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) - list(LENGTH xpu_kp_cc_srcs xpu_kp_cc_srcs_len) - list(LENGTH cc_srcs cc_srcs_len) - if (${cc_srcs_len} EQUAL 0) - message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") + if(WITH_MLU) + string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}") + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc) + list(APPEND mlu_cc_srcs ${MLU_FILE}.cc) + endif() endif() - if (WIN32) + else() + foreach(src ${op_library_SRCS}) + if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$") + list(APPEND miopen_cu_srcs ${src}) + elseif(WITH_ROCM AND ${src} MATCHES ".*\\.cu$") + list(APPEND hip_srcs ${src}) + elseif(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu.cc$") + list(APPEND miopen_cu_cc_srcs ${src}) + elseif(WITH_ROCM AND ${src} MATCHES ".*\\.cu.cc$") + list(APPEND hip_cc_srcs ${src}) + elseif(WITH_GPU AND ${src} MATCHES ".*_cudnn_op.cu$") + list(APPEND cudnn_cu_srcs ${src}) + elseif(WITH_GPU AND ${src} MATCHES ".*\\.cu$") + list(APPEND cu_srcs ${src}) + elseif(WITH_GPU AND ${src} MATCHES ".*_cudnn_op.cu.cc$") + list(APPEND cudnn_cu_cc_srcs ${src}) + elseif(WITH_GPU AND ${src} MATCHES ".*\\.cu.cc$") + list(APPEND cu_cc_srcs ${src}) + elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$") + list(APPEND mkldnn_cc_srcs ${src}) + elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$") + list(APPEND xpu_cc_srcs ${src}) + elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$") + list(APPEND xpu_kp_cc_srcs ${src}) + elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$") + list(APPEND xpu_kp_cc_srcs ${src}) + elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$") + list(APPEND npu_cc_srcs ${src}) + elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$") + list(APPEND mlu_cc_srcs ${src}) + elseif(${src} MATCHES ".*\\.cc$") + list(APPEND cc_srcs ${src}) + else() + message( + FATAL_ERROR + "${TARGET} Source file ${src} should only be .cc or .cu or .xpu") + endif() + endforeach() + endif() + + list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) + list(LENGTH xpu_kp_cc_srcs xpu_kp_cc_srcs_len) + list(LENGTH cc_srcs cc_srcs_len) + if(${cc_srcs_len} EQUAL 0) + message( + FATAL_ERROR + "The op library ${TARGET} should contains at least one .cc file") + endif() + if(WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") - if ("${TARGET}" STREQUAL "${windows_unsupport_op}") - return() - endif() + if("${TARGET}" STREQUAL "${windows_unsupport_op}") + return() + endif() endforeach() - endif(WIN32) + endif(WIN32) + + # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. + if(WITH_UNITY_BUILD AND op_library_UNITY) + # Generate the unity target name by the directory where source files located. + string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET + ${CMAKE_CURRENT_SOURCE_DIR}) + string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET}) + set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity") + if(NOT ${UNITY_TARGET} IN_LIST OP_LIBRARY) + set(OP_LIBRARY + ${UNITY_TARGET} ${OP_LIBRARY} + CACHE INTERNAL "op libs") + endif() + else() + set(OP_LIBRARY + ${TARGET} ${OP_LIBRARY} + CACHE INTERNAL "op libs") + endif() + list(LENGTH op_library_DEPS op_library_DEPS_len) + if(${op_library_DEPS_len} GREATER 0) + set(DEPS_OPS + ${TARGET} ${DEPS_OPS} + PARENT_SCOPE) + endif() + if(WITH_GPU) # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) - # Generate the unity target name by the directory where source files located. - string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET ${CMAKE_CURRENT_SOURCE_DIR}) - string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET}) - set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity") - if(NOT ${UNITY_TARGET} IN_LIST OP_LIBRARY) - set(OP_LIBRARY ${UNITY_TARGET} ${OP_LIBRARY} CACHE INTERNAL "op libs") - endif() + # Combine the cc and cu source files. + compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${cu_cc_srcs} + ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs}) + compose_unity_target_sources(${UNITY_TARGET} cu ${cudnn_cu_srcs} + ${cu_srcs}) + if(TARGET ${UNITY_TARGET}) + # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. + target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources} + ${unity_target_cu_sources}) + else() + # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. + nv_library( + ${UNITY_TARGET} + SRCS ${unity_target_cc_sources} ${unity_target_cu_sources} + DEPS ${op_library_DEPS} ${op_common_deps}) + endif() + # Add alias library to handle dependencies. + add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() - set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} CACHE INTERNAL "op libs") + nv_library( + ${TARGET} + SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cudnn_cu_srcs} + ${mkldnn_cc_srcs} ${cu_srcs} + DEPS ${op_library_DEPS} ${op_common_deps}) endif() - - list(LENGTH op_library_DEPS op_library_DEPS_len) - if (${op_library_DEPS_len} GREATER 0) - set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) + elseif(WITH_ROCM) + list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc") + list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc") + list(REMOVE_ITEM hip_srcs "cholesky_op.cu") + list(REMOVE_ITEM hip_srcs "cholesky_solve_op.cu") + list(REMOVE_ITEM hip_srcs "lu_op.cu") + list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu") + list(REMOVE_ITEM hip_srcs "svd_op.cu") + list(REMOVE_ITEM hip_srcs "eigvalsh_op.cu") + list(REMOVE_ITEM hip_srcs "qr_op.cu") + list(REMOVE_ITEM hip_srcs "eigh_op.cu") + list(REMOVE_ITEM hip_srcs "lstsq_op.cu") + list(REMOVE_ITEM hip_srcs "multinomial_op.cu") + list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu") + hip_library( + ${TARGET} + SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} + ${mkldnn_cc_srcs} ${hip_srcs} + DEPS ${op_library_DEPS} ${op_common_deps}) + elseif(WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) + xpu_library( + ${TARGET} + SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs} + DEPS ${op_library_DEPS} ${op_common_deps}) + else() + # deal with CANN version control while registering NPU operators before build + if(WITH_ASCEND_CL) + if(CANN_VERSION LESS 504000) + list(REMOVE_ITEM npu_cc_srcs "multinomial_op_npu.cc") + list(REMOVE_ITEM npu_cc_srcs "take_along_axis_op_npu.cc") + endif() endif() - if (WITH_GPU) - # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. - if(WITH_UNITY_BUILD AND op_library_UNITY) - # Combine the cc and cu source files. - compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs}) - compose_unity_target_sources(${UNITY_TARGET} cu ${cudnn_cu_srcs} ${cu_srcs}) - if(TARGET ${UNITY_TARGET}) - # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. - target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources} ${unity_target_cu_sources}) - else() - # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. - nv_library(${UNITY_TARGET} SRCS ${unity_target_cc_sources} ${unity_target_cu_sources} DEPS ${op_library_DEPS} ${op_common_deps}) - endif() - # Add alias library to handle dependencies. - add_library(${TARGET} ALIAS ${UNITY_TARGET}) - else() - nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cudnn_cu_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} - ${op_common_deps}) - endif() - elseif (WITH_ROCM) - list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc") - list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc") - list(REMOVE_ITEM hip_srcs "cholesky_op.cu") - list(REMOVE_ITEM hip_srcs "cholesky_solve_op.cu") - list(REMOVE_ITEM hip_srcs "lu_op.cu") - list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu") - list(REMOVE_ITEM hip_srcs "svd_op.cu") - list(REMOVE_ITEM hip_srcs "eigvalsh_op.cu") - list(REMOVE_ITEM hip_srcs "qr_op.cu") - list(REMOVE_ITEM hip_srcs "eigh_op.cu") - list(REMOVE_ITEM hip_srcs "lstsq_op.cu") - list(REMOVE_ITEM hip_srcs "multinomial_op.cu") - list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu") - hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS} - ${op_common_deps}) - elseif (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) - xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) + # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. + if(WITH_UNITY_BUILD AND op_library_UNITY) + # Combine the cc source files. + compose_unity_target_sources( + ${UNITY_TARGET} + cc + ${cc_srcs} + ${mkldnn_cc_srcs} + ${xpu_cc_srcs} + ${npu_cc_srcs} + ${mlu_cc_srcs}) + if(TARGET ${UNITY_TARGET}) + # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. + target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources}) + else() + # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. + cc_library( + ${UNITY_TARGET} + SRCS ${unity_target_cc_sources} + DEPS ${op_library_DEPS} ${op_common_deps}) + endif() + # Add alias library to handle dependencies. + add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() - # deal with CANN version control while registering NPU operators before build - if (WITH_ASCEND_CL) - if (CANN_VERSION LESS 504000) - list(REMOVE_ITEM npu_cc_srcs "multinomial_op_npu.cc") - list(REMOVE_ITEM npu_cc_srcs "take_along_axis_op_npu.cc") - endif() - endif() - # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. - if(WITH_UNITY_BUILD AND op_library_UNITY) - # Combine the cc source files. - compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} ${mlu_cc_srcs}) - if(TARGET ${UNITY_TARGET}) - # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. - target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources}) - else() - # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. - cc_library(${UNITY_TARGET} SRCS ${unity_target_cc_sources} DEPS ${op_library_DEPS} ${op_common_deps}) - endif() - # Add alias library to handle dependencies. - add_library(${TARGET} ALIAS ${UNITY_TARGET}) - else() - cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} ${mlu_cc_srcs} DEPS ${op_library_DEPS} - ${op_common_deps}) - endif() + cc_library( + ${TARGET} + SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} + ${mlu_cc_srcs} + DEPS ${op_library_DEPS} ${op_common_deps}) endif() + endif() - list(LENGTH cu_srcs cu_srcs_len) - list(LENGTH hip_srcs hip_srcs_len) - list(LENGTH cu_cc_srcs cu_cc_srcs_len) - list(LENGTH hip_cc_srcs hip_cc_srcs_len) - list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) - list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) - list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len) - list(LENGTH npu_cc_srcs npu_cc_srcs_len) - list(LENGTH mlu_cc_srcs mlu_cc_srcs_len) + list(LENGTH cu_srcs cu_srcs_len) + list(LENGTH hip_srcs hip_srcs_len) + list(LENGTH cu_cc_srcs cu_cc_srcs_len) + list(LENGTH hip_cc_srcs hip_cc_srcs_len) + list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) + list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) + list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len) + list(LENGTH npu_cc_srcs npu_cc_srcs_len) + list(LENGTH mlu_cc_srcs mlu_cc_srcs_len) - # Define operators that don't need pybind here. - foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op" - "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op") + # Define operators that don't need pybind here. + foreach( + manual_pybind_op + "compare_all_op" + "compare_op" + "logical_op" + "bitwise_op" + "nccl_op" + "tensor_array_read_write_op" + "tensorrt_engine_op" + "conv_fusion_op") - if ("${TARGET}" STREQUAL "${manual_pybind_op}") - set(pybind_flag 1) - endif() - endforeach() + if("${TARGET}" STREQUAL "${manual_pybind_op}") + set(pybind_flag 1) + endif() + endforeach() - # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. - # Note that it's enough to just adding one operator to pybind in a *_op.cc file. - # And for detail pybind information, please see generated paddle/pybind/pybind.h. - set(ORIGINAL_TARGET ${TARGET}) - string(REGEX REPLACE "_op" "" TARGET "${TARGET}") + # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. + # Note that it's enough to just adding one operator to pybind in a *_op.cc file. + # And for detail pybind information, please see generated paddle/pybind/pybind.h. + set(ORIGINAL_TARGET ${TARGET}) + string(REGEX REPLACE "_op" "" TARGET "${TARGET}") - foreach(cc_src ${cc_srcs}) - # pybind USE_OP_ITSELF - set(op_name "") - find_register(${cc_src} "REGISTER_OPERATOR" op_name) - if(NOT ${op_name} EQUAL "") - file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n") - # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn - set(TARGET ${op_name}) - set(pybind_flag 1) - endif() - - set(op_name "") - find_register(${cc_src} "REGISTER_OP_WITHOUT_GRADIENT" op_name) - if(NOT ${op_name} EQUAL "") - file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n") - # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn - set(TARGET ${op_name}) - set(pybind_flag 1) - endif() + foreach(cc_src ${cc_srcs}) + # pybind USE_OP_ITSELF + set(op_name "") + find_register(${cc_src} "REGISTER_OPERATOR" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n") + # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn + set(TARGET ${op_name}) + set(pybind_flag 1) + endif() - # pybind USE_OP_DEVICE_KERNEL for CPU - set(op_name "") - find_register(${cc_src} "REGISTER_OP_CPU_KERNEL" op_name) - if(NOT ${op_name} EQUAL "") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CPU);\n") - # why change TARGET here? - # when building padle with on_infer, the REGISTER_OPERATOR(*_grad) will be removed before compiling (see details in remove_grad_op_and_kernel.py) - # in elementwise_op.cc, it will find REGISTER_OPERATOR(grad_add) and set TARGET to grad_add - # and, in the following "mkldnn" part, it will add USE_OP_DEVICE_KERNEL(grad_add, MKLDNN) to pybind.h - # however, grad_add has no mkldnn kernel. - set(TARGET ${op_name}) - set(pybind_flag 1) - endif() - endforeach() + set(op_name "") + find_register(${cc_src} "REGISTER_OP_WITHOUT_GRADIENT" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n") + # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn + set(TARGET ${op_name}) + set(pybind_flag 1) + endif() - # pybind USE_OP_DEVICE_KERNEL for CUDA - list (APPEND cu_srcs ${cu_cc_srcs}) - # message("cu_srcs ${cu_srcs}") - foreach(cu_src ${cu_srcs}) - set(op_name "") - find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name) - if(NOT ${op_name} EQUAL "") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n") - set(pybind_flag 1) - endif() - endforeach() + # pybind USE_OP_DEVICE_KERNEL for CPU + set(op_name "") + find_register(${cc_src} "REGISTER_OP_CPU_KERNEL" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CPU);\n") + # why change TARGET here? + # when building padle with on_infer, the REGISTER_OPERATOR(*_grad) will be removed before compiling (see details in remove_grad_op_and_kernel.py) + # in elementwise_op.cc, it will find REGISTER_OPERATOR(grad_add) and set TARGET to grad_add + # and, in the following "mkldnn" part, it will add USE_OP_DEVICE_KERNEL(grad_add, MKLDNN) to pybind.h + # however, grad_add has no mkldnn kernel. + set(TARGET ${op_name}) + set(pybind_flag 1) + endif() + endforeach() - # pybind USE_OP_DEVICE_KERNEL for ROCm - list (APPEND hip_srcs ${hip_cc_srcs}) - # message("hip_srcs ${hip_srcs}") - foreach(hip_src ${hip_srcs}) - set(op_name "") - find_register(${hip_src} "REGISTER_OP_CUDA_KERNEL" op_name) - if(NOT ${op_name} EQUAL "") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n") - set(pybind_flag 1) - endif() - endforeach() + # pybind USE_OP_DEVICE_KERNEL for CUDA + list(APPEND cu_srcs ${cu_cc_srcs}) + # message("cu_srcs ${cu_srcs}") + foreach(cu_src ${cu_srcs}) + set(op_name "") + find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n") + set(pybind_flag 1) + endif() + endforeach() - # pybind USE_OP_DEVICE_KERNEL for CUDNN/MIOPEN - list(APPEND cudnn_cu_srcs ${cudnn_cu_cc_srcs}) - list(APPEND cudnn_cu_srcs ${miopen_cu_cc_srcs}) - list(APPEND cudnn_cu_srcs ${miopen_cu_srcs}) - list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len) - #message("cudnn_cu_srcs ${cudnn_cu_srcs}") - if(${cudnn_cu_srcs_len} GREATER 0 AND ${ORIGINAL_TARGET} STREQUAL "activation_op") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n") - else() - foreach(cudnn_src ${cudnn_cu_srcs}) - set(op_name "") - find_register(${cudnn_src} "REGISTER_OP_KERNEL" op_name) - if(NOT ${op_name} EQUAL "") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDNN);\n") - set(pybind_flag 1) - endif() - endforeach() + # pybind USE_OP_DEVICE_KERNEL for ROCm + list(APPEND hip_srcs ${hip_cc_srcs}) + # message("hip_srcs ${hip_srcs}") + foreach(hip_src ${hip_srcs}) + set(op_name "") + find_register(${hip_src} "REGISTER_OP_CUDA_KERNEL" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n") + set(pybind_flag 1) endif() + endforeach() + # pybind USE_OP_DEVICE_KERNEL for CUDNN/MIOPEN + list(APPEND cudnn_cu_srcs ${cudnn_cu_cc_srcs}) + list(APPEND cudnn_cu_srcs ${miopen_cu_cc_srcs}) + list(APPEND cudnn_cu_srcs ${miopen_cu_srcs}) + list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len) + #message("cudnn_cu_srcs ${cudnn_cu_srcs}") + if(${cudnn_cu_srcs_len} GREATER 0 AND ${ORIGINAL_TARGET} STREQUAL + "activation_op") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n") + else() + foreach(cudnn_src ${cudnn_cu_srcs}) + set(op_name "") + find_register(${cudnn_src} "REGISTER_OP_KERNEL" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDNN);\n") + set(pybind_flag 1) + endif() + endforeach() + endif() - if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) + if(WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) if(${ORIGINAL_TARGET} STREQUAL "activation_op") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, XPU);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, XPU);\n") else() - foreach(xpu_src ${xpu_cc_srcs}) + foreach(xpu_src ${xpu_cc_srcs}) set(op_name "") find_register(${xpu_src} "REGISTER_OP_XPU_KERNEL" op_name) if(NOT ${op_name} EQUAL "") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, XPU);\n") - set(pybind_flag 1) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, XPU);\n") + set(pybind_flag 1) else() - find_register(${xpu_src} "REGISTER_OP_XPU_KERNEL_FUNCTOR" op_name) - if(NOT ${op_name} EQUAL "") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, XPU);\n") - set(pybind_flag 1) - endif() - endif() - endforeach() - endif() - endif() - - # pybind USE_OP_DEVICE_KERNEL for XPU KP - if (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) - foreach(xpu_kp_src ${xpu_kp_cc_srcs}) - set(op_name "") - find_register(${xpu_kp_src} "REGISTER_OP_KERNEL" op_name) - if(NOT ${op_name} EQUAL "") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, KP);\n") - message(STATUS "Building KP Target: ${op_name}") + find_register(${xpu_src} "REGISTER_OP_XPU_KERNEL_FUNCTOR" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} + "USE_OP_DEVICE_KERNEL(${op_name}, XPU);\n") set(pybind_flag 1) + endif() endif() - endforeach() + endforeach() endif() + endif() - # pybind USE_OP_DEVICE_KERNEL for NPU - if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0) - foreach(npu_src ${npu_cc_srcs}) - set(op_name "") - find_register(${npu_src} "REGISTER_OP_NPU_KERNEL" op_name) - if(NOT ${op_name} EQUAL "") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, NPU);\n") - set(pybind_flag 1) - endif() - endforeach() - endif() + # pybind USE_OP_DEVICE_KERNEL for XPU KP + if(WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) + foreach(xpu_kp_src ${xpu_kp_cc_srcs}) + set(op_name "") + find_register(${xpu_kp_src} "REGISTER_OP_KERNEL" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, KP);\n") + message(STATUS "Building KP Target: ${op_name}") + set(pybind_flag 1) + endif() + endforeach() + endif() - # pybind USE_OP_DEVICE_KERNEL for MLU - if (WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0) - foreach(mlu_src ${mlu_cc_srcs}) - set(op_name "") - find_register(${mlu_src} "REGISTER_OP_MLU_KERNEL" op_name) - if(NOT ${op_name} EQUAL "") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MLU);\n") - set(pybind_flag 1) - endif() - endforeach() - endif() + # pybind USE_OP_DEVICE_KERNEL for NPU + if(WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0) + foreach(npu_src ${npu_cc_srcs}) + set(op_name "") + find_register(${npu_src} "REGISTER_OP_NPU_KERNEL" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, NPU);\n") + set(pybind_flag 1) + endif() + endforeach() + endif() - # pybind USE_OP_DEVICE_KERNEL for MKLDNN - if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) - # Append first implemented MKLDNN activation operator - if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") - elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n") - elseif(${MKLDNN_FILE} STREQUAL "transpose_mkldnn_op") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, FP32);\n") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, S8);\n") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, U8);\n") - elseif(${MKLDNN_FILE} STREQUAL "fc_mkldnn_op") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, FP32);\n") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, S8);\n") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, U8);\n") - else() - foreach(mkldnn_src ${mkldnn_cc_srcs}) + # pybind USE_OP_DEVICE_KERNEL for MLU + if(WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0) + foreach(mlu_src ${mlu_cc_srcs}) + set(op_name "") + find_register(${mlu_src} "REGISTER_OP_MLU_KERNEL" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MLU);\n") + set(pybind_flag 1) + endif() + endforeach() + endif() + + # pybind USE_OP_DEVICE_KERNEL for MKLDNN + if(WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) + # Append first implemented MKLDNN activation operator + if(${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") + elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") + file(APPEND ${pybind_file} + "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n") + file(APPEND ${pybind_file} + "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n") + file(APPEND ${pybind_file} + "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n") + elseif(${MKLDNN_FILE} STREQUAL "transpose_mkldnn_op") + file(APPEND ${pybind_file} + "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, FP32);\n") + file(APPEND ${pybind_file} + "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, S8);\n") + file(APPEND ${pybind_file} + "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, U8);\n") + elseif(${MKLDNN_FILE} STREQUAL "fc_mkldnn_op") + file(APPEND ${pybind_file} + "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, FP32);\n") + file(APPEND ${pybind_file} + "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, S8);\n") + file(APPEND ${pybind_file} + "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, U8);\n") + else() + foreach(mkldnn_src ${mkldnn_cc_srcs}) set(op_name "") find_register(${mkldnn_src} "REGISTER_OP_KERNEL" op_name) if(NOT ${op_name} EQUAL "") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MKLDNN);\n") - set(pybind_flag 1) + file(APPEND ${pybind_file} + "USE_OP_DEVICE_KERNEL(${op_name}, MKLDNN);\n") + set(pybind_flag 1) endif() - endforeach() - endif() + endforeach() endif() + endif() - # pybind USE_NO_KERNEL_OP - # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel - string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") - string(REPLACE "_op" "" TARGET "${TARGET}") - if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") - file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n") - set(pybind_flag 1) - endif() + # pybind USE_NO_KERNEL_OP + # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel + string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") + string(REPLACE "_op" "" TARGET "${TARGET}") + if(${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") + file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n") + set(pybind_flag 1) + endif() - # pybind USE_OP - if (${pybind_flag} EQUAL 0) - # NOTE(*): activation use macro to regist the kernels, set use_op manually. - if(${TARGET} STREQUAL "activation") - file(APPEND ${pybind_file} "USE_OP_ITSELF(relu);\n") - elseif(${TARGET} STREQUAL "fake_dequantize") - file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") - elseif(${TARGET} STREQUAL "fake_quantize") - file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n") - elseif(${TARGET} STREQUAL "tensorrt_engine_op") - message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference") - else() - file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") - endif() + # pybind USE_OP + if(${pybind_flag} EQUAL 0) + # NOTE(*): activation use macro to regist the kernels, set use_op manually. + if(${TARGET} STREQUAL "activation") + file(APPEND ${pybind_file} "USE_OP_ITSELF(relu);\n") + elseif(${TARGET} STREQUAL "fake_dequantize") + file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") + elseif(${TARGET} STREQUAL "fake_quantize") + file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n") + elseif(${TARGET} STREQUAL "tensorrt_engine_op") + message( + STATUS + "Pybind skips [tensorrt_engine_op], for this OP is only used in inference" + ) + else() + file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") endif() + endif() endfunction() function(register_operators) - set(options "") - set(oneValueArgs "") - set(multiValueArgs EXCLUDES DEPS) - cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") - string(REPLACE "_mkldnn" "" OPS "${OPS}") - string(REPLACE "_xpu" "" OPS "${OPS}") - string(REPLACE "_npu" "" OPS "${OPS}") - string(REPLACE "_mlu" "" OPS "${OPS}") - string(REPLACE ".cc" "" OPS "${OPS}") - list(REMOVE_DUPLICATES OPS) - list(LENGTH register_operators_DEPS register_operators_DEPS_len) + set(options "") + set(oneValueArgs "") + set(multiValueArgs EXCLUDES DEPS) + cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + file( + GLOB OPS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "*_op.cc") + string(REPLACE "_mkldnn" "" OPS "${OPS}") + string(REPLACE "_xpu" "" OPS "${OPS}") + string(REPLACE "_npu" "" OPS "${OPS}") + string(REPLACE "_mlu" "" OPS "${OPS}") + string(REPLACE ".cc" "" OPS "${OPS}") + list(REMOVE_DUPLICATES OPS) + list(LENGTH register_operators_DEPS register_operators_DEPS_len) - foreach(src ${OPS}) - list(FIND register_operators_EXCLUDES ${src} _index) - if (${_index} EQUAL -1) - if (${register_operators_DEPS_len} GREATER 0) - op_library(${src} UNITY DEPS ${register_operators_DEPS}) - else() - op_library(${src} UNITY) - endif() - endif() - endforeach() + foreach(src ${OPS}) + list(FIND register_operators_EXCLUDES ${src} _index) + if(${_index} EQUAL -1) + if(${register_operators_DEPS_len} GREATER 0) + op_library(${src} UNITY DEPS ${register_operators_DEPS}) + else() + op_library(${src} UNITY) + endif() + endif() + endforeach() - # Complete the processing of `UNITY_TARGET`. - if(WITH_UNITY_BUILD) - finish_unity_target(cc) - if(WITH_GPU) - finish_unity_target(cu) - endif() + # Complete the processing of `UNITY_TARGET`. + if(WITH_UNITY_BUILD) + finish_unity_target(cc) + if(WITH_GPU) + finish_unity_target(cu) endif() + endif() endfunction() diff --git a/cmake/phi.cmake b/cmake/phi.cmake index f147ef3a586ed..4555d892f11ce 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -13,366 +13,485 @@ # limitations under the License. function(generate_unify_header DIR_NAME) - set(options "") - set(oneValueArgs HEADER_NAME SKIP_SUFFIX) - set(multiValueArgs "") - cmake_parse_arguments(generate_unify_header "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) + set(options "") + set(oneValueArgs HEADER_NAME SKIP_SUFFIX) + set(multiValueArgs "") + cmake_parse_arguments(generate_unify_header "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) - # get header name and suffix - set(header_name "${DIR_NAME}") - list(LENGTH generate_unify_header_HEADER_NAME generate_unify_header_HEADER_NAME_len) - if(${generate_unify_header_HEADER_NAME_len} GREATER 0) - set(header_name "${generate_unify_header_HEADER_NAME}") - endif() - set(skip_suffix "") - list(LENGTH generate_unify_header_SKIP_SUFFIX generate_unify_header_SKIP_SUFFIX_len) - if(${generate_unify_header_SKIP_SUFFIX_len} GREATER 0) - set(skip_suffix "${generate_unify_header_SKIP_SUFFIX}") - endif() + # get header name and suffix + set(header_name "${DIR_NAME}") + list(LENGTH generate_unify_header_HEADER_NAME + generate_unify_header_HEADER_NAME_len) + if(${generate_unify_header_HEADER_NAME_len} GREATER 0) + set(header_name "${generate_unify_header_HEADER_NAME}") + endif() + set(skip_suffix "") + list(LENGTH generate_unify_header_SKIP_SUFFIX + generate_unify_header_SKIP_SUFFIX_len) + if(${generate_unify_header_SKIP_SUFFIX_len} GREATER 0) + set(skip_suffix "${generate_unify_header_SKIP_SUFFIX}") + endif() - # generate target header file - set(header_file ${CMAKE_CURRENT_SOURCE_DIR}/include/${header_name}.h) - file(WRITE ${header_file} "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n") + # generate target header file + set(header_file ${CMAKE_CURRENT_SOURCE_DIR}/include/${header_name}.h) + file( + WRITE ${header_file} + "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n" + ) - # get all top-level headers and write into header file - file(GLOB HEADERS "${CMAKE_CURRENT_SOURCE_DIR}\/${DIR_NAME}\/*.h") - foreach(header ${HEADERS}) - if("${skip_suffix}" STREQUAL "") - string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header "${header}") - file(APPEND ${header_file} "#include \"${header}\"\n") - else() - string(FIND "${header}" "${skip_suffix}.h" skip_suffix_found) - if(${skip_suffix_found} EQUAL -1) - string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header "${header}") - file(APPEND ${header_file} "#include \"${header}\"\n") - endif() - endif() - endforeach() - # append header into extension.h - string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header_file "${header_file}") - file(APPEND ${phi_extension_header_file} "#include \"${header_file}\"\n") + # get all top-level headers and write into header file + file(GLOB HEADERS "${CMAKE_CURRENT_SOURCE_DIR}\/${DIR_NAME}\/*.h") + foreach(header ${HEADERS}) + if("${skip_suffix}" STREQUAL "") + string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header "${header}") + file(APPEND ${header_file} "#include \"${header}\"\n") + else() + string(FIND "${header}" "${skip_suffix}.h" skip_suffix_found) + if(${skip_suffix_found} EQUAL -1) + string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header "${header}") + file(APPEND ${header_file} "#include \"${header}\"\n") + endif() + endif() + endforeach() + # append header into extension.h + string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header_file "${header_file}") + file(APPEND ${phi_extension_header_file} "#include \"${header_file}\"\n") endfunction() # call kernel_declare need to make sure whether the target of input exists function(kernel_declare TARGET_LIST) - foreach(kernel_path ${TARGET_LIST}) - file(READ ${kernel_path} kernel_impl) - string(REGEX MATCH "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*,[ \t\r\n\/]*[a-z0-9_]*" first_registry "${kernel_impl}") - if (NOT first_registry STREQUAL "") - # some gpu kernel only can run on cuda, not support rocm, so we add this branch - if (WITH_ROCM) - string(FIND "${first_registry}" "cuda_only" pos) - if(pos GREATER 1) - continue() - endif() - endif() - # parse the first kernel name - string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_name "${first_registry}") - string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}") - string(REPLACE "," "" kernel_name "${kernel_name}") - string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}") - string(REGEX REPLACE "//cuda_only" "" kernel_name "${kernel_name}") - # append kernel declare into declarations.h - # TODO(chenweihang): default declare ALL_LAYOUT for each kernel - if (${kernel_path} MATCHES "./cpu\/") - file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") - elseif (${kernel_path} MATCHES "./gpu\/") - file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n") - elseif (${kernel_path} MATCHES "./xpu\/") - file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") - elseif (${kernel_path} MATCHES "./gpudnn\/") - file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n") - elseif (${kernel_path} MATCHES "./kps\/") - file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n") - else () - # deal with device independent kernel, now we use CPU temporaary - file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") - endif() + foreach(kernel_path ${TARGET_LIST}) + file(READ ${kernel_path} kernel_impl) + string( + REGEX + MATCH + "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*,[ \t\r\n\/]*[a-z0-9_]*" + first_registry + "${kernel_impl}") + if(NOT first_registry STREQUAL "") + # some gpu kernel only can run on cuda, not support rocm, so we add this branch + if(WITH_ROCM) + string(FIND "${first_registry}" "cuda_only" pos) + if(pos GREATER 1) + continue() endif() - endforeach() + endif() + # parse the first kernel name + string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_name "${first_registry}") + string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_name + "${kernel_name}") + string(REPLACE "," "" kernel_name "${kernel_name}") + string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}") + string(REGEX REPLACE "//cuda_only" "" kernel_name "${kernel_name}") + # append kernel declare into declarations.h + # TODO(chenweihang): default declare ALL_LAYOUT for each kernel + if(${kernel_path} MATCHES "./cpu\/") + file(APPEND ${kernel_declare_file} + "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") + elseif(${kernel_path} MATCHES "./gpu\/") + file(APPEND ${kernel_declare_file} + "PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n") + elseif(${kernel_path} MATCHES "./xpu\/") + file(APPEND ${kernel_declare_file} + "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") + elseif(${kernel_path} MATCHES "./gpudnn\/") + file(APPEND ${kernel_declare_file} + "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n") + elseif(${kernel_path} MATCHES "./kps\/") + file(APPEND ${kernel_declare_file} + "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n") + else() + # deal with device independent kernel, now we use CPU temporaary + file(APPEND ${kernel_declare_file} + "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") + endif() + endif() + endforeach() endfunction() function(kernel_library TARGET) - set(common_srcs) - set(cpu_srcs) - set(gpu_srcs) - set(xpu_srcs) - set(gpudnn_srcs) - set(kps_srcs) - # parse and save the deps kerenl targets - set(all_srcs) - set(kernel_deps) + set(common_srcs) + set(cpu_srcs) + set(gpu_srcs) + set(xpu_srcs) + set(gpudnn_srcs) + set(kps_srcs) + # parse and save the deps kerenl targets + set(all_srcs) + set(kernel_deps) - set(oneValueArgs SUB_DIR) - set(multiValueArgs SRCS DEPS) - set(target_build_flag 1) + set(oneValueArgs SUB_DIR) + set(multiValueArgs SRCS DEPS) + set(target_build_flag 1) - cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) - # used for cc_library selected_rows dir target - set(target_suffix "") - if ("${kernel_library_SUB_DIR}" STREQUAL "selected_rows") - set(target_suffix "_sr") + # used for cc_library selected_rows dir target + set(target_suffix "") + if("${kernel_library_SUB_DIR}" STREQUAL "selected_rows") + set(target_suffix "_sr") + endif() + if("${kernel_library_SUB_DIR}" STREQUAL "sparse") + set(target_suffix "_sp") + endif() + + list(LENGTH kernel_library_SRCS kernel_library_SRCS_len) + # one kernel only match one impl file in each backend + if(${kernel_library_SRCS_len} EQUAL 0) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) endif() - if ("${kernel_library_SUB_DIR}" STREQUAL "sparse") - set(target_suffix "_sp") + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc) + list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc) endif() - - list(LENGTH kernel_library_SRCS kernel_library_SRCS_len) - # one kernel only match one impl file in each backend - if (${kernel_library_SRCS_len} EQUAL 0) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) - list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc) - list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc) - endif() - if (WITH_GPU OR WITH_ROCM) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu) - list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) - list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) - list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) - list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) - endif() - endif() - if (WITH_XPU) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) - list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) - endif() - endif() - if (WITH_XPU_KP) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) - # Change XPU2 file suffix - # NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu - file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps) - file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) - list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) - endif() - endif() - else() - # TODO(chenweihang): impl compile by source later + if(WITH_GPU OR WITH_ROCM) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu) + list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) + list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) + list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) + endif() endif() - - list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.h) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/impl/${TARGET}_impl.h) - list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/impl/${TARGET}_impl.h) + if(WITH_XPU) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) + list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) + endif() endif() - list(APPEND all_srcs ${common_srcs}) - list(APPEND all_srcs ${cpu_srcs}) - list(APPEND all_srcs ${gpu_srcs}) - list(APPEND all_srcs ${xpu_srcs}) - list(APPEND all_srcs ${gpudnn_srcs}) - list(APPEND all_srcs ${kps_srcs}) + if(WITH_XPU_KP) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + # Change XPU2 file suffix + # NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu + file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu + ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) + list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) + endif() + endif() + else() + # TODO(chenweihang): impl compile by source later + endif() - set(all_include_kernels) - set(all_kernel_name) + list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.h) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/impl/${TARGET}_impl.h) + list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/impl/${TARGET}_impl.h) + endif() + list(APPEND all_srcs ${common_srcs}) + list(APPEND all_srcs ${cpu_srcs}) + list(APPEND all_srcs ${gpu_srcs}) + list(APPEND all_srcs ${xpu_srcs}) + list(APPEND all_srcs ${gpudnn_srcs}) + list(APPEND all_srcs ${kps_srcs}) - foreach(src ${all_srcs}) - file(READ ${src} target_content) - # "kernels/xxx"(DenseTensor Kernel) can only include each other, but can't include "SUB_DIR/xxx" (such as selected_rows Kernel) - string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) - list(APPEND all_include_kernels ${include_kernels}) + set(all_include_kernels) + set(all_kernel_name) - # "SUB_DIR/xxx" can include "kernels/xx" and "SUB_DIR/xxx" - if (NOT "${kernel_library_SUB_DIR}" STREQUAL "") - string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) - list(APPEND all_include_kernels ${include_kernels}) - endif() + foreach(src ${all_srcs}) + file(READ ${src} target_content) + # "kernels/xxx"(DenseTensor Kernel) can only include each other, but can't include "SUB_DIR/xxx" (such as selected_rows Kernel) + string(REGEX MATCHALL + "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" + include_kernels ${target_content}) + list(APPEND all_include_kernels ${include_kernels}) - foreach(include_kernel ${all_include_kernels}) - if ("${kernel_library_SUB_DIR}" STREQUAL "") - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) - string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) - list(APPEND all_kernel_name ${kernel_name}) - else() - # NOTE(dev): we should firstly match kernel_library_SUB_DIR. - if (${include_kernel} MATCHES "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/") - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) - # for selected_rows directory, add ${target_suffix}. - string(REGEX REPLACE ".h\"" "${target_suffix}" kernel_name ${kernel_name}) - list(APPEND all_kernel_name ${kernel_name}) - else() - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) - string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) - list(APPEND all_kernel_name ${kernel_name}) - endif() - endif() - list(APPEND kernel_deps ${all_kernel_name}) - endforeach() + # "SUB_DIR/xxx" can include "kernels/xx" and "SUB_DIR/xxx" + if(NOT "${kernel_library_SUB_DIR}" STREQUAL "") + string( + REGEX + MATCHALL + "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" + include_kernels + ${target_content}) + list(APPEND all_include_kernels ${include_kernels}) + endif() + + foreach(include_kernel ${all_include_kernels}) + if("${kernel_library_SUB_DIR}" STREQUAL "") + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name + ${include_kernel}) + string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) + list(APPEND all_kernel_name ${kernel_name}) + else() + # NOTE(dev): we should firstly match kernel_library_SUB_DIR. + if(${include_kernel} MATCHES + "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/") + string( + REGEX + REPLACE + "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" + kernel_name ${include_kernel}) + # for selected_rows directory, add ${target_suffix}. + string(REGEX REPLACE ".h\"" "${target_suffix}" kernel_name + ${kernel_name}) + list(APPEND all_kernel_name ${kernel_name}) + else() + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" + kernel_name ${include_kernel}) + string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) + list(APPEND all_kernel_name ${kernel_name}) + endif() + endif() + list(APPEND kernel_deps ${all_kernel_name}) endforeach() - list(REMOVE_DUPLICATES kernel_deps) - list(REMOVE_ITEM kernel_deps ${TARGET}${target_suffix}) + endforeach() + list(REMOVE_DUPLICATES kernel_deps) + list(REMOVE_ITEM kernel_deps ${TARGET}${target_suffix}) - list(LENGTH common_srcs common_srcs_len) - list(LENGTH cpu_srcs cpu_srcs_len) - list(LENGTH gpu_srcs gpu_srcs_len) - list(LENGTH xpu_srcs xpu_srcs_len) - list(LENGTH gpudnn_srcs gpudnn_srcs_len) - list(LENGTH kps_srcs kps_srcs_len) + list(LENGTH common_srcs common_srcs_len) + list(LENGTH cpu_srcs cpu_srcs_len) + list(LENGTH gpu_srcs gpu_srcs_len) + list(LENGTH xpu_srcs xpu_srcs_len) + list(LENGTH gpudnn_srcs gpudnn_srcs_len) + list(LENGTH kps_srcs kps_srcs_len) - # kernel source file level - # level 1: base device kernel (if any device or dnn kernel exists, the cpu_kernel must be exists!!!) - # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs - # = dnn srcs: gpudnn_srcs - # level 2: device-independent kernel - # - common_srcs + # kernel source file level + # level 1: base device kernel (if any device or dnn kernel exists, the cpu_kernel must be exists!!!) + # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs + # = dnn srcs: gpudnn_srcs + # level 2: device-independent kernel + # - common_srcs - set(partial_build_flag 0) - set(base_build_flag 0) - if (${common_srcs_len} GREATER 0) - set(partial_build_flag 1) - endif() - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) - set(base_build_flag 1) - endif() + set(partial_build_flag 0) + set(base_build_flag 0) + if(${common_srcs_len} GREATER 0) + set(partial_build_flag 1) + endif() + if(${cpu_srcs_len} GREATER 0 + OR ${gpu_srcs_len} GREATER 0 + OR ${xpu_srcs_len} GREATER 0 + OR ${kps_srcs_len} GREATER 0) + set(base_build_flag 1) + endif() - # gpudnn or mkldnn needs to be compiled separately - set(dnn_kernels) - if (${gpudnn_srcs_len} GREATER 0) - if (WITH_GPU) - nv_library(${TARGET}_gpudnn${target_suffix} SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - elseif (WITH_ROCM) - hip_library(${TARGET}_gpudnn${target_suffix} SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - list(APPEND dnn_kernels ${TARGET}_gpudnn${target_suffix}) + # gpudnn or mkldnn needs to be compiled separately + set(dnn_kernels) + if(${gpudnn_srcs_len} GREATER 0) + if(WITH_GPU) + nv_library( + ${TARGET}_gpudnn${target_suffix} + SRCS ${gpudnn_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) + elseif(WITH_ROCM) + hip_library( + ${TARGET}_gpudnn${target_suffix} + SRCS ${gpudnn_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - list(LENGTH dnn_kernels dnn_kernels_len) + list(APPEND dnn_kernels ${TARGET}_gpudnn${target_suffix}) + endif() + list(LENGTH dnn_kernels dnn_kernels_len) - if (${partial_build_flag} EQUAL 0 AND ${base_build_flag} EQUAL 1) - if (WITH_GPU) - if (${dnn_kernels_len} GREATER 0) - nv_library(${TARGET}_base${target_suffix} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET}${target_suffix} DEPS ${TARGET}_base${target_suffix} ${dnn_kernels}) - else() - nv_library(${TARGET}${target_suffix} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - elseif (WITH_ROCM) - if (${dnn_kernels_len} GREATER 0) - hip_library(${TARGET}_base${target_suffix} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET}${target_suffix} DEPS ${TARGET}_base${target_suffix} ${dnn_kernels}) - else() - hip_library(${TARGET}${target_suffix} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - elseif (WITH_XPU_KP) - xpu_library(${TARGET}${target_suffix} SRCS ${cpu_srcs} ${kps_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - else() - cc_library(${TARGET}${target_suffix} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - elseif (${partial_build_flag} EQUAL 1 AND ${base_build_flag} EQUAL 1) - if (WITH_GPU) - nv_library(${TARGET}_base${target_suffix} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${TARGET}_base${target_suffix} ${dnn_kernels}) - elseif (WITH_ROCM) - hip_library(${TARGET}_base${target_suffix} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${TARGET}_base${target_suffix} ${dnn_kernels}) - elseif (WITH_XPU_KP) - xpu_library(${TARGET}_base${target_suffix} SRCS ${cpu_srcs} ${kps_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - xpu_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${TARGET}_base${target_suffix}) - else() - cc_library(${TARGET}_base${target_suffix} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${TARGET}_base${target_suffix}) - endif() - elseif (${partial_build_flag} EQUAL 1 AND ${base_build_flag} EQUAL 0) - if (WITH_GPU) - nv_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - elseif (WITH_ROCM) - hip_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - elseif (WITH_XPU_KP) - xpu_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - else() - cc_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + if(${partial_build_flag} EQUAL 0 AND ${base_build_flag} EQUAL 1) + if(WITH_GPU) + if(${dnn_kernels_len} GREATER 0) + nv_library( + ${TARGET}_base${target_suffix} + SRCS ${cpu_srcs} ${gpu_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET}${target_suffix} DEPS ${TARGET}_base${target_suffix} + ${dnn_kernels}) + else() + nv_library( + ${TARGET}${target_suffix} + SRCS ${cpu_srcs} ${gpu_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() + elseif(WITH_ROCM) + if(${dnn_kernels_len} GREATER 0) + hip_library( + ${TARGET}_base${target_suffix} + SRCS ${cpu_srcs} ${gpu_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET}${target_suffix} + DEPS ${TARGET}_base${target_suffix} ${dnn_kernels}) + else() + hip_library( + ${TARGET}${target_suffix} + SRCS ${cpu_srcs} ${gpu_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() + elseif(WITH_XPU_KP) + xpu_library( + ${TARGET}${target_suffix} + SRCS ${cpu_srcs} ${kps_srcs} ${xpu_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) + else() + cc_library( + ${TARGET}${target_suffix} + SRCS ${cpu_srcs} ${xpu_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() + elseif(${partial_build_flag} EQUAL 1 AND ${base_build_flag} EQUAL 1) + if(WITH_GPU) + nv_library( + ${TARGET}_base${target_suffix} + SRCS ${cpu_srcs} ${gpu_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library( + ${TARGET}${target_suffix} + SRCS ${common_srcs} + DEPS ${TARGET}_base${target_suffix} ${dnn_kernels}) + elseif(WITH_ROCM) + hip_library( + ${TARGET}_base${target_suffix} + SRCS ${cpu_srcs} ${gpu_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library( + ${TARGET}${target_suffix} + SRCS ${common_srcs} + DEPS ${TARGET}_base${target_suffix} ${dnn_kernels}) + elseif(WITH_XPU_KP) + xpu_library( + ${TARGET}_base${target_suffix} + SRCS ${cpu_srcs} ${kps_srcs} ${xpu_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library( + ${TARGET}${target_suffix} + SRCS ${common_srcs} + DEPS ${TARGET}_base${target_suffix}) + else() + cc_library( + ${TARGET}_base${target_suffix} + SRCS ${cpu_srcs} ${xpu_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library( + ${TARGET}${target_suffix} + SRCS ${common_srcs} + DEPS ${TARGET}_base${target_suffix}) + endif() + elseif(${partial_build_flag} EQUAL 1 AND ${base_build_flag} EQUAL 0) + if(WITH_GPU) + nv_library( + ${TARGET}${target_suffix} + SRCS ${common_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) + elseif(WITH_ROCM) + hip_library( + ${TARGET}${target_suffix} + SRCS ${common_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) + elseif(WITH_XPU_KP) + xpu_library( + ${TARGET}${target_suffix} + SRCS ${common_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) else() - set(target_build_flag 0) + cc_library( + ${TARGET}${target_suffix} + SRCS ${common_srcs} + DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() + else() + set(target_build_flag 0) + endif() - if (${target_build_flag} EQUAL 1) - if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR - ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR - ${gpudnn_srcs_len} GREATER 0) - # append target into PHI_KERNELS property - get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) - set(phi_kernels ${phi_kernels} ${TARGET}${target_suffix}) - set_property(GLOBAL PROPERTY PHI_KERNELS ${phi_kernels}) - endif() + if(${target_build_flag} EQUAL 1) + if(${common_srcs_len} GREATER 0 + OR ${cpu_srcs_len} GREATER 0 + OR ${gpu_srcs_len} GREATER 0 + OR ${xpu_srcs_len} GREATER 0 + OR ${kps_srcs_len} GREATER 0 + OR ${gpudnn_srcs_len} GREATER 0) + # append target into PHI_KERNELS property + get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) + set(phi_kernels ${phi_kernels} ${TARGET}${target_suffix}) + set_property(GLOBAL PROPERTY PHI_KERNELS ${phi_kernels}) + endif() - # parse kernel name and auto generate kernel declaration - # here, we don't need to check WITH_XXX, because if not WITH_XXX, the - # xxx_srcs_len will be equal to 0 - if (${common_srcs_len} GREATER 0) - kernel_declare(${common_srcs}) - endif() - if (${cpu_srcs_len} GREATER 0) - kernel_declare(${cpu_srcs}) - endif() - if (${gpu_srcs_len} GREATER 0) - kernel_declare(${gpu_srcs}) - endif() - if (${xpu_srcs_len} GREATER 0) - kernel_declare(${xpu_srcs}) - endif() - if (${gpudnn_srcs_len} GREATER 0) - kernel_declare(${gpudnn_srcs}) - endif() - if (${kps_srcs_len} GREATER 0) - kernel_declare(${kps_srcs}) - endif() + # parse kernel name and auto generate kernel declaration + # here, we don't need to check WITH_XXX, because if not WITH_XXX, the + # xxx_srcs_len will be equal to 0 + if(${common_srcs_len} GREATER 0) + kernel_declare(${common_srcs}) + endif() + if(${cpu_srcs_len} GREATER 0) + kernel_declare(${cpu_srcs}) + endif() + if(${gpu_srcs_len} GREATER 0) + kernel_declare(${gpu_srcs}) + endif() + if(${xpu_srcs_len} GREATER 0) + kernel_declare(${xpu_srcs}) endif() + if(${gpudnn_srcs_len} GREATER 0) + kernel_declare(${gpudnn_srcs}) + endif() + if(${kps_srcs_len} GREATER 0) + kernel_declare(${kps_srcs}) + endif() + endif() endfunction() function(register_kernels) - set(options "") - set(oneValueArgs SUB_DIR) - set(multiValueArgs EXCLUDES DEPS) - cmake_parse_arguments(register_kernels "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) + set(options "") + set(oneValueArgs SUB_DIR) + set(multiValueArgs EXCLUDES DEPS) + cmake_parse_arguments(register_kernels "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) - file(GLOB KERNELS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_kernel.h") - string(REPLACE ".h" "" KERNELS "${KERNELS}") - list(LENGTH register_kernels_DEPS register_kernels_DEPS_len) + file( + GLOB KERNELS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "*_kernel.h") + string(REPLACE ".h" "" KERNELS "${KERNELS}") + list(LENGTH register_kernels_DEPS register_kernels_DEPS_len) - foreach(target ${KERNELS}) - list(FIND register_kernels_EXCLUDES ${target} _index) - if (${_index} EQUAL -1) - if (${register_kernels_DEPS_len} GREATER 0) - kernel_library(${target} DEPS ${register_kernels_DEPS} SUB_DIR ${register_kernels_SUB_DIR}) - else() - kernel_library(${target} SUB_DIR ${register_kernels_SUB_DIR}) - endif() - endif() - endforeach() + foreach(target ${KERNELS}) + list(FIND register_kernels_EXCLUDES ${target} _index) + if(${_index} EQUAL -1) + if(${register_kernels_DEPS_len} GREATER 0) + kernel_library(${target} DEPS ${register_kernels_DEPS} SUB_DIR + ${register_kernels_SUB_DIR}) + else() + kernel_library(${target} SUB_DIR ${register_kernels_SUB_DIR}) + endif() + endif() + endforeach() endfunction() function(append_op_util_declare TARGET) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content) - string(REGEX MATCH "(PD_REGISTER_BASE_KERNEL_NAME|PD_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}") - string(REPLACE "PD_REGISTER_ARG_MAPPING_FN" "PD_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}") - string(REPLACE "PD_REGISTER_BASE_KERNEL_NAME" "PD_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}") - string(APPEND util_declare ");\n") - file(APPEND ${op_utils_header} "${util_declare}") + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content) + string( + REGEX + MATCH + "(PD_REGISTER_BASE_KERNEL_NAME|PD_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" + util_registrar + "${target_content}") + string(REPLACE "PD_REGISTER_ARG_MAPPING_FN" "PD_DECLARE_ARG_MAPPING_FN" + util_declare "${util_registrar}") + string(REPLACE "PD_REGISTER_BASE_KERNEL_NAME" "PD_DECLARE_BASE_KERNEL_NAME" + util_declare "${util_declare}") + string(APPEND util_declare ");\n") + file(APPEND ${op_utils_header} "${util_declare}") endfunction() function(register_op_utils TARGET_NAME) - set(utils_srcs) - set(options "") - set(oneValueArgs "") - set(multiValueArgs EXCLUDES DEPS) - cmake_parse_arguments(register_op_utils "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) + set(utils_srcs) + set(options "") + set(oneValueArgs "") + set(multiValueArgs EXCLUDES DEPS) + cmake_parse_arguments(register_op_utils "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) - file(GLOB SIGNATURES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_sig.cc") - foreach(target ${SIGNATURES}) - append_op_util_declare(${target}) - list(APPEND utils_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${target}) - endforeach() + file( + GLOB SIGNATURES + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "*_sig.cc") + foreach(target ${SIGNATURES}) + append_op_util_declare(${target}) + list(APPEND utils_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${target}) + endforeach() - cc_library(${TARGET_NAME} SRCS ${utils_srcs} DEPS ${register_op_utils_DEPS}) + cc_library( + ${TARGET_NAME} + SRCS ${utils_srcs} + DEPS ${register_op_utils_DEPS}) endfunction() diff --git a/cmake/phi_header.cmake b/cmake/phi_header.cmake index b23b4086b18f2..fa5b6724ce89a 100644 --- a/cmake/phi_header.cmake +++ b/cmake/phi_header.cmake @@ -12,32 +12,42 @@ # See the License for the specific language governing permissions and # limitations under the License. -set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir") +set(PADDLE_INFERENCE_INSTALL_DIR + "${CMAKE_BINARY_DIR}/paddle_inference_install_dir") function(phi_header_path_compat TARGET_PATH) -message(STATUS "phi header path compat processing: ${TARGET_PATH}") -string(FIND ${TARGET_PATH} "experimental" pos) -if (pos GREATER 1) + message(STATUS "phi header path compat processing: ${TARGET_PATH}") + string(FIND ${TARGET_PATH} "experimental" pos) + if(pos GREATER 1) file(GLOB HEADERS "${TARGET_PATH}/*" "*.h") foreach(header ${HEADERS}) - if (${header} MATCHES ".*.h$") - file(READ ${header} HEADER_CONTENT) - string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/" HEADER_CONTENT "${HEADER_CONTENT}") - string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/" HEADER_CONTENT "${HEADER_CONTENT}") - file(WRITE ${header} "${HEADER_CONTENT}") - message(STATUS "phi header path compat processing complete: ${header}") - endif() + if(${header} MATCHES ".*.h$") + file(READ ${header} HEADER_CONTENT) + string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/" + HEADER_CONTENT "${HEADER_CONTENT}") + string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/" + HEADER_CONTENT "${HEADER_CONTENT}") + file(WRITE ${header} "${HEADER_CONTENT}") + message(STATUS "phi header path compat processing complete: ${header}") + endif() endforeach() -endif() + endif() endfunction() -phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental) -phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api) -phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext) -phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include) -phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common) -phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core) +phi_header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental) +phi_header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api) +phi_header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext) +phi_header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include) +phi_header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common) +phi_header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core) # In order to be compatible with the original behavior, the header file name needs to be changed -file(RENAME ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/extension.h - ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/ext_all.h) +file(RENAME + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/extension.h + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/ext_all.h) diff --git a/cmake/python_module.cmake b/cmake/python_module.cmake index 1412b7f7f2060..9367435b61b55 100644 --- a/cmake/python_module.cmake +++ b/cmake/python_module.cmake @@ -2,42 +2,49 @@ # Found at http://www.cmake.org/pipermail/cmake/2011-January/041666.html # To use do: find_python_module(PyQt4 REQUIRED) function(find_python_module module) - string(TOUPPER ${module} module_upper) - if(NOT PY_${module_upper}) - if(ARGC GREATER 1 AND ARGV1 STREQUAL "REQUIRED") - set(${module}_FIND_REQUIRED TRUE) - else() - set(${module}_FIND_REQUIRED FALSE) - endif() - # A module's location is usually a directory, but for binary modules - # it's a .so file. - execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" - "import re, ${module}; print(re.compile('/__init__.py.*').sub('',${module}.__file__))" - RESULT_VARIABLE _${module}_status - OUTPUT_VARIABLE _${module}_location - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - if(NOT _${module}_status) - set(PY_${module_upper} ${_${module}_location} CACHE STRING - "Location of Python module ${module}") - endif(NOT _${module}_status) - endif(NOT PY_${module_upper}) - find_package_handle_standard_args(PY_${module} DEFAULT_MSG PY_${module_upper}) - if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED) - message(FATAL_ERROR "python module ${module} is not found") + string(TOUPPER ${module} module_upper) + if(NOT PY_${module_upper}) + if(ARGC GREATER 1 AND ARGV1 STREQUAL "REQUIRED") + set(${module}_FIND_REQUIRED TRUE) + else() + set(${module}_FIND_REQUIRED FALSE) endif() - - execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" - "import sys, ${module}; sys.stdout.write(${module}.__version__)" - OUTPUT_VARIABLE _${module}_version - RESULT_VARIABLE _${module}_status - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) + # A module's location is usually a directory, but for binary modules + # it's a .so file. + execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" "-c" + "import re, ${module}; print(re.compile('/__init__.py.*').sub('',${module}.__file__))" + RESULT_VARIABLE _${module}_status + OUTPUT_VARIABLE _${module}_location + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) if(NOT _${module}_status) - set(PY_${module_upper}_VERSION ${_${module}_version} CACHE STRING - "Version of Python module ${module}") + set(PY_${module_upper} + ${_${module}_location} + CACHE STRING "Location of Python module ${module}") endif(NOT _${module}_status) + endif(NOT PY_${module_upper}) + find_package_handle_standard_args(PY_${module} DEFAULT_MSG PY_${module_upper}) + if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED) + message(FATAL_ERROR "python module ${module} is not found") + endif() + + execute_process( + COMMAND "${PYTHON_EXECUTABLE}" "-c" + "import sys, ${module}; sys.stdout.write(${module}.__version__)" + OUTPUT_VARIABLE _${module}_version + RESULT_VARIABLE _${module}_status + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT _${module}_status) + set(PY_${module_upper}_VERSION + ${_${module}_version} + CACHE STRING "Version of Python module ${module}") + endif(NOT _${module}_status) - set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} PARENT_SCOPE) - set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE) + set(PY_${module_upper}_FOUND + ${PY_${module_upper}_FOUND} + PARENT_SCOPE) + set(PY_${module_upper}_VERSION + ${PY_${module_upper}_VERSION} + PARENT_SCOPE) endfunction(find_python_module) diff --git a/cmake/rccl.cmake b/cmake/rccl.cmake index f3a472ac930de..1f78c74f40e64 100644 --- a/cmake/rccl.cmake +++ b/cmake/rccl.cmake @@ -1,28 +1,30 @@ if(NOT WITH_ROCM) - return() + return() endif() # Now we don't support RCCL on windows if(WIN32) - return() + return() endif() if(WITH_RCCL) - set(RCCL_ROOT ${ROCM_PATH}/rccl CACHE PATH "RCCL ROOT") - find_path(RCCL_INCLUDE_DIR rccl.h - PATHS ${RCCL_ROOT} ${RCCL_ROOT}/include ${RCCL_ROOT}/local/include - $ENV{RCCL_ROOT} $ENV{RCCL_ROOT}/include $ENV{RCCL_ROOT}/local/include - NO_DEFAULT_PATH - ) + set(RCCL_ROOT + ${ROCM_PATH}/rccl + CACHE PATH "RCCL ROOT") + find_path( + RCCL_INCLUDE_DIR rccl.h + PATHS ${RCCL_ROOT} ${RCCL_ROOT}/include ${RCCL_ROOT}/local/include + $ENV{RCCL_ROOT} $ENV{RCCL_ROOT}/include $ENV{RCCL_ROOT}/local/include + NO_DEFAULT_PATH) - file(READ ${RCCL_INCLUDE_DIR}/rccl.h RCCL_VERSION_FILE_CONTENTS) + file(READ ${RCCL_INCLUDE_DIR}/rccl.h RCCL_VERSION_FILE_CONTENTS) - string(REGEX MATCH "define NCCL_VERSION_CODE +([0-9]+)" - RCCL_VERSION "${RCCL_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define NCCL_VERSION_CODE +([0-9]+)" "\\1" - RCCL_VERSION "${RCCL_VERSION}") + string(REGEX MATCH "define NCCL_VERSION_CODE +([0-9]+)" RCCL_VERSION + "${RCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define NCCL_VERSION_CODE +([0-9]+)" "\\1" RCCL_VERSION + "${RCCL_VERSION}") - # 2604 for ROCM3.5 and 2708 for ROCM 3.9 - message(STATUS "Current RCCL header is ${RCCL_INCLUDE_DIR}/rccl.h. " - "Current RCCL version is v${RCCL_VERSION}. ") + # 2604 for ROCM3.5 and 2708 for ROCM 3.9 + message(STATUS "Current RCCL header is ${RCCL_INCLUDE_DIR}/rccl.h. " + "Current RCCL version is v${RCCL_VERSION}. ") endif() diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 566dc75fda019..ff8b9d6f9a9b4 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -4,49 +4,62 @@ include(CheckCXXSourceRuns) include(CheckCXXSourceCompiles) -if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") - set(MMX_FLAG "-mmmx") - set(SSE2_FLAG "-msse2") - set(SSE3_FLAG "-msse3") - set(AVX_FLAG "-mavx") - set(AVX2_FLAG "-mavx2") - set(AVX512F_FLAG "-mavx512f") +if(CMAKE_COMPILER_IS_GNUCC + OR CMAKE_COMPILER_IS_GNUCXX + OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(MMX_FLAG "-mmmx") + set(SSE2_FLAG "-msse2") + set(SSE3_FLAG "-msse3") + set(AVX_FLAG "-mavx") + set(AVX2_FLAG "-mavx2") + set(AVX512F_FLAG "-mavx512f") elseif(MSVC) - set(MMX_FLAG "/arch:MMX") - set(SSE2_FLAG "/arch:SSE2") - set(SSE3_FLAG "/arch:SSE3") - SET(AVX_FLAG "/arch:AVX") - SET(AVX2_FLAG "/arch:AVX2") + set(MMX_FLAG "/arch:MMX") + set(SSE2_FLAG "/arch:SSE2") + set(SSE3_FLAG "/arch:SSE3") + set(AVX_FLAG "/arch:AVX") + set(AVX2_FLAG "/arch:AVX2") endif() set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) # Check MMX set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG}) -set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" +set(MMX_FOUND_EXITCODE + 1 + CACHE STRING "Result from TRY_RUN" FORCE) +check_cxx_source_runs( + " #include int main() { _mm_setzero_si64(); return 0; -}" MMX_FOUND) +}" + MMX_FOUND) # Check SSE2 set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG}) -set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" +set(SSE2_FOUND_EXITCODE + 1 + CACHE STRING "Result from TRY_RUN" FORCE) +check_cxx_source_runs( + " #include int main() { _mm_setzero_si128(); return 0; -}" SSE2_FOUND) +}" + SSE2_FOUND) # Check SSE3 set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG}) -set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" +set(SSE3_FOUND_EXITCODE + 1 + CACHE STRING "Result from TRY_RUN" FORCE) +check_cxx_source_runs( + " #include int main() { @@ -55,12 +68,16 @@ int main() __m128d result = _mm_addsub_pd(a, b); result = _mm_movedup_pd(result); return 0; -}" SSE3_FOUND) +}" + SSE3_FOUND) # Check AVX set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) -set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" +set(AVX_FOUND_EXITCODE + 1 + CACHE STRING "Result from TRY_RUN" FORCE) +check_cxx_source_runs( + " #include int main() { @@ -68,24 +85,32 @@ int main() __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); __m256 result = _mm256_add_ps (a, b); return 0; -}" AVX_FOUND) +}" + AVX_FOUND) # Check AVX 2 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) -set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" +set(AVX2_FOUND_EXITCODE + 1 + CACHE STRING "Result from TRY_RUN" FORCE) +check_cxx_source_runs( + " #include int main() { __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); __m256i result = _mm256_abs_epi32 (a); return 0; -}" AVX2_FOUND) +}" + AVX2_FOUND) # Check AVX512F set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) -set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" +set(AVX512F_FOUND_EXITCODE + 1 + CACHE STRING "Result from TRY_RUN" FORCE) +check_cxx_source_runs( + " #include int main() { @@ -93,7 +118,9 @@ int main() 13, -5, 6, -7, 9, 2, -6, 3); __m512i result = _mm512_abs_epi32 (a); return 0; -}" AVX512F_FOUND) +}" + AVX512F_FOUND) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) -mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) +mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND + AVX512F_FOUND) diff --git a/cmake/system.cmake b/cmake/system.cmake index c740136b93d52..0562077eae187 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,66 +25,82 @@ if(UNIX AND NOT APPLE) set(LINUX TRUE) endif(UNIX AND NOT APPLE) -IF(WIN32) - SET(HOST_SYSTEM "win32") -ELSE(WIN32) - IF(APPLE) - SET(HOST_SYSTEM "macosx") - EXEC_PROGRAM(sw_vers ARGS -productVersion OUTPUT_VARIABLE HOST_SYSTEM_VERSION) - STRING(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}") - IF(NOT DEFINED $ENV{MACOSX_DEPLOYMENT_TARGET}) - # Set cache variable - end user may change this during ccmake or cmake-gui configure. - SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING - "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.") - ENDIF() - set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") - ELSE(APPLE) +if(WIN32) + set(HOST_SYSTEM "win32") +else(WIN32) + if(APPLE) + set(HOST_SYSTEM "macosx") + exec_program( + sw_vers ARGS + -productVersion + OUTPUT_VARIABLE HOST_SYSTEM_VERSION) + string(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}") + if(NOT DEFINED $ENV{MACOSX_DEPLOYMENT_TARGET}) + # Set cache variable - end user may change this during ccmake or cmake-gui configure. + set(CMAKE_OSX_DEPLOYMENT_TARGET + ${MACOS_VERSION} + CACHE + STRING + "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value." + ) + endif() + set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") + else(APPLE) - IF(EXISTS "/etc/issue") - FILE(READ "/etc/issue" LINUX_ISSUE) - IF(LINUX_ISSUE MATCHES "CentOS") - SET(HOST_SYSTEM "centos") - ELSEIF(LINUX_ISSUE MATCHES "Debian") - SET(HOST_SYSTEM "debian") - ELSEIF(LINUX_ISSUE MATCHES "Ubuntu") - SET(HOST_SYSTEM "ubuntu") - ELSEIF(LINUX_ISSUE MATCHES "Red Hat") - SET(HOST_SYSTEM "redhat") - ELSEIF(LINUX_ISSUE MATCHES "Fedora") - SET(HOST_SYSTEM "fedora") - ENDIF() + if(EXISTS "/etc/issue") + file(READ "/etc/issue" LINUX_ISSUE) + if(LINUX_ISSUE MATCHES "CentOS") + set(HOST_SYSTEM "centos") + elseif(LINUX_ISSUE MATCHES "Debian") + set(HOST_SYSTEM "debian") + elseif(LINUX_ISSUE MATCHES "Ubuntu") + set(HOST_SYSTEM "ubuntu") + elseif(LINUX_ISSUE MATCHES "Red Hat") + set(HOST_SYSTEM "redhat") + elseif(LINUX_ISSUE MATCHES "Fedora") + set(HOST_SYSTEM "fedora") + endif() - STRING(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}") - ENDIF(EXISTS "/etc/issue") + string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION + "${LINUX_ISSUE}") + endif(EXISTS "/etc/issue") - IF(EXISTS "/etc/redhat-release") - FILE(READ "/etc/redhat-release" LINUX_ISSUE) - IF(LINUX_ISSUE MATCHES "CentOS") - SET(HOST_SYSTEM "centos") - ENDIF() - ENDIF(EXISTS "/etc/redhat-release") + if(EXISTS "/etc/redhat-release") + file(READ "/etc/redhat-release" LINUX_ISSUE) + if(LINUX_ISSUE MATCHES "CentOS") + set(HOST_SYSTEM "centos") + endif() + endif(EXISTS "/etc/redhat-release") - IF(NOT HOST_SYSTEM) - SET(HOST_SYSTEM ${CMAKE_SYSTEM_NAME}) - ENDIF() + if(NOT HOST_SYSTEM) + set(HOST_SYSTEM ${CMAKE_SYSTEM_NAME}) + endif() - ENDIF(APPLE) -ENDIF(WIN32) + endif(APPLE) +endif(WIN32) # query number of logical cores -CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES) +cmake_host_system_information(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES) -MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES) +mark_as_advanced(HOST_SYSTEM CPU_CORES) -MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}") -MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") +message( + STATUS + "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}") +message(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") # external dependencies log output -SET(EXTERNAL_PROJECT_LOG_ARGS - LOG_DOWNLOAD 0 # Wrap download in script to log output - LOG_UPDATE 1 # Wrap update in script to log output - LOG_CONFIGURE 1 # Wrap configure in script to log output - LOG_BUILD 0 # Wrap build in script to log output - LOG_TEST 1 # Wrap test in script to log output - LOG_INSTALL 0 # Wrap install in script to log output +set(EXTERNAL_PROJECT_LOG_ARGS + LOG_DOWNLOAD + 0 # Wrap download in script to log output + LOG_UPDATE + 1 # Wrap update in script to log output + LOG_CONFIGURE + 1 # Wrap configure in script to log output + LOG_BUILD + 0 # Wrap build in script to log output + LOG_TEST + 1 # Wrap test in script to log output + LOG_INSTALL + 0 # Wrap install in script to log output ) diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index e4b22befff850..5651ceb76e538 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -1,87 +1,103 @@ if(NOT WITH_GPU OR NOT WITH_TENSORRT) - return() + return() endif() if(WIN32) - string(REPLACE "\\" "/" TENSORRT_ROOT "${TENSORRT_ROOT}") - set(TR_INFER_LIB nvinfer.lib) - set(TR_INFER_RT nvinfer.dll) - set(TR_INFER_PLUGIN_RT nvinfer_plugin.dll) + string(REPLACE "\\" "/" TENSORRT_ROOT "${TENSORRT_ROOT}") + set(TR_INFER_LIB nvinfer.lib) + set(TR_INFER_RT nvinfer.dll) + set(TR_INFER_PLUGIN_RT nvinfer_plugin.dll) else() - set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT") - set(TR_INFER_LIB libnvinfer.a) - set(TR_INFER_RT libnvinfer.so) - set(TR_INFER_PLUGIN_RT libnvinfer_plugin.so) + set(TENSORRT_ROOT + "/usr" + CACHE PATH "TENSORRT ROOT") + set(TR_INFER_LIB libnvinfer.a) + set(TR_INFER_RT libnvinfer.so) + set(TR_INFER_PLUGIN_RT libnvinfer_plugin.so) endif() -find_path(TENSORRT_INCLUDE_DIR NvInfer.h - PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include - ${TENSORRT_ROOT}/include/${CMAKE_LIBRARY_ARCHITECTURE} - $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include - $ENV{TENSORRT_ROOT}/include/${CMAKE_LIBRARY_ARCHITECTURE} - NO_DEFAULT_PATH -) +find_path( + TENSORRT_INCLUDE_DIR NvInfer.h + PATHS ${TENSORRT_ROOT} + ${TENSORRT_ROOT}/include + ${TENSORRT_ROOT}/include/${CMAKE_LIBRARY_ARCHITECTURE} + $ENV{TENSORRT_ROOT} + $ENV{TENSORRT_ROOT}/include + $ENV{TENSORRT_ROOT}/include/${CMAKE_LIBRARY_ARCHITECTURE} + NO_DEFAULT_PATH) -find_path(TENSORRT_LIBRARY_DIR NAMES ${TR_INFER_LIB} ${TR_INFER_RT} - PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib - ${TENSORRT_ROOT}/lib/${CMAKE_LIBRARY_ARCHITECTURE} - $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib - $ENV{TENSORRT_ROOT}/lib/${CMAKE_LIBRARY_ARCHITECTURE} - NO_DEFAULT_PATH - DOC "Path to TensorRT library." -) +find_path( + TENSORRT_LIBRARY_DIR + NAMES ${TR_INFER_LIB} ${TR_INFER_RT} + PATHS ${TENSORRT_ROOT} + ${TENSORRT_ROOT}/lib + ${TENSORRT_ROOT}/lib/${CMAKE_LIBRARY_ARCHITECTURE} + $ENV{TENSORRT_ROOT} + $ENV{TENSORRT_ROOT}/lib + $ENV{TENSORRT_ROOT}/lib/${CMAKE_LIBRARY_ARCHITECTURE} + NO_DEFAULT_PATH + DOC "Path to TensorRT library.") -find_library(TENSORRT_LIBRARY NAMES ${TR_INFER_LIB} ${TR_INFER_RT} - PATHS ${TENSORRT_LIBRARY_DIR} - NO_DEFAULT_PATH - DOC "Path to TensorRT library.") +find_library( + TENSORRT_LIBRARY + NAMES ${TR_INFER_LIB} ${TR_INFER_RT} + PATHS ${TENSORRT_LIBRARY_DIR} + NO_DEFAULT_PATH + DOC "Path to TensorRT library.") if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY) - set(TENSORRT_FOUND ON) + set(TENSORRT_FOUND ON) else() - set(TENSORRT_FOUND OFF) - message(WARNING "TensorRT is disabled. You are compiling PaddlePaddle with option -DWITH_TENSORRT=ON, but TensorRT is not found, please configure path to TensorRT with option -DTENSORRT_ROOT or install it.") + set(TENSORRT_FOUND OFF) + message( + WARNING + "TensorRT is disabled. You are compiling PaddlePaddle with option -DWITH_TENSORRT=ON, but TensorRT is not found, please configure path to TensorRT with option -DTENSORRT_ROOT or install it." + ) endif() if(TENSORRT_FOUND) - file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS) - string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" TENSORRT_MINOR_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" TENSORRT_PATCH_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" TENSORRT_BUILD_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") + file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS) + string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" + TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" + TENSORRT_MINOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" + TENSORRT_PATCH_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" + TENSORRT_BUILD_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") - if("${TENSORRT_MAJOR_VERSION}" STREQUAL "") - file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h TENSORRT_VERSION_FILE_CONTENTS) - string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" TENSORRT_MINOR_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" TENSORRT_PATCH_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" TENSORRT_BUILD_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - endif() + if("${TENSORRT_MAJOR_VERSION}" STREQUAL "") + file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h + TENSORRT_VERSION_FILE_CONTENTS) + string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" + TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" + TENSORRT_MINOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" + TENSORRT_PATCH_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" + TENSORRT_BUILD_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + endif() - if("${TENSORRT_MAJOR_VERSION}" STREQUAL "") - message(SEND_ERROR "Failed to detect TensorRT version.") - endif() + if("${TENSORRT_MAJOR_VERSION}" STREQUAL "") + message(SEND_ERROR "Failed to detect TensorRT version.") + endif() - string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1" - TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}") - string(REGEX REPLACE "define NV_TENSORRT_MINOR +([0-9]+)" "\\1" - TENSORRT_MINOR_VERSION "${TENSORRT_MINOR_VERSION}") - string(REGEX REPLACE "define NV_TENSORRT_PATCH +([0-9]+)" "\\1" - TENSORRT_PATCH_VERSION "${TENSORRT_PATCH_VERSION}") - string(REGEX REPLACE "define NV_TENSORRT_BUILD +([0-9]+)" "\\1" - TENSORRT_BUILD_VERSION "${TENSORRT_BUILD_VERSION}") + string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1" + TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}") + string(REGEX REPLACE "define NV_TENSORRT_MINOR +([0-9]+)" "\\1" + TENSORRT_MINOR_VERSION "${TENSORRT_MINOR_VERSION}") + string(REGEX REPLACE "define NV_TENSORRT_PATCH +([0-9]+)" "\\1" + TENSORRT_PATCH_VERSION "${TENSORRT_PATCH_VERSION}") + string(REGEX REPLACE "define NV_TENSORRT_BUILD +([0-9]+)" "\\1" + TENSORRT_BUILD_VERSION "${TENSORRT_BUILD_VERSION}") - message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " - "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION} ") - include_directories(${TENSORRT_INCLUDE_DIR}) - link_directories(${TENSORRT_LIBRARY}) - add_definitions(-DPADDLE_WITH_TENSORRT) + message( + STATUS + "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " + "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION} " + ) + include_directories(${TENSORRT_INCLUDE_DIR}) + link_directories(${TENSORRT_LIBRARY}) + add_definitions(-DPADDLE_WITH_TENSORRT) endif() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index eb6fa4ee13c81..2004241ab1a76 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -15,10 +15,14 @@ include(ExternalProject) # Creat a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac) -set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING - "A path setting third party libraries download & build directories.") -set(THIRD_PARTY_CACHE_PATH "${CMAKE_SOURCE_DIR}" CACHE STRING - "A path cache third party source code to avoid repeated download.") +set(THIRD_PARTY_PATH + "${CMAKE_BINARY_DIR}/third_party" + CACHE STRING + "A path setting third party libraries download & build directories.") +set(THIRD_PARTY_CACHE_PATH + "${CMAKE_SOURCE_DIR}" + CACHE STRING + "A path cache third party source code to avoid repeated download.") set(THIRD_PARTY_BUILD_TYPE Release) set(third_party_deps) @@ -39,389 +43,457 @@ set(third_party_deps) # TAG ${TARGET_TAG} # DIR ${TARGET_SOURCE_DIR}) -FUNCTION(cache_third_party TARGET) - SET(options "") - SET(oneValueArgs URL REPOSITORY TAG DIR) - SET(multiValueArgs "") - cmake_parse_arguments(cache_third_party "${optionps}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - STRING(REPLACE "extern_" "" TARGET_NAME ${TARGET}) - STRING(REGEX REPLACE "[0-9]+" "" TARGET_NAME ${TARGET_NAME}) - STRING(TOUPPER ${TARGET_NAME} TARGET_NAME) - IF(cache_third_party_REPOSITORY) - SET(${TARGET_NAME}_DOWNLOAD_CMD - GIT_REPOSITORY ${cache_third_party_REPOSITORY}) - IF(cache_third_party_TAG) - LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD - GIT_TAG ${cache_third_party_TAG}) - ENDIF() - ELSEIF(cache_third_party_URL) - SET(${TARGET_NAME}_DOWNLOAD_CMD - URL ${cache_third_party_URL}) - ELSE() - MESSAGE(FATAL_ERROR "Download link (Git repo or URL) must be specified for cache!") - ENDIF() - IF(WITH_TP_CACHE) - IF(NOT cache_third_party_DIR) - MESSAGE(FATAL_ERROR "Please input the ${TARGET_NAME}_SOURCE_DIR for overwriting when -DWITH_TP_CACHE=ON") - ENDIF() - # Generate and verify cache dir for third_party source code - SET(cache_third_party_REPOSITORY ${cache_third_party_REPOSITORY} ${cache_third_party_URL}) - IF(cache_third_party_REPOSITORY AND cache_third_party_TAG) - STRING(MD5 HASH_REPO ${cache_third_party_REPOSITORY}) - STRING(MD5 HASH_GIT ${cache_third_party_TAG}) - STRING(SUBSTRING ${HASH_REPO} 0 8 HASH_REPO) - STRING(SUBSTRING ${HASH_GIT} 0 8 HASH_GIT) - STRING(CONCAT HASH ${HASH_REPO} ${HASH_GIT}) - # overwrite the original SOURCE_DIR when cache directory - SET(${cache_third_party_DIR} ${THIRD_PARTY_CACHE_PATH}/third_party/${TARGET}_${HASH}) - ELSEIF(cache_third_party_REPOSITORY) - STRING(MD5 HASH_REPO ${cache_third_party_REPOSITORY}) - STRING(SUBSTRING ${HASH_REPO} 0 16 HASH) - # overwrite the original SOURCE_DIR when cache directory - SET(${cache_third_party_DIR} ${THIRD_PARTY_CACHE_PATH}/third_party/${TARGET}_${HASH}) - ENDIF() - - IF(EXISTS ${${cache_third_party_DIR}}) - # judge whether the cache dir is empty - FILE(GLOB files ${${cache_third_party_DIR}}/*) - LIST(LENGTH files files_len) - IF(files_len GREATER 0) - list(APPEND ${TARGET_NAME}_DOWNLOAD_CMD DOWNLOAD_COMMAND "") - ENDIF() - ENDIF() - SET(${cache_third_party_DIR} ${${cache_third_party_DIR}} PARENT_SCOPE) - ENDIF() - - # Pass ${TARGET_NAME}_DOWNLOAD_CMD to parent scope, the double quotation marks can't be removed - SET(${TARGET_NAME}_DOWNLOAD_CMD "${${TARGET_NAME}_DOWNLOAD_CMD}" PARENT_SCOPE) -ENDFUNCTION() - -MACRO(UNSET_VAR VAR_NAME) - UNSET(${VAR_NAME} CACHE) - UNSET(${VAR_NAME}) -ENDMACRO() +function(cache_third_party TARGET) + set(options "") + set(oneValueArgs URL REPOSITORY TAG DIR) + set(multiValueArgs "") + cmake_parse_arguments(cache_third_party "${optionps}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + string(REPLACE "extern_" "" TARGET_NAME ${TARGET}) + string(REGEX REPLACE "[0-9]+" "" TARGET_NAME ${TARGET_NAME}) + string(TOUPPER ${TARGET_NAME} TARGET_NAME) + if(cache_third_party_REPOSITORY) + set(${TARGET_NAME}_DOWNLOAD_CMD GIT_REPOSITORY + ${cache_third_party_REPOSITORY}) + if(cache_third_party_TAG) + list(APPEND ${TARGET_NAME}_DOWNLOAD_CMD GIT_TAG ${cache_third_party_TAG}) + endif() + elseif(cache_third_party_URL) + set(${TARGET_NAME}_DOWNLOAD_CMD URL ${cache_third_party_URL}) + else() + message( + FATAL_ERROR "Download link (Git repo or URL) must be specified for cache!" + ) + endif() + if(WITH_TP_CACHE) + if(NOT cache_third_party_DIR) + message( + FATAL_ERROR + "Please input the ${TARGET_NAME}_SOURCE_DIR for overwriting when -DWITH_TP_CACHE=ON" + ) + endif() + # Generate and verify cache dir for third_party source code + set(cache_third_party_REPOSITORY ${cache_third_party_REPOSITORY} + ${cache_third_party_URL}) + if(cache_third_party_REPOSITORY AND cache_third_party_TAG) + string(MD5 HASH_REPO ${cache_third_party_REPOSITORY}) + string(MD5 HASH_GIT ${cache_third_party_TAG}) + string(SUBSTRING ${HASH_REPO} 0 8 HASH_REPO) + string(SUBSTRING ${HASH_GIT} 0 8 HASH_GIT) + string(CONCAT HASH ${HASH_REPO} ${HASH_GIT}) + # overwrite the original SOURCE_DIR when cache directory + set(${cache_third_party_DIR} + ${THIRD_PARTY_CACHE_PATH}/third_party/${TARGET}_${HASH}) + elseif(cache_third_party_REPOSITORY) + string(MD5 HASH_REPO ${cache_third_party_REPOSITORY}) + string(SUBSTRING ${HASH_REPO} 0 16 HASH) + # overwrite the original SOURCE_DIR when cache directory + set(${cache_third_party_DIR} + ${THIRD_PARTY_CACHE_PATH}/third_party/${TARGET}_${HASH}) + endif() + + if(EXISTS ${${cache_third_party_DIR}}) + # judge whether the cache dir is empty + file(GLOB files ${${cache_third_party_DIR}}/*) + list(LENGTH files files_len) + if(files_len GREATER 0) + list(APPEND ${TARGET_NAME}_DOWNLOAD_CMD DOWNLOAD_COMMAND "") + endif() + endif() + set(${cache_third_party_DIR} + ${${cache_third_party_DIR}} + PARENT_SCOPE) + endif() + + # Pass ${TARGET_NAME}_DOWNLOAD_CMD to parent scope, the double quotation marks can't be removed + set(${TARGET_NAME}_DOWNLOAD_CMD + "${${TARGET_NAME}_DOWNLOAD_CMD}" + PARENT_SCOPE) +endfunction() + +macro(UNSET_VAR VAR_NAME) + unset(${VAR_NAME} CACHE) + unset(${VAR_NAME}) +endmacro() # Funciton to Download the dependencies during compilation # This function has 2 parameters, URL / DIRNAME: # 1. URL: The download url of 3rd dependencies # 2. NAME: The name of file, that determin the dirname # -FUNCTION(file_download_and_uncompress URL NAME) +function(file_download_and_uncompress URL NAME) set(options "") set(oneValueArgs MD5) set(multiValueArgs "") - cmake_parse_arguments(URL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}, MD5: ${URL_MD5}") - SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data PARENT_SCOPE) + cmake_parse_arguments(URL "${options}" "${oneValueArgs}" "${multiValueArgs}" + ${ARGN}) + message(STATUS "Download dependence[${NAME}] from ${URL}, MD5: ${URL_MD5}") + set(${NAME}_INCLUDE_DIR + ${THIRD_PARTY_PATH}/${NAME}/data + PARENT_SCOPE) ExternalProject_Add( - download_${NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${THIRD_PARTY_PATH}/${NAME} - URL ${URL} - URL_MD5 ${URL_MD5} - TIMEOUT 120 - DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ - SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND "" - ) - set(third_party_deps ${third_party_deps} download_${NAME} PARENT_SCOPE) -ENDFUNCTION() - + download_${NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${THIRD_PARTY_PATH}/${NAME} + URL ${URL} + URL_MD5 ${URL_MD5} + TIMEOUT 120 + DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ + SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND "") + set(third_party_deps + ${third_party_deps} download_${NAME} + PARENT_SCOPE) +endfunction() # Correction of flags on different Platform(WIN/MAC) and Print Warning Message -if (APPLE) - if(WITH_MKL) - MESSAGE(WARNING - "Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF.") - set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE) - endif() +if(APPLE) + if(WITH_MKL) + message( + WARNING "Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF." + ) + set(WITH_MKL + OFF + CACHE STRING "Disable MKL for building on mac" FORCE) + endif() endif() if(WIN32 OR APPLE) - MESSAGE(STATUS "Disable XBYAK in Windows and MacOS") - SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) - - if(WITH_LIBXSMM) - MESSAGE(WARNING - "Windows, Mac are not supported with libxsmm in Paddle yet." - "Force WITH_LIBXSMM=OFF") - SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE) - endif() - - if(WITH_BOX_PS) - MESSAGE(WARNING - "Windows or Mac is not supported with BOX_PS in Paddle yet." - "Force WITH_BOX_PS=OFF") - SET(WITH_BOX_PS OFF CACHE STRING "Disable BOX_PS package in Windows and MacOS" FORCE) - endif() - - if(WITH_PSLIB) - MESSAGE(WARNING - "Windows or Mac is not supported with PSLIB in Paddle yet." - "Force WITH_PSLIB=OFF") - SET(WITH_PSLIB OFF CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE) - endif() - - if(WITH_ARM_BRPC) - MESSAGE(WARNING - "Windows or Mac is not supported with ARM_BRPC in Paddle yet." - "Force WITH_ARM_BRPC=OFF") - SET(WITH_ARM_BRPC OFF CACHE STRING "Disable ARM_BRPC package in Windows and MacOS" FORCE) - endif() - - if(WITH_LIBMCT) - MESSAGE(WARNING - "Windows or Mac is not supported with LIBMCT in Paddle yet." - "Force WITH_LIBMCT=OFF") - SET(WITH_LIBMCT OFF CACHE STRING "Disable LIBMCT package in Windows and MacOS" FORCE) - endif() - - if(WITH_PSLIB_BRPC) - MESSAGE(WARNING - "Windows or Mac is not supported with PSLIB_BRPC in Paddle yet." - "Force WITH_PSLIB_BRPC=OFF") - SET(WITH_PSLIB_BRPC OFF CACHE STRING "Disable PSLIB_BRPC package in Windows and MacOS" FORCE) - endif() + message(STATUS "Disable XBYAK in Windows and MacOS") + set(WITH_XBYAK + OFF + CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) + + if(WITH_LIBXSMM) + message(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet." + "Force WITH_LIBXSMM=OFF") + set(WITH_LIBXSMM + OFF + CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE) + endif() + + if(WITH_BOX_PS) + message(WARNING "Windows or Mac is not supported with BOX_PS in Paddle yet." + "Force WITH_BOX_PS=OFF") + set(WITH_BOX_PS + OFF + CACHE STRING "Disable BOX_PS package in Windows and MacOS" FORCE) + endif() + + if(WITH_PSLIB) + message(WARNING "Windows or Mac is not supported with PSLIB in Paddle yet." + "Force WITH_PSLIB=OFF") + set(WITH_PSLIB + OFF + CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE) + endif() + + if(WITH_ARM_BRPC) + message( + WARNING "Windows or Mac is not supported with ARM_BRPC in Paddle yet." + "Force WITH_ARM_BRPC=OFF") + set(WITH_ARM_BRPC + OFF + CACHE STRING "Disable ARM_BRPC package in Windows and MacOS" FORCE) + endif() + + if(WITH_LIBMCT) + message(WARNING "Windows or Mac is not supported with LIBMCT in Paddle yet." + "Force WITH_LIBMCT=OFF") + set(WITH_LIBMCT + OFF + CACHE STRING "Disable LIBMCT package in Windows and MacOS" FORCE) + endif() + + if(WITH_PSLIB_BRPC) + message( + WARNING "Windows or Mac is not supported with PSLIB_BRPC in Paddle yet." + "Force WITH_PSLIB_BRPC=OFF") + set(WITH_PSLIB_BRPC + OFF + CACHE STRING "Disable PSLIB_BRPC package in Windows and MacOS" FORCE) + endif() endif() set(WITH_MKLML ${WITH_MKL}) if(NOT DEFINED WITH_MKLDNN) - if(WITH_MKL AND AVX2_FOUND) - set(WITH_MKLDNN ON) - else() - message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN") - set(WITH_MKLDNN OFF) - endif() + if(WITH_MKL AND AVX2_FOUND) + set(WITH_MKLDNN ON) + else() + message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN") + set(WITH_MKLDNN OFF) + endif() endif() -if(WIN32 OR APPLE OR NOT WITH_GPU OR ON_INFER) - set(WITH_DGC OFF) +if(WIN32 + OR APPLE + OR NOT WITH_GPU + OR ON_INFER) + set(WITH_DGC OFF) endif() if(${CMAKE_VERSION} VERSION_GREATER "3.5.2") - set(SHALLOW_CLONE "GIT_SHALLOW TRUE") # adds --depth=1 arg to git clone of External_Projects + set(SHALLOW_CLONE "GIT_SHALLOW TRUE" + )# adds --depth=1 arg to git clone of External_Projects endif() ########################### include third_party according to flags ############################### -include(external/zlib) # download, build, install zlib -include(external/gflags) # download, build, install gflags -include(external/glog) # download, build, install glog -include(external/boost) # download boost -include(external/eigen) # download eigen3 -include(external/threadpool)# download threadpool -include(external/dlpack) # download dlpack -include(external/xxhash) # download, build, install xxhash -include(external/warpctc) # download, build, install warpctc -include(external/utf8proc) # download, build, install utf8proc - -list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) -list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool extern_utf8proc) -include(external/lapack) # download, build, install lapack - -list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) -list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool extern_lapack) - -include(cblas) # find first, then download, build, install openblas +include(external/zlib) # download, build, install zlib +include(external/gflags) # download, build, install gflags +include(external/glog) # download, build, install glog +include(external/boost) # download boost +include(external/eigen) # download eigen3 +include(external/threadpool) # download threadpool +include(external/dlpack) # download dlpack +include(external/xxhash) # download, build, install xxhash +include(external/warpctc) # download, build, install warpctc +include(external/utf8proc) # download, build, install utf8proc + +list( + APPEND + third_party_deps + extern_eigen3 + extern_gflags + extern_glog + extern_boost + extern_xxhash) +list( + APPEND + third_party_deps + extern_zlib + extern_dlpack + extern_warpctc + extern_threadpool + extern_utf8proc) +include(external/lapack) # download, build, install lapack + +list( + APPEND + third_party_deps + extern_eigen3 + extern_gflags + extern_glog + extern_boost + extern_xxhash) +list( + APPEND + third_party_deps + extern_zlib + extern_dlpack + extern_warpctc + extern_threadpool + extern_lapack) + +include(cblas) # find first, then download, build, install openblas message(STATUS "CBLAS_PROVIDER: ${CBLAS_PROVIDER}") if(${CBLAS_PROVIDER} STREQUAL MKLML) - list(APPEND third_party_deps extern_mklml) + list(APPEND third_party_deps extern_mklml) elseif(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) - list(APPEND third_party_deps extern_openblas) + list(APPEND third_party_deps extern_openblas) endif() - if(WITH_MKLDNN) - include(external/mkldnn) # download, build, install mkldnn - list(APPEND third_party_deps extern_mkldnn) + include(external/mkldnn) # download, build, install mkldnn + list(APPEND third_party_deps extern_mkldnn) endif() -include(external/protobuf) # find first, then download, build, install protobuf +include(external/protobuf) # find first, then download, build, install protobuf if(TARGET extern_protobuf) - list(APPEND third_party_deps extern_protobuf) + list(APPEND third_party_deps extern_protobuf) endif() if(WITH_PYTHON) - include(external/python) # find python and python_module - include(external/pybind11) # download pybind11 - list(APPEND third_party_deps extern_pybind) + include(external/python) # find python and python_module + include(external/pybind11) # download pybind11 + list(APPEND third_party_deps extern_pybind) endif() -IF(WITH_TESTING OR WITH_DISTRIBUTE) - include(external/gtest) # download, build, install gtest - list(APPEND third_party_deps extern_gtest) -ENDIF() +if(WITH_TESTING OR WITH_DISTRIBUTE) + include(external/gtest) # download, build, install gtest + list(APPEND third_party_deps extern_gtest) +endif() if(WITH_ONNXRUNTIME) - include(external/onnxruntime) # download, build, install onnxruntime、paddle2onnx - include(external/paddle2onnx) - list(APPEND third_party_deps extern_onnxruntime extern_paddle2onnx) + include(external/onnxruntime + )# download, build, install onnxruntime、paddle2onnx + include(external/paddle2onnx) + list(APPEND third_party_deps extern_onnxruntime extern_paddle2onnx) endif() if(WITH_GPU) - if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) - include(external/cub) # download cub - list(APPEND third_party_deps extern_cub) - endif() - set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" CACHE STRING "" FORCE) - file_download_and_uncompress(${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa) # download file externalErrorMsg.tar.gz - if(WITH_TESTING) - # copy externalErrorMsg.pb, just for unittest can get error message correctly. - set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data) - if(WIN32 AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja")) - set(DST_DIR1 ${CMAKE_BINARY_DIR}/paddle/fluid/third_party/externalError/data) - else() - set(DST_DIR1 ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data) - endif() - set(DST_DIR2 ${CMAKE_BINARY_DIR}/python/paddle/include/third_party/externalError/data) - add_custom_command(TARGET download_externalError POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR1} - COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR2} - COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR}") + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + include(external/cub) # download cub + list(APPEND third_party_deps extern_cub) + endif() + set(URL + "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" + CACHE STRING "" FORCE) + file_download_and_uncompress( + ${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa + )# download file externalErrorMsg.tar.gz + if(WITH_TESTING) + # copy externalErrorMsg.pb, just for unittest can get error message correctly. + set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data) + if(WIN32 AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja")) + set(DST_DIR1 + ${CMAKE_BINARY_DIR}/paddle/fluid/third_party/externalError/data) + else() + set(DST_DIR1 ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data) endif() + set(DST_DIR2 + ${CMAKE_BINARY_DIR}/python/paddle/include/third_party/externalError/data + ) + add_custom_command( + TARGET download_externalError + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR1} + COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR2} + COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR}") + endif() endif(WITH_GPU) if(WITH_XPU) - include(external/xpu) # download, build, install xpu - list(APPEND third_party_deps extern_xpu) + include(external/xpu) # download, build, install xpu + list(APPEND third_party_deps extern_xpu) endif(WITH_XPU) if(WITH_MLU) - include(external/concurrentqueue) # download, build, install concurrentqueue - list(APPEND third_party_deps extern_concurrentqueue) + include(external/concurrentqueue) # download, build, install concurrentqueue + list(APPEND third_party_deps extern_concurrentqueue) endif(WITH_MLU) if(WITH_PSLIB) - include(external/pslib) # download, build, install pslib - list(APPEND third_party_deps extern_pslib) - if(WITH_LIBMCT) - include(external/libmct) # download, build, install libmct - list(APPEND third_party_deps extern_libxsmm) - endif() - if(WITH_PSLIB_BRPC) - include(external/pslib_brpc) # download, build, install pslib_brpc - list(APPEND third_party_deps extern_pslib_brpc) - else() - include(external/snappy) - list(APPEND third_party_deps extern_snappy) - - include(external/leveldb) - list(APPEND third_party_deps extern_leveldb) - if(NOT WITH_HETERPS) - include(external/brpc) - list(APPEND third_party_deps extern_brpc) - endif() + include(external/pslib) # download, build, install pslib + list(APPEND third_party_deps extern_pslib) + if(WITH_LIBMCT) + include(external/libmct) # download, build, install libmct + list(APPEND third_party_deps extern_libxsmm) + endif() + if(WITH_PSLIB_BRPC) + include(external/pslib_brpc) # download, build, install pslib_brpc + list(APPEND third_party_deps extern_pslib_brpc) + else() + include(external/snappy) + list(APPEND third_party_deps extern_snappy) + + include(external/leveldb) + list(APPEND third_party_deps extern_leveldb) + if(NOT WITH_HETERPS) + include(external/brpc) + list(APPEND third_party_deps extern_brpc) endif() + endif() endif(WITH_PSLIB) if(NOT WIN32 AND NOT APPLE) - include(external/gloo) - list(APPEND third_party_deps extern_gloo) + include(external/gloo) + list(APPEND third_party_deps extern_gloo) endif() if(WITH_BOX_PS) - include(external/box_ps) - list(APPEND third_party_deps extern_box_ps) + include(external/box_ps) + list(APPEND third_party_deps extern_box_ps) endif(WITH_BOX_PS) if(WITH_ASCEND OR WITH_ASCEND_CL) - include(external/ascend) - if(WITH_ASCEND OR WITH_ASCEND_CL) - list(APPEND third_party_deps extern_ascend) - endif() - if(WITH_ASCEND_CL) - list(APPEND third_party_deps extern_ascend_cl) - endif() -endif () + include(external/ascend) + if(WITH_ASCEND OR WITH_ASCEND_CL) + list(APPEND third_party_deps extern_ascend) + endif() + if(WITH_ASCEND_CL) + list(APPEND third_party_deps extern_ascend_cl) + endif() +endif() -if (WITH_PSCORE) - include(external/snappy) - list(APPEND third_party_deps extern_snappy) +if(WITH_PSCORE) + include(external/snappy) + list(APPEND third_party_deps extern_snappy) - include(external/leveldb) - list(APPEND third_party_deps extern_leveldb) - - if (WITH_ARM_BRPC) - include(external/arm_brpc) - list(APPEND third_party_deps extern_arm_brpc) - else() - include(external/brpc) - list(APPEND third_party_deps extern_brpc) - endif() + include(external/leveldb) + list(APPEND third_party_deps extern_leveldb) + + if(WITH_ARM_BRPC) + include(external/arm_brpc) + list(APPEND third_party_deps extern_arm_brpc) + else() + include(external/brpc) + list(APPEND third_party_deps extern_brpc) + endif() - include(external/libmct) # download, build, install libmct - list(APPEND third_party_deps extern_libmct) + include(external/libmct) # download, build, install libmct + list(APPEND third_party_deps extern_libmct) - include(external/rocksdb) # download, build, install rocksdb - list(APPEND third_party_deps extern_rocksdb) + include(external/rocksdb) # download, build, install rocksdb + list(APPEND third_party_deps extern_rocksdb) endif() if(WITH_XBYAK) - include(external/xbyak) # download, build, install xbyak - list(APPEND third_party_deps extern_xbyak) + include(external/xbyak) # download, build, install xbyak + list(APPEND third_party_deps extern_xbyak) endif() if(WITH_LIBXSMM) - include(external/libxsmm) # download, build, install libxsmm - list(APPEND third_party_deps extern_libxsmm) + include(external/libxsmm) # download, build, install libxsmm + list(APPEND third_party_deps extern_libxsmm) endif() if(WITH_DGC) - message(STATUS "add dgc lib.") - include(external/dgc) # download, build, install dgc - add_definitions(-DPADDLE_WITH_DGC) - list(APPEND third_party_deps extern_dgc) + message(STATUS "add dgc lib.") + include(external/dgc) # download, build, install dgc + add_definitions(-DPADDLE_WITH_DGC) + list(APPEND third_party_deps extern_dgc) endif() -if (WITH_LITE) - message(STATUS "Compile Paddle with Lite Engine.") - include(external/lite) -endif (WITH_LITE) - -if (WITH_CINN) - message(STATUS "Compile Paddle with CINN.") - include(external/cinn) - add_definitions(-DPADDLE_WITH_CINN) - if (WITH_GPU) - add_definitions(-DCINN_WITH_CUDA) - add_definitions(-DCINN_WITH_CUDNN) - endif (WITH_GPU) - if (WITH_MKL) - add_definitions(-DCINN_WITH_MKL_CBLAS) - add_definitions(-DCINN_WITH_MKLDNN) - endif (WITH_MKL) -endif (WITH_CINN) - -if (WITH_CRYPTO) - include(external/cryptopp) # download, build, install cryptopp - list(APPEND third_party_deps extern_cryptopp) - add_definitions(-DPADDLE_WITH_CRYPTO) -endif (WITH_CRYPTO) - -if (WITH_POCKETFFT) - include(external/pocketfft) - list(APPEND third_party_deps extern_pocketfft) - add_definitions(-DPADDLE_WITH_POCKETFFT) -endif (WITH_POCKETFFT) - -if (WIN32) - include(external/dirent) - list(APPEND third_party_deps extern_dirent) -endif (WIN32) - -if (WITH_INFRT) - include(external/llvm) - list(APPEND third_party_deps ${llvm_libs}) +if(WITH_LITE) + message(STATUS "Compile Paddle with Lite Engine.") + include(external/lite) +endif(WITH_LITE) + +if(WITH_CINN) + message(STATUS "Compile Paddle with CINN.") + include(external/cinn) + add_definitions(-DPADDLE_WITH_CINN) + if(WITH_GPU) + add_definitions(-DCINN_WITH_CUDA) + add_definitions(-DCINN_WITH_CUDNN) + endif(WITH_GPU) + if(WITH_MKL) + add_definitions(-DCINN_WITH_MKL_CBLAS) + add_definitions(-DCINN_WITH_MKLDNN) + endif(WITH_MKL) +endif(WITH_CINN) + +if(WITH_CRYPTO) + include(external/cryptopp) # download, build, install cryptopp + list(APPEND third_party_deps extern_cryptopp) + add_definitions(-DPADDLE_WITH_CRYPTO) +endif(WITH_CRYPTO) + +if(WITH_POCKETFFT) + include(external/pocketfft) + list(APPEND third_party_deps extern_pocketfft) + add_definitions(-DPADDLE_WITH_POCKETFFT) +endif(WITH_POCKETFFT) + +if(WIN32) + include(external/dirent) + list(APPEND third_party_deps extern_dirent) +endif(WIN32) + +if(WITH_INFRT) + include(external/llvm) + list(APPEND third_party_deps ${llvm_libs}) endif() -if (WITH_IPU) - include(external/poplar) - list(APPEND third_party_deps extern_poplar) +if(WITH_IPU) + include(external/poplar) + list(APPEND third_party_deps extern_poplar) endif() add_custom_target(third_party ALL DEPENDS ${third_party_deps}) diff --git a/cmake/thrust.cmake b/cmake/thrust.cmake index ff415b1e3c4bf..73c2c29847a34 100644 --- a/cmake/thrust.cmake +++ b/cmake/thrust.cmake @@ -1,6 +1,8 @@ function(add_thrust_patches_if_necessary) set(thrust_detect_file ${PROJECT_BINARY_DIR}/detect_thrust.cu) - file(WRITE ${thrust_detect_file} "" + file( + WRITE ${thrust_detect_file} + "" "#include \"thrust/version.h\"\n" "#include \"thrust/shuffle.h\"\n" "#include \"stdio.h\"\n" @@ -10,10 +12,11 @@ function(add_thrust_patches_if_necessary) " return 0;\n" "}\n") - execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" - "--run" "${thrust_detect_file}" - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE nvcc_res ERROR_QUIET) + execute_process( + COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${thrust_detect_file}" + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE nvcc_res + ERROR_QUIET) if(NOT nvcc_res EQUAL 0) set(thrust_patches "${PADDLE_SOURCE_DIR}/patches/thrust") message(STATUS "Add thrust patches: ${thrust_patches}") diff --git a/cmake/unity_build.cmake b/cmake/unity_build.cmake index b7e5564b3a618..e18b2ef1ee686 100644 --- a/cmake/unity_build.cmake +++ b/cmake/unity_build.cmake @@ -1,12 +1,14 @@ # Add the following code before all include to avoid compilation failure. -set(UNITY_CC_BEFORE_CODE [[ +set(UNITY_CC_BEFORE_CODE + [[ #ifndef NOMINMAX #define NOMINMAX #endif #ifndef _USE_MATH_DEFINES #define _USE_MATH_DEFINES #endif]]) -set(UNITY_CU_BEFORE_CODE [[ +set(UNITY_CU_BEFORE_CODE + [[ #ifndef __CUDACC_VER_MAJOR__ #define __CUDACC_VER_MAJOR__ CUDA_COMPILER_MAJOR_VERSION #endif @@ -14,15 +16,13 @@ set(UNITY_CU_BEFORE_CODE [[ #define __CUDACC_VER_MINOR__ CUDA_COMPILER_MINOR_VERSION #endif]]) if(WITH_GPU) - string(REPLACE "." ";" CUDA_COMPILER_VERSION ${CMAKE_CUDA_COMPILER_VERSION}) - list(GET CUDA_COMPILER_VERSION 0 CUDA_COMPILER_MAJOR_VERSION) - list(GET CUDA_COMPILER_VERSION 1 CUDA_COMPILER_MINOR_VERSION) - string(REPLACE - "CUDA_COMPILER_MAJOR_VERSION" ${CUDA_COMPILER_MAJOR_VERSION} - UNITY_CU_BEFORE_CODE ${UNITY_CU_BEFORE_CODE}) - string(REPLACE - "CUDA_COMPILER_MINOR_VERSION" ${CUDA_COMPILER_MINOR_VERSION} - UNITY_CU_BEFORE_CODE ${UNITY_CU_BEFORE_CODE}) + string(REPLACE "." ";" CUDA_COMPILER_VERSION ${CMAKE_CUDA_COMPILER_VERSION}) + list(GET CUDA_COMPILER_VERSION 0 CUDA_COMPILER_MAJOR_VERSION) + list(GET CUDA_COMPILER_VERSION 1 CUDA_COMPILER_MINOR_VERSION) + string(REPLACE "CUDA_COMPILER_MAJOR_VERSION" ${CUDA_COMPILER_MAJOR_VERSION} + UNITY_CU_BEFORE_CODE ${UNITY_CU_BEFORE_CODE}) + string(REPLACE "CUDA_COMPILER_MINOR_VERSION" ${CUDA_COMPILER_MINOR_VERSION} + UNITY_CU_BEFORE_CODE ${UNITY_CU_BEFORE_CODE}) endif() # Group a list of source files that can be included together. @@ -30,37 +30,43 @@ endif() # do not have to exist. # Here you need to specify the source type which belongs to cc or cu. function(register_unity_group TYPE) - # Get UNITY_TARGET from CMAKE_CURRENT_SOURCE_DIR. - string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET ${CMAKE_CURRENT_SOURCE_DIR}) - string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET}) - set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity") + # Get UNITY_TARGET from CMAKE_CURRENT_SOURCE_DIR. + string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET + ${CMAKE_CURRENT_SOURCE_DIR}) + string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET}) + set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity") - # Variable unity_group_index is used to record the number of UNITY_TARGET groups. - get_property(unity_group_index GLOBAL PROPERTY ${UNITY_TARGET}_${TYPE}_group_index) - if("${unity_group_index}" STREQUAL "") - set(unity_group_index 0) - endif() + # Variable unity_group_index is used to record the number of UNITY_TARGET groups. + get_property(unity_group_index GLOBAL + PROPERTY ${UNITY_TARGET}_${TYPE}_group_index) + if("${unity_group_index}" STREQUAL "") + set(unity_group_index 0) + endif() - # Variable unity_group_sources is used to record the sources of one group. - set(unity_group_sources ${UNITY_TARGET}_${TYPE}_group_${unity_group_index}_sources) - set_property(GLOBAL PROPERTY ${unity_group_sources} "") - foreach(src ${ARGN}) - # UB use absolute path of source. - if(NOT IS_ABSOLUTE ${src}) - set(src ${CMAKE_CURRENT_SOURCE_DIR}/${src}) - endif() - set_property(GLOBAL APPEND PROPERTY ${unity_group_sources} ${src}) - endforeach() - - # If unity_file does not exists, nv_library or cc_library will use - # dummy_file. Touch unity_file to avoid to use dummy file. - set(unity_file ${CMAKE_CURRENT_BINARY_DIR}/${UNITY_TARGET}_${unity_group_index}_${TYPE}.${TYPE}) - if(NOT EXISTS ${unity_file}) - file(TOUCH ${unity_file}) + # Variable unity_group_sources is used to record the sources of one group. + set(unity_group_sources + ${UNITY_TARGET}_${TYPE}_group_${unity_group_index}_sources) + set_property(GLOBAL PROPERTY ${unity_group_sources} "") + foreach(src ${ARGN}) + # UB use absolute path of source. + if(NOT IS_ABSOLUTE ${src}) + set(src ${CMAKE_CURRENT_SOURCE_DIR}/${src}) endif() + set_property(GLOBAL APPEND PROPERTY ${unity_group_sources} ${src}) + endforeach() + + # If unity_file does not exists, nv_library or cc_library will use + # dummy_file. Touch unity_file to avoid to use dummy file. + set(unity_file + ${CMAKE_CURRENT_BINARY_DIR}/${UNITY_TARGET}_${unity_group_index}_${TYPE}.${TYPE} + ) + if(NOT EXISTS ${unity_file}) + file(TOUCH ${unity_file}) + endif() - math(EXPR unity_group_index "${unity_group_index} + 1") - set_property(GLOBAL PROPERTY ${UNITY_TARGET}_${TYPE}_group_index ${unity_group_index}) + math(EXPR unity_group_index "${unity_group_index} + 1") + set_property(GLOBAL PROPERTY ${UNITY_TARGET}_${TYPE}_group_index + ${unity_group_index}) endfunction(register_unity_group) # Combine the original source files used by `TARGET`, then use @@ -72,81 +78,105 @@ endfunction(register_unity_group) # directory on Windows. # Here you need to specify the source type which belongs to cc or cu. function(compose_unity_target_sources TARGET TYPE) - # Variable unity_target_sources represents the source file used in TARGET - set(unity_target_sources "") - get_property(unity_group_index_max GLOBAL PROPERTY ${TARGET}_${TYPE}_group_index) - foreach(src ${ARGN}) - set(unity_file "") - # Note(zhouwei25): UB use the path releative to CMAKE_SOURCE_DIR. - # If use absolute path, sccache/ccache hit rate will be reduced. - if(IS_ABSOLUTE ${src}) - set(src_absolute_path ${src}) - file(RELATIVE_PATH src_relative_path ${CMAKE_SOURCE_DIR} ${src}) - else() - set(src_absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${src}) - file(RELATIVE_PATH src_relative_path ${CMAKE_SOURCE_DIR} ${src_absolute_path}) - endif() - # If `unity_group_index_max` is empty, there is no combination - # relationship. - # TODO(Avin0323): Whether use target property `UNITY_BUILD` of CMAKE to - # combine source files. - if(NOT "${unity_group_index_max}" STREQUAL "") - # Search in each registed group. - foreach(unity_group_index RANGE ${unity_group_index_max}) - if(${unity_group_index} GREATER_EQUAL ${unity_group_index_max}) - break() - endif() - get_property(unity_group_sources GLOBAL PROPERTY ${TARGET}_${TYPE}_group_${unity_group_index}_sources) - if(${src_absolute_path} IN_LIST unity_group_sources) - set(unity_file ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}_${unity_group_index}_${TYPE}.${TYPE}) - set(unity_file_sources ${TARGET}_${TYPE}_file_${unity_group_index}_sources) - get_property(set_unity_file_sources GLOBAL PROPERTY ${unity_file_sources} SET) - if(NOT ${set_unity_file_sources}) - # Add macro before include source files. - set_property(GLOBAL PROPERTY ${unity_file_sources} "// Generate by Unity Build") - set_property(GLOBAL APPEND PROPERTY ${unity_file_sources} ${UNITY_CC_BEFORE_CODE}) - if(WITH_GPU AND "${TYPE}" STREQUAL "cu") - set_property(GLOBAL APPEND PROPERTY ${unity_file_sources} ${UNITY_CU_BEFORE_CODE}) - endif() - endif() - set_property(GLOBAL APPEND PROPERTY ${unity_file_sources} "#include \"${src_relative_path}\"") - set(unity_target_sources ${unity_target_sources} ${unity_file}) - break() - endif() - endforeach() + # Variable unity_target_sources represents the source file used in TARGET + set(unity_target_sources "") + get_property(unity_group_index_max GLOBAL + PROPERTY ${TARGET}_${TYPE}_group_index) + foreach(src ${ARGN}) + set(unity_file "") + # Note(zhouwei25): UB use the path releative to CMAKE_SOURCE_DIR. + # If use absolute path, sccache/ccache hit rate will be reduced. + if(IS_ABSOLUTE ${src}) + set(src_absolute_path ${src}) + file(RELATIVE_PATH src_relative_path ${CMAKE_SOURCE_DIR} ${src}) + else() + set(src_absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${src}) + file(RELATIVE_PATH src_relative_path ${CMAKE_SOURCE_DIR} + ${src_absolute_path}) + endif() + # If `unity_group_index_max` is empty, there is no combination + # relationship. + # TODO(Avin0323): Whether use target property `UNITY_BUILD` of CMAKE to + # combine source files. + if(NOT "${unity_group_index_max}" STREQUAL "") + # Search in each registed group. + foreach(unity_group_index RANGE ${unity_group_index_max}) + if(${unity_group_index} GREATER_EQUAL ${unity_group_index_max}) + break() endif() - # Use original source file. - if("${unity_file}" STREQUAL "") - set(unity_target_sources ${unity_target_sources} ${src}) + get_property( + unity_group_sources GLOBAL + PROPERTY ${TARGET}_${TYPE}_group_${unity_group_index}_sources) + if(${src_absolute_path} IN_LIST unity_group_sources) + set(unity_file + ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}_${unity_group_index}_${TYPE}.${TYPE} + ) + set(unity_file_sources + ${TARGET}_${TYPE}_file_${unity_group_index}_sources) + get_property( + set_unity_file_sources GLOBAL + PROPERTY ${unity_file_sources} + SET) + if(NOT ${set_unity_file_sources}) + # Add macro before include source files. + set_property(GLOBAL PROPERTY ${unity_file_sources} + "// Generate by Unity Build") + set_property(GLOBAL APPEND PROPERTY ${unity_file_sources} + ${UNITY_CC_BEFORE_CODE}) + if(WITH_GPU AND "${TYPE}" STREQUAL "cu") + set_property(GLOBAL APPEND PROPERTY ${unity_file_sources} + ${UNITY_CU_BEFORE_CODE}) + endif() + endif() + set_property( + GLOBAL APPEND PROPERTY ${unity_file_sources} + "#include \"${src_relative_path}\"") + set(unity_target_sources ${unity_target_sources} ${unity_file}) + break() endif() - endforeach() + endforeach() + endif() + # Use original source file. + if("${unity_file}" STREQUAL "") + set(unity_target_sources ${unity_target_sources} ${src}) + endif() + endforeach() - set(unity_target_${TYPE}_sources ${unity_target_sources} PARENT_SCOPE) + set(unity_target_${TYPE}_sources + ${unity_target_sources} + PARENT_SCOPE) endfunction(compose_unity_target_sources) # Write the unity files used by `UNITY_TARGET`. # Write dependent on whether the contents of the unity file have changed, which # protects incremental compilation speed. function(finish_unity_target TYPE) - # Get UNITY_TARGET from CMAKE_CURRENT_SOURCE_DIR. - string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET ${CMAKE_CURRENT_SOURCE_DIR}) - string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET}) - set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity") + # Get UNITY_TARGET from CMAKE_CURRENT_SOURCE_DIR. + string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET + ${CMAKE_CURRENT_SOURCE_DIR}) + string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET}) + set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity") - get_property(unity_group_index_max GLOBAL PROPERTY ${UNITY_TARGET}_${TYPE}_group_index) - if(NOT "${unity_group_index_max}" STREQUAL "") - foreach(unity_group_index RANGE ${unity_group_index_max}) - if(${unity_group_index} GREATER_EQUAL ${unity_group_index_max}) - break() - endif() - get_property(unity_file_sources GLOBAL PROPERTY ${UNITY_TARGET}_${TYPE}_file_${unity_group_index}_sources) - set(unity_file_read_content "") - string(JOIN "\n" unity_file_write_content ${unity_file_sources}) - set(unity_file ${CMAKE_CURRENT_BINARY_DIR}/${UNITY_TARGET}_${unity_group_index}_${TYPE}.${TYPE}) - file(READ ${unity_file} unity_file_read_content) - if(NOT "${unity_file_read_content}" STREQUAL "${unity_file_write_content}") - file(WRITE ${unity_file} ${unity_file_write_content}) - endif() - endforeach() - endif() + get_property(unity_group_index_max GLOBAL + PROPERTY ${UNITY_TARGET}_${TYPE}_group_index) + if(NOT "${unity_group_index_max}" STREQUAL "") + foreach(unity_group_index RANGE ${unity_group_index_max}) + if(${unity_group_index} GREATER_EQUAL ${unity_group_index_max}) + break() + endif() + get_property( + unity_file_sources GLOBAL + PROPERTY ${UNITY_TARGET}_${TYPE}_file_${unity_group_index}_sources) + set(unity_file_read_content "") + string(JOIN "\n" unity_file_write_content ${unity_file_sources}) + set(unity_file + ${CMAKE_CURRENT_BINARY_DIR}/${UNITY_TARGET}_${unity_group_index}_${TYPE}.${TYPE} + ) + file(READ ${unity_file} unity_file_read_content) + if(NOT "${unity_file_read_content}" STREQUAL + "${unity_file_write_content}") + file(WRITE ${unity_file} ${unity_file_write_content}) + endif() + endforeach() + endif() endfunction(finish_unity_target) diff --git a/cmake/util.cmake b/cmake/util.cmake index 02667dbce69ed..8e52831ebe972 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -6,50 +6,47 @@ # First Argument: target name want to be linked with libraries # Rest Arguments: libraries which link together. function(target_circle_link_libraries TARGET_NAME) - if(APPLE) - set(LIBS) - set(inArchive OFF) - set(libsInArgn) + if(APPLE) + set(LIBS) + set(inArchive OFF) + set(libsInArgn) - foreach(arg ${ARGN}) - if(${arg} STREQUAL "ARCHIVE_START") - set(inArchive ON) - elseif(${arg} STREQUAL "ARCHIVE_END") - set(inArchive OFF) - else() - if(inArchive) - list(APPEND LIBS "-Wl,-force_load") - endif() - list(APPEND LIBS ${arg}) - list(APPEND libsInArgn ${arg}) - endif() - endforeach() - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") - if(NOT IOS_ENABLE_BITCODE) - list(APPEND LIBS "-undefined dynamic_lookup") - endif() + foreach(arg ${ARGN}) + if(${arg} STREQUAL "ARCHIVE_START") + set(inArchive ON) + elseif(${arg} STREQUAL "ARCHIVE_END") + set(inArchive OFF) + else() + if(inArchive) + list(APPEND LIBS "-Wl,-force_load") endif() - list(REVERSE libsInArgn) - target_link_libraries(${TARGET_NAME} - ${LIBS} - ${libsInArgn}) + list(APPEND LIBS ${arg}) + list(APPEND libsInArgn ${arg}) + endif() + endforeach() + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" + STREQUAL "AppleClang") + if(NOT IOS_ENABLE_BITCODE) + list(APPEND LIBS "-undefined dynamic_lookup") + endif() + endif() + list(REVERSE libsInArgn) + target_link_libraries(${TARGET_NAME} ${LIBS} ${libsInArgn}) - else() # LINUX - set(LIBS) + else() # LINUX + set(LIBS) - foreach(arg ${ARGN}) - if(${arg} STREQUAL "ARCHIVE_START") - list(APPEND LIBS "-Wl,--whole-archive") - elseif(${arg} STREQUAL "ARCHIVE_END") - list(APPEND LIBS "-Wl,--no-whole-archive") - else() - list(APPEND LIBS ${arg}) - endif() - endforeach() + foreach(arg ${ARGN}) + if(${arg} STREQUAL "ARCHIVE_START") + list(APPEND LIBS "-Wl,--whole-archive") + elseif(${arg} STREQUAL "ARCHIVE_END") + list(APPEND LIBS "-Wl,--no-whole-archive") + else() + list(APPEND LIBS ${arg}) + endif() + endforeach() - target_link_libraries(${TARGET_NAME} - "-Wl,--start-group" - ${LIBS} - "-Wl,--end-group") - endif() + target_link_libraries(${TARGET_NAME} "-Wl,--start-group" ${LIBS} + "-Wl,--end-group") + endif() endfunction() diff --git a/cmake/version.cmake b/cmake/version.cmake index 57ca750df6cb9..83bd3f1b1bc4a 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -3,7 +3,7 @@ set(PADDLE_VERSION $ENV{PADDLE_VERSION}) set(tmp_version "HEAD") set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?") set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+") -while ("${PADDLE_VERSION}" STREQUAL "") +while("${PADDLE_VERSION}" STREQUAL "") # Check current branch name execute_process( COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version} @@ -11,23 +11,24 @@ while ("${PADDLE_VERSION}" STREQUAL "") OUTPUT_VARIABLE GIT_BRANCH_NAME RESULT_VARIABLE GIT_BRANCH_RESULT ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if (NOT ${GIT_BRANCH_RESULT}) + if(NOT ${GIT_BRANCH_RESULT}) execute_process( - COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version} + COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always + ${tmp_version} WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} OUTPUT_VARIABLE GIT_TAG_NAME RESULT_VARIABLE GIT_RESULT ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if (NOT ${GIT_RESULT}) + if(NOT ${GIT_RESULT}) # Check if current branch is release branch - if (${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}") + if(${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}") # Check the tag is a correct version - if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}") + if(${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}") # if no tag was found, set PADDLE_VERSION to 0.0.0 to represent latest set(PADDLE_VERSION "0.0.0") - elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}") + elseif(${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}") string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME}) - else() # otherwise, get the previous git tag name. + else() # otherwise, get the previous git tag name. set(tmp_version "${GIT_TAG_NAME}~1") endif() else() @@ -37,9 +38,9 @@ while ("${PADDLE_VERSION}" STREQUAL "") OUTPUT_VARIABLE GIT_EXACT_TAG_NAME RESULT_VARIABLE GIT_EXACT_TAG_RESULT ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if (NOT ${GIT_EXACT_TAG_NAME}) + if(NOT ${GIT_EXACT_TAG_NAME}) # Check if current branch is tag branch - if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}") + if(${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}") string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME}) else() set(PADDLE_VERSION "0.0.0") diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake index adf3d74c26220..6692f24dd6ae9 100644 --- a/cmake/xpu_kp.cmake +++ b/cmake/xpu_kp.cmake @@ -13,11 +13,11 @@ # limitations under the License. if(NOT WITH_XPU_KP) - return() + return() endif() -set(LINK_FLAGS "-Wl,--allow-multiple-definition") -set(CMAKE_EXE_LINKER_FLAGS "${LINK_FLAGS}") +set(LINK_FLAGS "-Wl,--allow-multiple-definition") +set(CMAKE_EXE_LINKER_FLAGS "${LINK_FLAGS}") set(CMAKE_SHARED_LINKER_FLAGS "${LINK_FLAGS}") if(NOT XPU_TOOLCHAIN) @@ -31,7 +31,7 @@ message(STATUS "Build with XPU_TOOLCHAIN=" ${XPU_TOOLCHAIN}) set(XPU_CLANG ${XPU_TOOLCHAIN}/bin/clang++) message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG}) -# The host sysroot of XPU compiler is gcc-8.2 +# The host sysroot of XPU compiler is gcc-8.2 if(NOT HOST_SYSROOT) set(HOST_SYSROOT /opt/compiler/gcc-8.2) endif() @@ -45,19 +45,19 @@ if(NOT API_ARCH) endif() if(API_ARCH MATCHES "x86_64") -if(EXISTS ${HOST_SYSROOT}/bin/g++) - set(HOST_CXX ${HOST_SYSROOT}/bin/g++) - set(HOST_AR ${HOST_SYSROOT}/bin/ar) -else() - set(HOST_CXX /usr/bin/g++) - set(HOST_AR /usr/bin/ar) -endif() + if(EXISTS ${HOST_SYSROOT}/bin/g++) + set(HOST_CXX ${HOST_SYSROOT}/bin/g++) + set(HOST_AR ${HOST_SYSROOT}/bin/ar) + else() + set(HOST_CXX /usr/bin/g++) + set(HOST_AR /usr/bin/ar) + endif() else() set(HOST_CXX ${CMAKE_CXX_COMPILER}) set(HOST_AR ${CMAKE_AR}) endif() -set(TOOLCHAIN_ARGS ) +set(TOOLCHAIN_ARGS) if(OPT_LEVEL) set(OPT_LEVEL ${OPT_LEVEL}) @@ -74,8 +74,16 @@ message(STATUS "Build with HOST_AR=" ${HOST_AR}) macro(compile_kernel COMPILE_ARGS) set(options "") set(oneValueArgs "") - set(multiValueArgs KERNEL DIRPATH XNAME DEVICE HOST XPU DEPENDS) - cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(multiValueArgs + KERNEL + DIRPATH + XNAME + DEVICE + HOST + XPU + DEPENDS) + cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) set(kernel_path ${xpu_add_library_DIRPATH}) set(kernel_name ${xpu_add_library_XNAME}) set(device_o_extra_flags ${xpu_add_library_DEVICE}) @@ -84,16 +92,12 @@ macro(compile_kernel COMPILE_ARGS) set(cc_depends ${xpu_add_library_DEPENDS}) set(kernel_target ${kernel_name}_kernel) - add_custom_target(${kernel_target} - WORKING_DIRECTORY - ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS - kernel_build/${kernel_name}.host.o - kernel_build/${kernel_name}.bin.o - COMMENT - ${kernel_target} - VERBATIM - ) + add_custom_target( + ${kernel_target} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.bin.o + COMMENT ${kernel_target} + VERBATIM) if(cc_depends) add_dependencies(${kernel_target} ${xpu_add_library_DEPENDS}) @@ -106,24 +110,56 @@ macro(compile_kernel COMPILE_ARGS) set(XTDK_DIR ${XPU_TOOLCHAIN}) set(CXX_DIR ${HOST_SYSROOT}) - set(XPU_CXX_FLAGS -fforce-enable-int128 -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG ) + set(XPU_CXX_FLAGS + -fforce-enable-int128 + -Wno-error=pessimizing-move + -Wno-error=constant-conversion + -Wno-error=c++11-narrowing + -Wno-error=shift-count-overflow + -Wno-error=unused-local-typedef + -Wno-error=deprecated-declarations + -Wno-deprecated-declarations + -std=c++14 + -m64 + -fPIC + -fno-omit-frame-pointer + -Wall + -Wno-inconsistent-missing-override + -Wextra + -Wnon-virtual-dtor + -Wdelete-non-virtual-dtor + -Wno-unused-parameter + -Wno-unused-function + -Wno-error=unused-local-typedefs + -Wno-error=ignored-attributes + -Wno-error=int-in-bool-context + -Wno-error=parentheses + -Wno-error=address + -Wno-ignored-qualifiers + -Wno-ignored-attributes + -Wno-parentheses + -DNDEBUG) #include path - get_property(dirs DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) + get_property( + dirs + DIRECTORY ${CMAKE_SOURCE_DIR} + PROPERTY INCLUDE_DIRECTORIES) set(XPU_CXX_INCLUDES "") foreach(dir IN LISTS dirs) list(APPEND XPU_CXX_INCLUDES "-I${dir}") endforeach() - string(REPLACE ";" " " XPU_CXX_INCLUDES "${XPU_CXX_INCLUDES}" ) + string(REPLACE ";" " " XPU_CXX_INCLUDES "${XPU_CXX_INCLUDES}") separate_arguments(XPU_CXX_INCLUDES UNIX_COMMAND "${XPU_CXX_INCLUDES}") #related flags - get_directory_property( DirDefs DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS ) + get_directory_property(DirDefs DIRECTORY ${CMAKE_SOURCE_DIR} + COMPILE_DEFINITIONS) set(XPU_CXX_DEFINES "") foreach(def IN LISTS DirDefs) list(APPEND XPU_CXX_DEFINES "-D${def}") endforeach() - string(REPLACE ";" " " XPU_CXX_DEFINES "${XPU_CXX_DEFINES}" ) + string(REPLACE ";" " " XPU_CXX_DEFINES "${XPU_CXX_DEFINES}") separate_arguments(XPU_CXX_DEFINES UNIX_COMMAND "${XPU_CXX_DEFINES}") set(ABI_VERSION "") @@ -133,121 +169,119 @@ macro(compile_kernel COMPILE_ARGS) set(ABI_VERSION "-D_GLIBCXX_USE_CXX11_ABI=1") endif() add_custom_command( - OUTPUT - kernel_build/${kernel_name}.bin.o - COMMAND - ${CMAKE_COMMAND} -E make_directory kernel_build - COMMAND - ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu + OUTPUT kernel_build/${kernel_name}.bin.o + COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build + COMMAND ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps + kernel_build/${kernel_name}.xpu COMMAND - ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 ${ABI_VERSION} ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} - -I. -o kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.xpu - --xpu-device-only -c -v - COMMAND - ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR} - WORKING_DIRECTORY - ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS - ${xpu_add_library_DEPENDS} - COMMENT - kernel_build/${kernel_name}.bin.o - VERBATIM - ) - list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.bin.o) + ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 ${ABI_VERSION} ${OPT_LEVEL} + -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} + ${XPU_CXX_INCLUDES} -I. -o kernel_build/${kernel_name}.bin.o.sec + kernel_build/${kernel_name}.xpu --xpu-device-only -c -v + COMMAND ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec + kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${xpu_add_library_DEPENDS} + COMMENT kernel_build/${kernel_name}.bin.o + VERBATIM) + list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.bin.o) add_custom_command( - OUTPUT - kernel_build/${kernel_name}.host.o - COMMAND - ${CMAKE_COMMAND} -E make_directory kernel_build - COMMAND - ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu + OUTPUT kernel_build/${kernel_name}.host.o + COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build + COMMAND ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps + kernel_build/${kernel_name}.xpu COMMAND - ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 ${ABI_VERSION} ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} - -I. -o kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.xpu - --xpu-host-only -c -v - WORKING_DIRECTORY - ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS - ${xpu_add_library_DEPENDS} - COMMENT - kernel_build/${kernel_name}.host.o - VERBATIM - ) - list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.host.o) + ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 ${ABI_VERSION} ${OPT_LEVEL} + -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} + ${XPU_CXX_INCLUDES} -I. -o kernel_build/${kernel_name}.host.o + kernel_build/${kernel_name}.xpu --xpu-host-only -c -v + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${xpu_add_library_DEPENDS} + COMMENT kernel_build/${kernel_name}.host.o + VERBATIM) + list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.host.o) endmacro() ############################################################################### # XPU_ADD_LIBRARY ############################################################################### macro(xpu_add_library TARGET_NAME) - # Separate the sources from the options - set(options "") - set(oneValueArgs "") - set(multiValueArgs STATIC DEPENDS) - cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(xpu_srcs ${xpu_add_library_STATIC}) - set(xpu_target ${TARGET_NAME}) - set(cc_srcs_depends ${xpu_add_library_DEPENDS}) - - file(GLOB_RECURSE xpu_srcs_lists ${xpu_srcs}) - list(LENGTH xpu_srcs_lists xpu_srcs_lists_num) - - set(XPU1_DEVICE_O_EXTRA_FLAGS " ") - set(XPU1_HOST_O_EXTRA_FLAGS " ") - - # Distinguish .xpu file from other files - foreach(cur_xpu_src IN LISTS xpu_srcs_lists) - get_filename_component(language_type_name ${cur_xpu_src} EXT) - if(${language_type_name} STREQUAL ".kps") - list(APPEND xpu_kernel_lists ${cur_xpu_src}) - else() - list(APPEND cc_kernel_lists ${cur_xpu_src}) - endif() - endforeach() + # Separate the sources from the options + set(options "") + set(oneValueArgs "") + set(multiValueArgs STATIC DEPENDS) + cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + set(xpu_srcs ${xpu_add_library_STATIC}) + set(xpu_target ${TARGET_NAME}) + set(cc_srcs_depends ${xpu_add_library_DEPENDS}) - # Ensure that there is only one xpu kernel - list(LENGTH xpu_kernel_lists xpu_kernel_lists_num) - list(LENGTH cc_srcs_depends cc_srcs_depends_num) - - if(${xpu_kernel_lists_num}) - foreach(xpu_kernel IN LISTS xpu_kernel_lists) - get_filename_component(kernel_name ${xpu_kernel} NAME_WE) - get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY) - set(kernel_rules ${kernel_dir}/${kernel_name}.rules) - set(kernel_name ${kernel_name}) - compile_kernel( KERNEL ${xpu_kernel} DIRPATH ${kernel_dir} XNAME ${kernel_name} DEVICE ${XPU1_DEVICE_O_EXTRA_FLAGS} HOST ${XPU1_HOST_O_EXTRA_FLAGS} XPU "xpu2" DEPENDS ${cc_srcs_depends}) - endforeach() - - add_custom_target(${xpu_target}_src ALL - WORKING_DIRECTORY - ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS - ${xpu_kernel_depends} - ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a - COMMENT - ${xpu_target}_src - VERBATIM - ) - - add_custom_command( - OUTPUT - ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a - COMMAND - ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a ${xpu_kernel_depends} - WORKING_DIRECTORY - ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS - ${xpu_kernel_depends} - COMMENT - ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a - VERBATIM - ) - - add_library(${xpu_target} STATIC ${cc_kernel_lists}) - add_dependencies(${xpu_target} ${xpu_target}_src) - target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a) + file(GLOB_RECURSE xpu_srcs_lists ${xpu_srcs}) + list(LENGTH xpu_srcs_lists xpu_srcs_lists_num) + + set(XPU1_DEVICE_O_EXTRA_FLAGS " ") + set(XPU1_HOST_O_EXTRA_FLAGS " ") + + # Distinguish .xpu file from other files + foreach(cur_xpu_src IN LISTS xpu_srcs_lists) + get_filename_component(language_type_name ${cur_xpu_src} EXT) + if(${language_type_name} STREQUAL ".kps") + list(APPEND xpu_kernel_lists ${cur_xpu_src}) else() - add_library(${xpu_target} STATIC ${cc_kernel_lists}) + list(APPEND cc_kernel_lists ${cur_xpu_src}) endif() + endforeach() + + # Ensure that there is only one xpu kernel + list(LENGTH xpu_kernel_lists xpu_kernel_lists_num) + list(LENGTH cc_srcs_depends cc_srcs_depends_num) + + if(${xpu_kernel_lists_num}) + foreach(xpu_kernel IN LISTS xpu_kernel_lists) + get_filename_component(kernel_name ${xpu_kernel} NAME_WE) + get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY) + set(kernel_rules ${kernel_dir}/${kernel_name}.rules) + set(kernel_name ${kernel_name}) + compile_kernel( + KERNEL + ${xpu_kernel} + DIRPATH + ${kernel_dir} + XNAME + ${kernel_name} + DEVICE + ${XPU1_DEVICE_O_EXTRA_FLAGS} + HOST + ${XPU1_HOST_O_EXTRA_FLAGS} + XPU + "xpu2" + DEPENDS + ${cc_srcs_depends}) + endforeach() + + add_custom_target( + ${xpu_target}_src ALL + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${xpu_kernel_depends} + ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a + COMMENT ${xpu_target}_src + VERBATIM) + + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a + COMMAND ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a + ${xpu_kernel_depends} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${xpu_kernel_depends} + COMMENT ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a + VERBATIM) + + add_library(${xpu_target} STATIC ${cc_kernel_lists}) + add_dependencies(${xpu_target} ${xpu_target}_src) + target_link_libraries(${TARGET_NAME} + ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a) + else() + add_library(${xpu_target} STATIC ${cc_kernel_lists}) + endif() endmacro() diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 9d801c9e224a9..07041455df4fd 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -1,7 +1,9 @@ add_subdirectory(utils) add_subdirectory(scripts) add_subdirectory(testing) -set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory") +set(PYTHON_TESTS_DIR + ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests + CACHE INTERNAL "python tests directory") add_subdirectory(phi) add_subdirectory(infrt) add_subdirectory(fluid) diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index a92932b4d3247..304a764f5b87c 100755 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -2,35 +2,49 @@ add_subdirectory(collective) add_subdirectory(store) if(WITH_PYTHON) py_proto_compile(ps_py_proto SRCS the_one_ps.proto) - add_custom_target(ps_py_proto_init ALL - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto) + add_custom_target( + ps_py_proto_init ALL + COMMAND ${CMAKE_COMMAND} -E make_directory + ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto) add_dependencies(ps_py_proto ps_py_proto_init) - if (NOT WIN32) - add_custom_command(TARGET ps_py_proto POST_BUILD - COMMAND mv the_one_ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/) + if(NOT WIN32) + add_custom_command( + TARGET ps_py_proto + POST_BUILD + COMMAND mv the_one_ps_pb2.py + ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/) else(NOT WIN32) - string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/") - add_custom_command(TARGET ps_py_proto POST_BUILD + string( + REPLACE "/" "\\" fleet_proto_dstpath + "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/") + add_custom_command( + TARGET ps_py_proto + POST_BUILD COMMAND copy /Y the_one_ps_pb2.py ${fleet_proto_dstpath} - COMMENT "Copy generated python the_one_ps_pb2 into directory ${fleet_proto_dstpath}.") + COMMENT + "Copy generated python the_one_ps_pb2 into directory ${fleet_proto_dstpath}." + ) endif(NOT WIN32) endif() if(NOT WITH_PSCORE) - add_subdirectory(fleet_executor) - return() + add_subdirectory(fleet_executor) + return() endif() proto_library(ps_framework_proto SRCS the_one_ps.proto) -add_custom_command(TARGET ps_framework_proto POST_BUILD - COMMAND mv the_one_ps.pb.h ps.pb.h - COMMAND mv the_one_ps.pb.cc ps.pb.cc) +add_custom_command( + TARGET ps_framework_proto + POST_BUILD + COMMAND mv the_one_ps.pb.h ps.pb.h + COMMAND mv the_one_ps.pb.cc ps.pb.cc) -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-error=unused-value -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") +set(DISTRIBUTE_COMPILE_FLAGS + "-Wno-error=unused-value -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result" +) -if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) - set(DISTRIBUTE_COMPILE_FLAGS - "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") +if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") endif() add_subdirectory(common) diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index f6b1bd47c1e46..0cfc82709637f 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,20 +1,65 @@ -cc_library(processgroup SRCS ProcessGroup.cc DEPS phi_api eager_api) -cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi_api string_helper) +cc_library( + processgroup + SRCS ProcessGroup.cc + DEPS phi_api eager_api) +cc_library( + eager_reducer + SRCS reducer.cc + DEPS eager_api processgroup phi_api string_helper) -if (WITH_DISTRIBUTE) - cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi_api eager_api gloo_wrapper) +if(WITH_DISTRIBUTE) + cc_library( + processgroup_gloo + SRCS ProcessGroupGloo.cc + DEPS phi_api eager_api gloo_wrapper) endif() if(WITH_NCCL) - cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi_api eager_api) - if (WITH_DISTRIBUTE AND WITH_PSCORE) - cc_library(processgroup_heter SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi_api eager_api) + cc_library( + processgroup_nccl + SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc + DEPS place + cuda_stream + enforce + collective_helper + device_context + phi_api + eager_api) + if(WITH_DISTRIBUTE AND WITH_PSCORE) + cc_library( + processgroup_heter + SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc + DEPS place + cuda_stream + enforce + collective_helper + device_context + phi_api + eager_api) endif() endif() if(WITH_ASCEND_CL) - cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc DEPS place npu_stream enforce collective_helper device_context phi_api eager_api) - if (WITH_DISTRIBUTE AND WITH_PSCORE) - cc_library(processgroup_heter SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc DEPS place npu_stream enforce collective_helper device_context phi_api eager_api) + cc_library( + processgroup_hccl + SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc + DEPS place + npu_stream + enforce + collective_helper + device_context + phi_api + eager_api) + if(WITH_DISTRIBUTE AND WITH_PSCORE) + cc_library( + processgroup_heter + SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc + DEPS place + npu_stream + enforce + collective_helper + device_context + phi_api + eager_api) endif() endif() diff --git a/paddle/fluid/distributed/collective/HCCLTools.cc b/paddle/fluid/distributed/collective/HCCLTools.cc index 526a683e057c0..676a71cb30d95 100644 --- a/paddle/fluid/distributed/collective/HCCLTools.cc +++ b/paddle/fluid/distributed/collective/HCCLTools.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/collective/HCCLTools.h" + #include "paddle/fluid/distributed/collective/Types.h" namespace paddle { diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h index a1dcf7cd9b626..4955e24eadbfb 100644 --- a/paddle/fluid/distributed/collective/HCCLTools.h +++ b/paddle/fluid/distributed/collective/HCCLTools.h @@ -15,6 +15,7 @@ #pragma once #include + #include #include "boost/variant.hpp" diff --git a/paddle/fluid/distributed/collective/NCCLTools.cc b/paddle/fluid/distributed/collective/NCCLTools.cc index 7e842ebf92166..2cecaf0734df6 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.cc +++ b/paddle/fluid/distributed/collective/NCCLTools.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/collective/NCCLTools.h" + #include "paddle/fluid/distributed/collective/Types.h" namespace paddle { diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h index 0454518b1836c..f38ce8faa7ffb 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.h +++ b/paddle/fluid/distributed/collective/NCCLTools.h @@ -16,9 +16,11 @@ #include #include + #include #include "boost/variant.hpp" +#include "paddle/fluid/distributed/collective/Types.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/cuda_device_guard.h" @@ -26,8 +28,6 @@ #include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/distributed/collective/Types.h" - namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 52e09792d5d80..7ed6b188fd217 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -21,7 +21,6 @@ #include "paddle/fluid/distributed/collective/Types.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" - #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 824341c3cd97d..1a390e38755fd 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -27,6 +27,7 @@ #include #include #include + #include "paddle/fluid/distributed/collective/Common.h" #include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" @@ -485,8 +486,9 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::createDefaultDevice() { std::array hostname{}; auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal( - "Get hostname error for createDefaultDevice.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::Fatal("Get hostname error for createDefaultDevice.")); ::addrinfo* result; result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC); ::addrinfo* cur; diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc index 9ed6c2198df4c..50249b03967a9 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h" + #include "paddle/fluid/distributed/collective/Common.h" #include "paddle/fluid/distributed/collective/HCCLTools.h" #include "paddle/fluid/memory/malloc.h" @@ -216,15 +217,16 @@ std::shared_ptr ProcessGroupHCCL::AllReduce( std::vector& in_tensors, // NOLINT std::vector& out_tensors, // NOLINT const AllreduceOptions& opts) { - return Collective(in_tensors, out_tensors, - [&](phi::DenseTensor& input, phi::DenseTensor& output, - HcclComm comm, const aclrtStream& stream) { - return platform::dynload::HcclAllReduce( - input.data(), output.data(), input.numel(), - platform::ToHCCLDataType(input.dtype()), - ToHCCLRedType(opts.reduce_op), comm, stream); - }, - CommType::ALLREDUCE); + return Collective( + in_tensors, out_tensors, + [&](phi::DenseTensor& input, phi::DenseTensor& output, HcclComm comm, + const aclrtStream& stream) { + return platform::dynload::HcclAllReduce( + input.data(), output.data(), input.numel(), + platform::ToHCCLDataType(input.dtype()), + ToHCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); } std::shared_ptr ProcessGroupHCCL::Broadcast( diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h index 2f0ff6b9565ea..a32984798febd 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -21,12 +21,11 @@ #include #include +#include "paddle/fluid/distributed/collective/HCCLTools.h" #include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/device/npu/npu_stream.h" #include "paddle/fluid/platform/device_context.h" - -#include "paddle/fluid/distributed/collective/HCCLTools.h" -#include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc index 0911a4a3e3e18..0b388a6a848a9 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/distributed/collective/ProcessGroupHeter.h" + #include + #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/api/include/api.h" @@ -129,8 +131,9 @@ std::shared_ptr ProcessGroupHeter::AllReduce( gid_, {dense_cpu_tensor.name()}, send_size, dense_cpu_tensor.data(), dense_cpu_tensor.numel() * framework::DataTypeSize(dense_cpu_tensor.dtype())); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Send to the switch module error.")); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Send to the switch module error.")); phi::DenseTensor cpu_tensor2; cpu_tensor2.AllocateFrom( std::make_unique( @@ -140,8 +143,9 @@ std::shared_ptr ProcessGroupHeter::AllReduce( ret = client_->Recv( gid_, {dense_cpu_tensor.name()}, cpu_tensor2.data(), cpu_tensor2.numel() * framework::DataTypeSize(cpu_tensor2.dtype())); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Recv from the switch module error.")); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Recv from the switch module error.")); switch (dense_cpu_tensor.dtype()) { case DataType::FLOAT32: @@ -226,8 +230,9 @@ std::shared_ptr ProcessGroupHeter::Broadcast( dense_cpu_tensor.data(), dense_cpu_tensor.numel() * framework::DataTypeSize(dense_cpu_tensor.dtype())); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Send to the switch module error.")); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Send to the switch module error.")); } else { int ret = client_->Recv( gid_, {dense_cpu_tensor.name()}, dense_cpu_tensor.data(), @@ -286,8 +291,9 @@ std::shared_ptr ProcessGroupHeter::Send( VLOG(2) << "tensor_name:" << tensor_name; int ret = client_->Send(gid_, {tensor_name}, send_size, cpu_tensor.data(), tensor_size); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Send to the switch module error.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("Send to the switch module error.")); return CreateTask(rank_, CommType::SEND, in_tensors); } @@ -319,8 +325,9 @@ std::shared_ptr ProcessGroupHeter::Recv( int ret = client_->Recv( gid_, {tensor_name}, cpu_tensor.data(), cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype())); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "receive to the switch module error.")); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "receive to the switch module error.")); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration diff = end - start; double goodput = cpu_tensor.numel() * diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index f1b66864b2930..dc67205c78f56 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" + #include "paddle/fluid/distributed/collective/Common.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -320,15 +321,16 @@ std::shared_ptr ProcessGroupNCCL::AllReduce( PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - return Collective(in_tensors, out_tensors, - [&](const phi::DenseTensor& input, phi::DenseTensor& output, - ncclComm_t comm, const gpuStream_t& stream) { - return platform::dynload::ncclAllReduce( - input.data(), output.data(), input.numel(), - platform::ToNCCLDataType(input.type()), - ToNCCLRedType(opts.reduce_op), comm, stream); - }, - CommType::ALLREDUCE); + return Collective( + in_tensors, out_tensors, + [&](const phi::DenseTensor& input, phi::DenseTensor& output, + ncclComm_t comm, const gpuStream_t& stream) { + return platform::dynload::ncclAllReduce( + input.data(), output.data(), input.numel(), + platform::ToNCCLDataType(input.type()), + ToNCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); } std::shared_ptr ProcessGroupNCCL::Broadcast( @@ -338,17 +340,17 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - return Collective(in_tensors, out_tensors, - [&](phi::DenseTensor& input, phi::DenseTensor& output, - ncclComm_t comm, const gpuStream_t& stream) { - const auto root = opts.source_rank * in_tensors.size() + - opts.source_root; - return platform::dynload::ncclBroadcast( - input.data(), output.data(), input.numel(), - platform::ToNCCLDataType(input.type()), root, comm, - stream); - }, - CommType::BROADCAST); + return Collective( + in_tensors, out_tensors, + [&](phi::DenseTensor& input, phi::DenseTensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + const auto root = + opts.source_rank * in_tensors.size() + opts.source_root; + return platform::dynload::ncclBroadcast( + input.data(), output.data(), input.numel(), + platform::ToNCCLDataType(input.type()), root, comm, stream); + }, + CommType::BROADCAST); } std::shared_ptr ProcessGroupNCCL::Barrier( @@ -400,15 +402,15 @@ std::shared_ptr ProcessGroupNCCL::Send( std::vector& tensors, int dst_rank) { CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); - auto task = PointToPoint(tensors, - [&](phi::DenseTensor& input, ncclComm_t comm, - const gpuStream_t& stream, int dst_rank) { - return platform::dynload::ncclSend( - input.data(), input.numel(), - platform::ToNCCLDataType(input.dtype()), - dst_rank, comm, stream); - }, - dst_rank, CommType::SEND); + auto task = PointToPoint( + tensors, + [&](phi::DenseTensor& input, ncclComm_t comm, const gpuStream_t& stream, + int dst_rank) { + return platform::dynload::ncclSend( + input.data(), input.numel(), + platform::ToNCCLDataType(input.dtype()), dst_rank, comm, stream); + }, + dst_rank, CommType::SEND); return task; } @@ -416,15 +418,15 @@ std::shared_ptr ProcessGroupNCCL::Recv( std::vector& tensors, int src_rank) { CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); - auto task = PointToPoint(tensors, - [&](phi::DenseTensor& output, ncclComm_t comm, - const gpuStream_t& stream, int src_rank) { - return platform::dynload::ncclRecv( - output.data(), output.numel(), - platform::ToNCCLDataType(output.dtype()), - src_rank, comm, stream); - }, - src_rank, CommType::RECV); + auto task = PointToPoint( + tensors, + [&](phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream, + int src_rank) { + return platform::dynload::ncclRecv( + output.data(), output.numel(), + platform::ToNCCLDataType(output.dtype()), src_rank, comm, stream); + }, + src_rank, CommType::RECV); return task; } @@ -440,15 +442,15 @@ std::shared_ptr ProcessGroupNCCL::Send_Partial( std::vector shared_tensors; shared_tensors.push_back(shared_input); - auto task = PointToPoint(shared_tensors, - [&](phi::DenseTensor& input, ncclComm_t comm, - const gpuStream_t& stream, int dst_rank) { - return platform::dynload::ncclSend( - input.data(), input.numel(), - platform::ToNCCLDataType(input.dtype()), - dst_rank, comm, stream); - }, - dst_rank, CommType::SEND); + auto task = PointToPoint( + shared_tensors, + [&](phi::DenseTensor& input, ncclComm_t comm, const gpuStream_t& stream, + int dst_rank) { + return platform::dynload::ncclSend( + input.data(), input.numel(), + platform::ToNCCLDataType(input.dtype()), dst_rank, comm, stream); + }, + dst_rank, CommType::SEND); return task; } @@ -463,15 +465,15 @@ std::shared_ptr ProcessGroupNCCL::Recv_Partial( std::vector shared_tensors; shared_tensors.push_back(shared_input); - auto task = PointToPoint(shared_tensors, - [&](phi::DenseTensor& output, ncclComm_t comm, - const gpuStream_t& stream, int src_rank) { - return platform::dynload::ncclRecv( - output.data(), output.numel(), - platform::ToNCCLDataType(output.dtype()), - src_rank, comm, stream); - }, - src_rank, CommType::RECV); + auto task = PointToPoint( + shared_tensors, + [&](phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream, + int src_rank) { + return platform::dynload::ncclRecv( + output.data(), output.numel(), + platform::ToNCCLDataType(output.dtype()), src_rank, comm, stream); + }, + src_rank, CommType::RECV); return task; } @@ -484,15 +486,15 @@ std::shared_ptr ProcessGroupNCCL::AllGather( PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(out_tensors), true, platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); - return Collective(in_tensors, out_tensors, - [&](const phi::DenseTensor& input, phi::DenseTensor& output, - ncclComm_t comm, const gpuStream_t& stream) { - return platform::dynload::ncclAllGather( - input.data(), output.data(), input.numel(), - platform::ToNCCLDataType(input.dtype()), comm, - stream); - }, - CommType::ALLGATHER); + return Collective( + in_tensors, out_tensors, + [&](const phi::DenseTensor& input, phi::DenseTensor& output, + ncclComm_t comm, const gpuStream_t& stream) { + return platform::dynload::ncclAllGather( + input.data(), output.data(), input.numel(), + platform::ToNCCLDataType(input.dtype()), comm, stream); + }, + CommType::ALLGATHER); } void* GetPointerByOffset(void* raw_pointer, size_t offset, diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 82ced6e135ac9..2325e645b4c46 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -22,10 +22,9 @@ #include #include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device_context.h" - -#include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 96009ce722905..9c04b95a732e8 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -403,8 +403,9 @@ void EagerReducer::InitializeDenseGroups( "Tensor %s is not initialized.", tensor_name)); const auto size = tensor.numel(); PADDLE_ENFORCE_GT( - size, 0, platform::errors::PreconditionNotMet( - "The number of tensor %s's elements is 0.", tensor_name)); + size, 0, + platform::errors::PreconditionNotMet( + "The number of tensor %s's elements is 0.", tensor_name)); all_length += size; p_group->length_.push_back(size); diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h index 424bae0e5acd1..0527ceb9b5121 100644 --- a/paddle/fluid/distributed/collective/reducer.h +++ b/paddle/fluid/distributed/collective/reducer.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/utils/hook_utils.h" diff --git a/paddle/fluid/distributed/common/CMakeLists.txt b/paddle/fluid/distributed/common/CMakeLists.txt index eab6165ca689e..05f6a1d1ccec4 100644 --- a/paddle/fluid/distributed/common/CMakeLists.txt +++ b/paddle/fluid/distributed/common/CMakeLists.txt @@ -1,4 +1,6 @@ - -cc_library(afs_wrapper SRCS afs_warpper.cc DEPS fs ps_framework_proto) +cc_library( + afs_wrapper + SRCS afs_warpper.cc + DEPS fs ps_framework_proto) #set_property(GLOBAL PROPERTY COMMON_DEPS afs_warpper) diff --git a/paddle/fluid/distributed/common/afs_warpper.cc b/paddle/fluid/distributed/common/afs_warpper.cc index d539ec6080469..3a37c6be7c2af 100644 --- a/paddle/fluid/distributed/common/afs_warpper.cc +++ b/paddle/fluid/distributed/common/afs_warpper.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/common/afs_warpper.h" + #include "paddle/fluid/framework/io/fs.h" namespace paddle { @@ -27,9 +28,10 @@ int AfsClient::initialize(const FsClientParameter& fs_client_param) { int AfsClient::initialize(const std::string& hadoop_bin, const std::string& uri, const std::string& user, const std::string& passwd, int buffer_size_param) { - return initialize(hadoop_bin, uri, paddle::string::format_string( - "%s,%s", user.c_str(), passwd.c_str()), - buffer_size_param); + return initialize( + hadoop_bin, uri, + paddle::string::format_string("%s,%s", user.c_str(), passwd.c_str()), + buffer_size_param); } int AfsClient::initialize(const std::string& hadoop_bin, const std::string& uri, const std::string& ugi, int buffer_size_param) { diff --git a/paddle/fluid/distributed/common/afs_warpper.h b/paddle/fluid/distributed/common/afs_warpper.h index d10668046c0a7..cef3e5ae35c28 100644 --- a/paddle/fluid/distributed/common/afs_warpper.h +++ b/paddle/fluid/distributed/common/afs_warpper.h @@ -19,6 +19,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/common/cost_timer.h b/paddle/fluid/distributed/common/cost_timer.h index 5073dc9cf5084..1651121ee0cd9 100644 --- a/paddle/fluid/distributed/common/cost_timer.h +++ b/paddle/fluid/distributed/common/cost_timer.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "butil/time.h" #include "bvar/latency_recorder.h" #include "glog/logging.h" diff --git a/paddle/fluid/distributed/common/local_random.h b/paddle/fluid/distributed/common/local_random.h index 96b8d2d21a560..5a9a3b595d023 100644 --- a/paddle/fluid/distributed/common/local_random.h +++ b/paddle/fluid/distributed/common/local_random.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include #include diff --git a/paddle/fluid/distributed/common/registerer.h b/paddle/fluid/distributed/common/registerer.h index 630be930c14d9..f4938c0f93f8c 100644 --- a/paddle/fluid/distributed/common/registerer.h +++ b/paddle/fluid/distributed/common/registerer.h @@ -15,6 +15,7 @@ #pragma once #include + #include #include #include diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index a36e8e648b193..3cafb0bdb5f92 100755 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -7,34 +7,81 @@ proto_library(interceptor_message_proto SRCS interceptor_message.proto) if(WITH_ARM_BRPC) set(BRPC_DEPS arm_brpc snappy gflags glog) elseif(WITH_DISTRIBUTE AND WITH_PSCORE) - set(BRPC_DEPS brpc ssl crypto protobuf zlib leveldb snappy gflags glog) + set(BRPC_DEPS + brpc + ssl + crypto + protobuf + zlib + leveldb + snappy + gflags + glog) else() set(BRPC_DEPS "") endif() -cc_library(task_loop_thread_pool SRCS task_loop_thread_pool.cc task_loop_thread.cc task_loop.cc DEPS enforce glog) +cc_library( + task_loop_thread_pool + SRCS task_loop_thread_pool.cc task_loop_thread.cc task_loop.cc + DEPS enforce glog) -cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc dist_model.cc interceptor.cc - compute_interceptor.cc amplifier_interceptor.cc source_interceptor.cc sink_interceptor.cc message_service.cc message_bus.cc dist_model_tensor_wrapper.cc - DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto task_loop_thread_pool collective_helper - op_registry executor_gc_helper gflags glog ${BRPC_DEPS}) +cc_library( + fleet_executor + SRCS fleet_executor.cc + carrier.cc + task_node.cc + runtime_graph.cc + dist_model.cc + interceptor.cc + compute_interceptor.cc + amplifier_interceptor.cc + source_interceptor.cc + sink_interceptor.cc + message_service.cc + message_bus.cc + dist_model_tensor_wrapper.cc + DEPS proto_desc + fleet_executor_desc_proto + interceptor_message_proto + task_loop_thread_pool + collective_helper + op_registry + executor_gc_helper + gflags + glog + ${BRPC_DEPS}) if(WITH_DISTRIBUTE) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS + "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor" + ) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") endif() - set_source_files_properties(interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(compute_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(amplifier_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(source_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(sink_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(message_bus.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(message_bus.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(fleet_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(carrier.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(message_service.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(message_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + compute_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + amplifier_interceptor.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + source_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + sink_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + message_bus.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + message_bus.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + fleet_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(carrier.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + message_service.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + message_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) add_subdirectory(test) endif() diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 53bae87c0020e..754a3f5d2b22f 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/distributed/fleet_executor/carrier.h" + #include -#include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" @@ -148,8 +149,9 @@ void Carrier::WakeUp() { } void Carrier::Start() { - PADDLE_ENFORCE_EQ(is_init_, true, platform::errors::PreconditionNotMet( - "Using carrier before initialized.")); + PADDLE_ENFORCE_EQ(is_init_, true, + platform::errors::PreconditionNotMet( + "Using carrier before initialized.")); for (int64_t id : source_interceptor_ids_) { VLOG(3) << "Carrier Start is sending start to source interceptor " << id << "."; diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index d35a3260915e2..2846af97716da 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -35,7 +35,7 @@ namespace paddle { namespace framework { class Scope; class ProgramDesc; -} +} // namespace framework namespace distributed { diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index fb907e3b5c29f..4ba11fa7e327d 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h" -#include "paddle/fluid/distributed/fleet_executor/carrier.h" +#include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index d8f937e218be4..8fe73d774946c 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/distributed/fleet_executor/dist_model.h" + #include + #include // NOLINT -#include "paddle/fluid/distributed/fleet_executor/dist_model.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/block_desc.h" @@ -294,8 +296,9 @@ bool DistModel::PrepareProgram() { bool DistModel::LoadProgram() { VLOG(3) << "Loading program from " << config_.model_dir; - PADDLE_ENFORCE_NE(config_.model_dir, "", platform::errors::InvalidArgument( - "Model dir must be provided.")); + PADDLE_ENFORCE_NE( + config_.model_dir, "", + platform::errors::InvalidArgument("Model dir must be provided.")); std::string model_path = config_.model_dir + ".pdmodel"; framework::proto::ProgramDesc program_proto; std::string pb_content; diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.h b/paddle/fluid/distributed/fleet_executor/dist_model.h index d0203c131357c..f5c1d47afb1a3 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model.h @@ -31,7 +31,7 @@ namespace framework { class ProgramDesc; class Scope; class BlockDesc; -} +} // namespace framework namespace distributed { diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc index b440d39c73a70..b7f590e7a8c81 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h" + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h index dc8b2596803e0..459e609762d84 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/macros.h" diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index e946d78550ff1..c4d7f3c7a6958 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -11,9 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" + #include -#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/runtime_graph.h" diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index ccdb3dcc45948..176e5dab0da17 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -25,7 +25,7 @@ namespace paddle { namespace framework { class ProgramDesc; class Scope; -} +} // namespace framework namespace distributed { class RuntimeGraph; diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.cc b/paddle/fluid/distributed/fleet_executor/interceptor.cc index 710ebda41244e..2ff2bc04ff853 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/interceptor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/interceptor.h" + #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/task_loop.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.h b/paddle/fluid/distributed/fleet_executor/interceptor.h index 86ca7be7f44db..00fe2154d28fa 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/interceptor.h @@ -33,7 +33,7 @@ namespace paddle { namespace framework { class Scope; class GarbageCollector; -} +} // namespace framework namespace distributed { class TaskNode; diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index 80a6b4667aa1a..76762af9e7e7a 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/distributed/fleet_executor/message_bus.h" + #include #include #include @@ -19,7 +21,6 @@ #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" -#include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" namespace paddle { @@ -28,8 +29,9 @@ namespace distributed { void MessageBus::Init( int64_t rank, const std::unordered_map& rank_to_addr, const std::string& addr) { - PADDLE_ENFORCE_EQ(is_init_, false, platform::errors::AlreadyExists( - "MessageBus is already init.")); + PADDLE_ENFORCE_EQ( + is_init_, false, + platform::errors::AlreadyExists("MessageBus is already init.")); rank_ = rank; is_init_ = true; rank_to_addr_ = rank_to_addr; diff --git a/paddle/fluid/distributed/fleet_executor/message_service.cc b/paddle/fluid/distributed/fleet_executor/message_service.cc index 1c66d83ea34d7..9d42b0d73dbb4 100644 --- a/paddle/fluid/distributed/fleet_executor/message_service.cc +++ b/paddle/fluid/distributed/fleet_executor/message_service.cc @@ -13,6 +13,7 @@ // limitations under the License. #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/message_service.h" + #include "brpc/server.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc index 614b4c37e8254..a5f90062dcfd9 100644 --- a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc +++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/runtime_graph.h" + #include "paddle/fluid/distributed/fleet_executor/task_node.h" namespace paddle { diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.h b/paddle/fluid/distributed/fleet_executor/runtime_graph.h index 1ca9f0174ed07..a59a43cc200a5 100644 --- a/paddle/fluid/distributed/fleet_executor/runtime_graph.h +++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/platform/macros.h" diff --git a/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc b/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc index 77fbb23a6c71b..9d9e6c0356548 100644 --- a/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/sink_interceptor.h" + #include "paddle/fluid/distributed/fleet_executor/task_node.h" namespace paddle { diff --git a/paddle/fluid/distributed/fleet_executor/source_interceptor.cc b/paddle/fluid/distributed/fleet_executor/source_interceptor.cc index 78b2bed66dd99..6b2fd5565ea13 100644 --- a/paddle/fluid/distributed/fleet_executor/source_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/source_interceptor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/source_interceptor.h" + #include "paddle/fluid/distributed/fleet_executor/task_node.h" namespace paddle { diff --git a/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc b/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc index bb313ad37890d..90765dbdd2d09 100644 --- a/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc +++ b/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc @@ -31,8 +31,9 @@ TaskLoopThread::~TaskLoopThread() { } TaskLoop* TaskLoopThread::StartLoop() { - PADDLE_ENFORCE_EQ(start_, false, platform::errors::PreconditionNotMet( - "thread is already running.")); + PADDLE_ENFORCE_EQ( + start_, false, + platform::errors::PreconditionNotMet("thread is already running.")); start_ = true; thread_ = std::thread([this]() { Loop(); }); diff --git a/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc b/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc index ed34bbb87fc6b..e962a29b4a150 100644 --- a/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc +++ b/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc @@ -30,8 +30,9 @@ TaskLoopThreadPool::TaskLoopThreadPool(int thread_num) TaskLoopThreadPool::~TaskLoopThreadPool() = default; void TaskLoopThreadPool::Start() { - PADDLE_ENFORCE_EQ(start_, false, platform::errors::PreconditionNotMet( - "thread pool is already start.")); + PADDLE_ENFORCE_EQ( + start_, false, + platform::errors::PreconditionNotMet("thread pool is already start.")); PADDLE_ENFORCE_GT( thread_num_, 0, platform::errors::InvalidArgument( @@ -45,10 +46,12 @@ void TaskLoopThreadPool::Start() { } TaskLoop* TaskLoopThreadPool::GetLoop(int tid) { - PADDLE_ENFORCE_EQ(start_, true, platform::errors::PreconditionNotMet( - "thread pool must start first.")); - PADDLE_ENFORCE_GE(tid, 0, platform::errors::OutOfRange( - "tid must >= 0, but now is %d", tid)); + PADDLE_ENFORCE_EQ( + start_, true, + platform::errors::PreconditionNotMet("thread pool must start first.")); + PADDLE_ENFORCE_GE( + tid, 0, + platform::errors::OutOfRange("tid must >= 0, but now is %d", tid)); PADDLE_ENFORCE_LT(tid, thread_num_, platform::errors::OutOfRange( "tid must < thread_num, but now tid=%d thread_num=%d", @@ -57,8 +60,9 @@ TaskLoop* TaskLoopThreadPool::GetLoop(int tid) { } std::vector TaskLoopThreadPool::GetAllLoops() { - PADDLE_ENFORCE_EQ(start_, true, platform::errors::PreconditionNotMet( - "thread pool must start first.")); + PADDLE_ENFORCE_EQ( + start_, true, + platform::errors::PreconditionNotMet("thread pool must start first.")); return loops_; } diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index 232317333ea11..00ae30d281ee8 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/task_node.h" + #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -153,15 +154,17 @@ void TaskNode::SetRunAtOffset(int64_t value) { void TaskNode::SetReplyUpPerSteps(int64_t value) { PADDLE_ENFORCE_GE( - value, 1, platform::errors::InvalidArgument( - "reply_up_per_steps must >= 1, but received %ld", value)); + value, 1, + platform::errors::InvalidArgument( + "reply_up_per_steps must >= 1, but received %ld", value)); reply_up_per_steps_ = value; } void TaskNode::SetSendDownPerSteps(int64_t value) { PADDLE_ENFORCE_GE( - value, 1, platform::errors::InvalidArgument( - "send_down_per_steps must >= 1, but received %ld", value)); + value, 1, + platform::errors::InvalidArgument( + "send_down_per_steps must >= 1, but received %ld", value)); send_down_per_steps_ = value; } diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index 7dd4b5454567e..16e686a4401b8 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -26,7 +26,7 @@ namespace paddle { namespace framework { class OperatorBase; class OpDesc; -} +} // namespace framework namespace distributed { class TaskNode final { diff --git a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt index e0db8a261b585..0cd39b3aad6e6 100644 --- a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt @@ -1,25 +1,72 @@ -set_source_files_properties(interceptor_ping_pong_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(interceptor_ping_pong_test SRCS interceptor_ping_pong_test.cc DEPS fleet_executor ${BRPC_DEPS}) +set_source_files_properties( + interceptor_ping_pong_test.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + interceptor_ping_pong_test + SRCS interceptor_ping_pong_test.cc + DEPS fleet_executor ${BRPC_DEPS}) -set_source_files_properties(compute_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(compute_interceptor_test SRCS compute_interceptor_test.cc DEPS fleet_executor ${BRPC_DEPS}) +set_source_files_properties( + compute_interceptor_test.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + compute_interceptor_test + SRCS compute_interceptor_test.cc + DEPS fleet_executor ${BRPC_DEPS}) -set_source_files_properties(source_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(source_interceptor_test SRCS source_interceptor_test.cc DEPS fleet_executor ${BRPC_DEPS}) +set_source_files_properties( + source_interceptor_test.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + source_interceptor_test + SRCS source_interceptor_test.cc + DEPS fleet_executor ${BRPC_DEPS}) -set_source_files_properties(sink_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(sink_interceptor_test SRCS sink_interceptor_test.cc DEPS fleet_executor ${BRPC_DEPS}) +set_source_files_properties( + sink_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + sink_interceptor_test + SRCS sink_interceptor_test.cc + DEPS fleet_executor ${BRPC_DEPS}) -set_source_files_properties(interceptor_pipeline_short_path_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(interceptor_pipeline_short_path_test SRCS interceptor_pipeline_short_path_test.cc DEPS fleet_executor ${BRPC_DEPS}) +set_source_files_properties( + interceptor_pipeline_short_path_test.cc + PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + interceptor_pipeline_short_path_test + SRCS interceptor_pipeline_short_path_test.cc + DEPS fleet_executor ${BRPC_DEPS}) -set_source_files_properties(interceptor_pipeline_long_path_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(interceptor_pipeline_long_path_test SRCS interceptor_pipeline_long_path_test.cc DEPS fleet_executor ${BRPC_DEPS}) +set_source_files_properties( + interceptor_pipeline_long_path_test.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + interceptor_pipeline_long_path_test + SRCS interceptor_pipeline_long_path_test.cc + DEPS fleet_executor ${BRPC_DEPS}) -set_source_files_properties(compute_interceptor_run_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(compute_interceptor_run_op_test SRCS compute_interceptor_run_op_test.cc DEPS fleet_executor ${BRPC_DEPS} op_registry fill_constant_op elementwise_add_op scope device_context) +set_source_files_properties( + compute_interceptor_run_op_test.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + compute_interceptor_run_op_test + SRCS compute_interceptor_run_op_test.cc + DEPS fleet_executor + ${BRPC_DEPS} + op_registry + fill_constant_op + elementwise_add_op + scope + device_context) -if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) -set_source_files_properties(interceptor_ping_pong_with_brpc_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(interceptor_ping_pong_with_brpc_test SRCS interceptor_ping_pong_with_brpc_test.cc DEPS fleet_executor ${BRPC_DEPS}) +if(WITH_DISTRIBUTE + AND WITH_PSCORE + AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) + set_source_files_properties( + interceptor_ping_pong_with_brpc_test.cc + PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test( + interceptor_ping_pong_with_brpc_test + SRCS interceptor_ping_pong_with_brpc_test.cc + DEPS fleet_executor ${BRPC_DEPS}) endif() diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index 35857fc86b5e0..bd81d3644f4d8 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "gtest/gtest.h" - #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc index 954b52693f46c..4992a8b34c9da 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "gtest/gtest.h" - #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc index 19c1d0a0d7a6a..54adf06fb67dd 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "gtest/gtest.h" - #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc index 78cff2606f6b8..3828c4478cbe6 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc @@ -14,11 +14,11 @@ limitations under the License. */ #include #include + #include #include #include "gtest/gtest.h" - #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc index e909744a4b5d6..a78cd6955f246 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "gtest/gtest.h" - #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc index 0e57596bacbe6..53755bf1a40eb 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "gtest/gtest.h" - #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" diff --git a/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc index 8ff908f90ec85..879d7e9b02941 100644 --- a/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc @@ -16,7 +16,6 @@ #include #include "gtest/gtest.h" - #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" diff --git a/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc index e9c0437c829d4..21a1b4accc9f1 100644 --- a/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc @@ -16,7 +16,6 @@ #include #include "gtest/gtest.h" - #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" diff --git a/paddle/fluid/distributed/index_dataset/CMakeLists.txt b/paddle/fluid/distributed/index_dataset/CMakeLists.txt index 98bc0a0ad4a26..524245be5f2ad 100644 --- a/paddle/fluid/distributed/index_dataset/CMakeLists.txt +++ b/paddle/fluid/distributed/index_dataset/CMakeLists.txt @@ -1,9 +1,18 @@ proto_library(index_dataset_proto SRCS index_dataset.proto) -cc_library(index_wrapper SRCS index_wrapper.cc DEPS index_dataset_proto fs) +cc_library( + index_wrapper + SRCS index_wrapper.cc + DEPS index_dataset_proto fs) if(WITH_MKLDNN) - cc_library(index_sampler SRCS index_sampler.cc DEPS xxhash index_wrapper eigen3 mkldnn) + cc_library( + index_sampler + SRCS index_sampler.cc + DEPS xxhash index_wrapper eigen3 mkldnn) else() - cc_library(index_sampler SRCS index_sampler.cc DEPS xxhash index_wrapper eigen3) + cc_library( + index_sampler + SRCS index_sampler.cc + DEPS xxhash index_wrapper eigen3) endif() if(WITH_PYTHON) py_proto_compile(index_dataset_py_proto SRCS index_dataset.proto) diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.cc b/paddle/fluid/distributed/index_dataset/index_sampler.cc index 306d11d333dae..b82193220515a 100644 --- a/paddle/fluid/distributed/index_dataset/index_sampler.cc +++ b/paddle/fluid/distributed/index_dataset/index_sampler.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/index_dataset/index_sampler.h" + #include "paddle/fluid/framework/data_feed.h" namespace paddle { diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.h b/paddle/fluid/distributed/index_dataset/index_sampler.h index 02806b814c200..a82348c9ec586 100644 --- a/paddle/fluid/distributed/index_dataset/index_sampler.h +++ b/paddle/fluid/distributed/index_dataset/index_sampler.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/distributed/index_dataset/index_wrapper.h" #include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.cc b/paddle/fluid/distributed/index_dataset/index_wrapper.cc index 27aa890f7600f..61941ef513334 100644 --- a/paddle/fluid/distributed/index_dataset/index_wrapper.cc +++ b/paddle/fluid/distributed/index_dataset/index_wrapper.cc @@ -9,15 +9,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/distributed/index_dataset/index_wrapper.h" + #include #include #include #include #include #include -#include "paddle/fluid/framework/io/fs.h" -#include "paddle/fluid/distributed/index_dataset/index_wrapper.h" +#include "paddle/fluid/framework/io/fs.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.h b/paddle/fluid/distributed/index_dataset/index_wrapper.h index 8fb8faf6c84a2..1c652e60bbbc3 100644 --- a/paddle/fluid/distributed/index_dataset/index_wrapper.h +++ b/paddle/fluid/distributed/index_dataset/index_wrapper.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h" #include "paddle/fluid/platform/enforce.h" @@ -90,10 +91,11 @@ class IndexWrapper { } TreePtr tree = std::make_shared(); int ret = tree->Load(tree_path); - PADDLE_ENFORCE_EQ(ret, 0, paddle::platform::errors::InvalidArgument( - "Load tree[%s] from path[%s] failed. Please " - "check whether the file exists.", - name, tree_path)); + PADDLE_ENFORCE_EQ(ret, 0, + paddle::platform::errors::InvalidArgument( + "Load tree[%s] from path[%s] failed. Please " + "check whether the file exists.", + name, tree_path)); tree_map.insert(std::pair{name, tree}); } diff --git a/paddle/fluid/distributed/ps/service/CMakeLists.txt b/paddle/fluid/distributed/ps/service/CMakeLists.txt index e7519ef4998b1..ad49b651e2e71 100755 --- a/paddle/fluid/distributed/ps/service/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/service/CMakeLists.txt @@ -1,57 +1,136 @@ set(BRPC_SRCS ps_client.cc server.cc) set_source_files_properties(${BRPC_SRCS}) - if(WITH_HETERPS) - set(BRPC_DEPS brpc ssl crypto protobuf gflags glog zlib leveldb snappy gflags glog device_context rocksdb) + set(BRPC_DEPS + brpc + ssl + crypto + protobuf + gflags + glog + zlib + leveldb + snappy + gflags + glog + device_context + rocksdb) else() - set(BRPC_DEPS brpc ssl crypto protobuf gflags glog zlib leveldb snappy gflags glog device_context) + set(BRPC_DEPS + brpc + ssl + crypto + protobuf + gflags + glog + zlib + leveldb + snappy + gflags + glog + device_context) endif() -brpc_library(sendrecv_rpc SRCS - ${BRPC_SRCS} - PROTO sendrecv.proto - DEPS ${BRPC_DEPS} ) +brpc_library( + sendrecv_rpc + SRCS + ${BRPC_SRCS} + PROTO + sendrecv.proto + DEPS + ${BRPC_DEPS}) #set_property(GLOBAL PROPERTY RPC_DEPS sendrecv_rpc ${BRPC_DEPS} string_helper) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) -set_source_files_properties(communicator/communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(ps_service/service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(brpc_ps_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(brpc_ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(ps_local_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - -set_source_files_properties(brpc_utils.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(heter_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(heter_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - -set_source_files_properties(client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(graph_brpc_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(graph_brpc_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(brpc_utils SRCS brpc_utils.cc DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS}) - -cc_library(downpour_server SRCS graph_brpc_server.cc brpc_ps_server.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) -cc_library(downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc -ps_local_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) - -cc_library(client SRCS ps_client.cc DEPS downpour_client boost ${RPC_DEPS}) -cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS}) - -cc_library(communicator SRCS communicator/communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS}) -cc_library(ps_service SRCS ps_service/service.cc DEPS communicator client server boost ${RPC_DEPS}) - -cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) -cc_library(heter_server SRCS heter_server.cc DEPS heter_client brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) - -set_source_files_properties(ps_service/graph_py_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(graph_py_service SRCS ps_service/graph_py_service.cc DEPS ps_service) +set_source_files_properties( + communicator/communicator.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + ps_service/service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + brpc_ps_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + brpc_ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + ps_local_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + +set_source_files_properties( + brpc_utils.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + heter_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + heter_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + +set_source_files_properties(client.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ps_client.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(server.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + graph_brpc_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + graph_brpc_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library( + brpc_utils + SRCS brpc_utils.cc + DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS}) + +cc_library( + downpour_server + SRCS graph_brpc_server.cc brpc_ps_server.cc + DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) +cc_library( + downpour_client + SRCS graph_brpc_client.cc brpc_ps_client.cc ps_local_client.cc + DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) + +cc_library( + client + SRCS ps_client.cc + DEPS downpour_client boost ${RPC_DEPS}) +cc_library( + server + SRCS server.cc + DEPS downpour_server boost ${RPC_DEPS}) + +cc_library( + communicator + SRCS communicator/communicator.cc + DEPS scope + client + boost + table + math_function + selected_rows_functor + ${RPC_DEPS}) +cc_library( + ps_service + SRCS ps_service/service.cc + DEPS communicator client server boost ${RPC_DEPS}) + +cc_library( + heter_client + SRCS heter_client.cc + DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) +cc_library( + heter_server + SRCS heter_server.cc + DEPS heter_client brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) + +set_source_files_properties( + ps_service/graph_py_service.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library( + graph_py_service + SRCS ps_service/graph_py_service.cc + DEPS ps_service) #add_subdirectory(communicator) diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc old mode 100755 new mode 100644 index 0959b651bb558..89466076b23d0 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" + #include #include #include -#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/framework/archive.h" static const int max_port = 65535; @@ -245,8 +246,9 @@ int32_t BrpcPsClient::Initialize() { int DownpourBrpcClosure::check_response(size_t request_idx, int cmd_id) { if (_cntls[request_idx]->Failed()) { - LOG(ERROR) << "resquest cmd_id:" << cmd_id << " failed, " - "err:" + LOG(ERROR) << "resquest cmd_id:" << cmd_id + << " failed, " + "err:" << _cntls[request_idx]->ErrorText(); return -1; } @@ -263,8 +265,9 @@ int DownpourBrpcClosure::check_response(size_t request_idx, int cmd_id) { int DownpourBrpcClosure::check_save_response(size_t request_idx, int cmd_id) { int32_t feasign_size = 0; if (_cntls[request_idx]->Failed()) { - LOG(ERROR) << "resquest cmd_id:" << cmd_id << " failed, " - "err:" + LOG(ERROR) << "resquest cmd_id:" << cmd_id + << " failed, " + "err:" << _cntls[request_idx]->ErrorText(); return -1; } diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h index e2c16d496c42c..17b6bbe22cefe 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.h @@ -15,6 +15,7 @@ #pragma once #include + #include #include #include diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc index 8167c37b59987..d859acbb42e44 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" + #include // NOLINT + #include "butil/object_pool.h" #include "paddle/fluid/distributed/common/cost_timer.h" #include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h" diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.h b/paddle/fluid/distributed/ps/service/brpc_utils.h index e68e15058f7b0..d4332744cebca 100644 --- a/paddle/fluid/distributed/ps/service/brpc_utils.h +++ b/paddle/fluid/distributed/ps/service/brpc_utils.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt b/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt index 3610729d74d93..612358c71a6fb 100644 --- a/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt @@ -1,8 +1,15 @@ - - get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) -set_source_files_properties(communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - - -cc_library(communicator SRCS communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS}) +set_source_files_properties( + communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + +cc_library( + communicator + SRCS communicator.cc + DEPS scope + client + boost + table + math_function + selected_rows_functor + ${RPC_DEPS}) diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc index c4b833f294e17..c50f1d909cd95 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.cc +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/distributed/ps/service/communicator/communicator.h" + #include + #include "gflags/gflags.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/distributed/ps/wrapper/fleet.h" diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.h b/paddle/fluid/distributed/ps/service/communicator/communicator.h index 75676c392435c..5f2a0cbb90976 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.h +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include #include #include @@ -30,6 +31,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" @@ -42,8 +44,6 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/distributed/ps/service/ps_client.h" - namespace paddle { namespace distributed { class PSClient; @@ -157,8 +157,9 @@ template inline void MergeVars(const std::string &var_name, const std::vector> &vars, Scope *scope, bool merge_add = true) { - PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument( - "vector vars are empty.")); + PADDLE_ENFORCE_NE( + vars.empty(), true, + platform::errors::InvalidArgument("vector vars are empty.")); auto cpu_place = platform::CPUPlace(); auto &var0 = vars[0]; auto *out_var = scope->Var(var_name); diff --git a/paddle/fluid/distributed/ps/service/env.h b/paddle/fluid/distributed/ps/service/env.h index 162ee6f098422..0fddb17da7c41 100644 --- a/paddle/fluid/distributed/ps/service/env.h +++ b/paddle/fluid/distributed/ps/service/env.h @@ -18,11 +18,13 @@ #include #include #include + #include #include #include #include #include + #include "gflags/gflags.h" namespace paddle { diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc index c1df490669dbe..ff9680044dd6b 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc @@ -13,12 +13,14 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" + #include #include #include #include #include #include + #include "Eigen/Dense" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/distributed/ps/table/table.h" @@ -149,7 +151,7 @@ std::future GraphBrpcClient::get_node_feat( std::future GraphBrpcClient::clear_nodes(uint32_t table_id, int type_id, int idx_) { DownpourBrpcClosure *closure = new DownpourBrpcClosure( - server_size, [&, server_size = this->server_size ](void *done) { + server_size, [&, server_size = this->server_size](void *done) { int ret = 0; auto *closure = (DownpourBrpcClosure *)done; size_t fail_num = 0; @@ -665,5 +667,5 @@ int32_t GraphBrpcClient::Initialize() { local_channel = NULL; return 0; } -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.h b/paddle/fluid/distributed/ps/service/graph_brpc_client.h index 51f14bc57cde0..c038c840df97f 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.h @@ -15,11 +15,12 @@ #pragma once #include + #include #include +#include #include -#include #include "ThreadPool.h" #include "brpc/channel.h" #include "brpc/controller.h" diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc index 8ff12265269b2..5ce26b4525041 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -13,13 +13,14 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" -#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include // NOLINT #include + #include "butil/endpoint.h" #include "iomanip" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.h b/paddle/fluid/distributed/ps/service/graph_brpc_server.h index caf728701b289..726876bef1621 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.h @@ -14,12 +14,12 @@ #pragma once +#include +#include + #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" - -#include -#include #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include "paddle/fluid/distributed/ps/service/server.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" diff --git a/paddle/fluid/distributed/ps/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc index fd0962caaaead..44c03ca1757e5 100755 --- a/paddle/fluid/distributed/ps/service/heter_client.cc +++ b/paddle/fluid/distributed/ps/service/heter_client.cc @@ -139,8 +139,9 @@ void HeterClient::SendAndRecvAsync( message_name, send_var_name_val, recv_var_name_val, *p_ctx, p_scope, &request, &request_io_buffer); - int micro_id = GetMicroId(ctx, p_scope); + int micro_id = GetMicroId(ctx, p_scope); // global auto minibatch_id = micro_id / 10; + VLOG(4) << "micro_id: " << micro_id; // select channel according to micro id if (mode == "forward") { int num = minibatch_id % xpu_channels_.size(); diff --git a/paddle/fluid/distributed/ps/service/heter_client.h b/paddle/fluid/distributed/ps/service/heter_client.h old mode 100644 new mode 100755 index efaa48470a8bd..7683b8a16793e --- a/paddle/fluid/distributed/ps/service/heter_client.h +++ b/paddle/fluid/distributed/ps/service/heter_client.h @@ -155,13 +155,13 @@ class HeterClient { // HeterClient singleton static std::shared_ptr GetInstance( - const std::vector& endpoint, - const std::vector& previous_endpoint, + const std::vector& endpoints, + const std::vector& previous_endpoints, const int& trainer_id) { if (NULL == s_instance_) { s_instance_.reset(new HeterClient()); - s_instance_->SetXpuList(endpoint); - s_instance_->SetPreviousXpuList(previous_endpoint); + s_instance_->SetXpuList(endpoints); + s_instance_->SetPreviousXpuList(previous_endpoints); s_instance_->SetTrainerID(trainer_id); s_instance_->CreateClient2XpuConnection(); } diff --git a/paddle/fluid/distributed/ps/service/heter_server.cc b/paddle/fluid/distributed/ps/service/heter_server.cc index fd38a030ff366..4440647ac94c4 100755 --- a/paddle/fluid/distributed/ps/service/heter_server.cc +++ b/paddle/fluid/distributed/ps/service/heter_server.cc @@ -94,7 +94,6 @@ void HeterServer::StartHeterInterService(bool neeed_encrypt) { VLOG(4) << "switch inter server server start success! listen on " << endpoint_inter_; } - { std::lock_guard lock(this->mutex_ready_); stoped_ = false; @@ -115,9 +114,6 @@ void HeterServer::SetFanin(const int& fan_in) { service_.SetFanin(fan_in); } void HeterServer::WaitServerReady() { std::unique_lock lock(this->mutex_ready_); condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - while (!this->ready_) { - sleep(1); - } } int SendAndRecvVariableHandler::SaveInSwitchWithShard( diff --git a/paddle/fluid/distributed/ps/service/heter_server.h b/paddle/fluid/distributed/ps/service/heter_server.h index ddcf36bf68d7b..97028066e6641 100755 --- a/paddle/fluid/distributed/ps/service/heter_server.h +++ b/paddle/fluid/distributed/ps/service/heter_server.h @@ -90,8 +90,10 @@ class ServiceHandlerBase { using SharedMiniScope = std::shared_ptr>; + using SharedMicroScope = std::shared_ptr>>>; + using SharedTaskQueue = std::shared_ptr< std::unordered_map>>>>; @@ -226,6 +228,7 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase { auto* tensor = var->GetMutable(); auto data = reinterpret_cast(tensor->data()); auto micro_id = static_cast(data[0]); + VLOG(4) << "micro_id in heter server: " << micro_id; int minibatch_index = micro_id / 10; int microbatch_index = micro_id % 10; @@ -261,6 +264,9 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase { distributed::DeserializeFromMultiVarMsgAndIOBuf( *request, &request_io_buffer, *dev_ctx_, micro_scope); // blocking queue handles multi thread + VLOG(4) << "Handle in HeterServer: " << message_name << ", " + << microbatch_index; + VLOG(4) << "task_queue_ size: " << task_queue_->size(); (*task_queue_)[minibatch_index]->Push( std::make_pair(message_name, microbatch_index)); @@ -274,6 +280,7 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase { distributed::SerializeToMultiVarMsgAndIOBuf( message_name, response_var_names, empty_var_names, *dev_ctx_, &local_scope, response, &response_io_buffer); + VLOG(4) << "Handle over"; return 0; } @@ -612,11 +619,9 @@ class HeterServer { // HeterWrapper singleton static std::shared_ptr GetInstance() { + std::unique_lock lock(mtx_); if (s_instance_ == nullptr) { - std::unique_lock lock(mtx_); - if (NULL == s_instance_) { - s_instance_.reset(new HeterServer()); - } + s_instance_.reset(new HeterServer()); } return s_instance_; } diff --git a/paddle/fluid/distributed/ps/service/ps_client.cc b/paddle/fluid/distributed/ps/service/ps_client.cc index f7df99ec13cdf..a0216f2a7953a 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_client.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/service/ps_client.h" + #include "glog/logging.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 926bb7e7c9fd3..adf096c8469c5 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/distributed/common/cost_timer.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/env.h" diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc index bc024ed3175bc..b6407ccebe52b 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/service/ps_local_client.h" + #include "paddle/fluid/distributed/ps/table/table.h" //#define pslib_debug_dense_compress @@ -316,5 +317,5 @@ ::std::future PsLocalClient::PushSparse(size_t table_id, table_ptr->Push(table_context); return done(); } -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h index 439ecf79f2f80..89c2f7446ac3b 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.h +++ b/paddle/fluid/distributed/ps/service/ps_local_client.h @@ -223,5 +223,5 @@ class PsLocalClient : public PSClient { float _mse = 0; uint16_t _push_times = 0; }; -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/service/ps_local_server.h b/paddle/fluid/distributed/ps/service/ps_local_server.h index c09f8585b659d..2075e9dd2be28 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_server.h +++ b/paddle/fluid/distributed/ps/service/ps_local_server.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/distributed/ps/service/server.h" namespace paddle { @@ -37,5 +38,5 @@ class PsLocalServer : public PSServer { private: virtual int32_t Initialize() { return 0; } }; -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc index ced51b8cbe383..255c0d3d655aa 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" + #include // NOLINT + #include "butil/endpoint.h" #include "iomanip" #include "paddle/fluid/distributed/ps/table/table.h" @@ -501,5 +503,5 @@ void GraphPyClient::StopServer() { if (status.get() == 0) stoped_ = true; } void GraphPyClient::FinalizeWorker() { this->worker_ptr->FinalizeWorker(); } -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h index 55beb9b3932a6..7dd0340125693 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h @@ -14,6 +14,7 @@ #pragma once #include + #include // NOLINT #include #include @@ -23,21 +24,20 @@ #include // NOLINT #include #include -#include "google/protobuf/text_format.h" +#include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" - #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/env.h" #include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" #include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" #include "paddle/fluid/distributed/ps/service/ps_service/service.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -198,5 +198,5 @@ class GraphPyClient : public GraphPyService { std::thread* client_thread; bool stoped_ = false; }; -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/service/ps_service/service.cc b/paddle/fluid/distributed/ps/service/ps_service/service.cc index 9c3a06c2212e6..9eb5d49a4051c 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/service.cc @@ -17,7 +17,9 @@ #include #include #include + #include + #include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h index c044e82884604..55bbbc06d878a 100644 --- a/paddle/fluid/distributed/ps/service/server.h +++ b/paddle/fluid/distributed/ps/service/server.h @@ -20,6 +20,7 @@ #include #include #include + #include "butil/endpoint.h" #include "google/protobuf/service.h" #include "paddle/fluid/distributed/common/registerer.h" diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt index b8eff940a0dca..fdda59420f03c 100644 --- a/paddle/fluid/distributed/ps/table/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt @@ -1,49 +1,125 @@ set_property(GLOBAL PROPERTY TABLE_DEPS string_helper) set(graphDir graph) get_property(TABLE_DEPS GLOBAL PROPERTY TABLE_DEPS) -set_source_files_properties(${graphDir}/graph_edge.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + ${graphDir}/graph_edge.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(graph_edge SRCS ${graphDir}/graph_edge.cc) -set_source_files_properties(${graphDir}/graph_weighted_sampler.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(WeightedSampler SRCS ${graphDir}/graph_weighted_sampler.cc DEPS graph_edge) -set_source_files_properties(${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(graph_node SRCS ${graphDir}/graph_node.cc DEPS WeightedSampler) -set_source_files_properties(memory_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + ${graphDir}/graph_weighted_sampler.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library( + WeightedSampler + SRCS ${graphDir}/graph_weighted_sampler.cc + DEPS graph_edge) +set_source_files_properties( + ${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library( + graph_node + SRCS ${graphDir}/graph_node.cc + DEPS WeightedSampler) +set_source_files_properties( + memory_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/") -include_directories(${PADDLE_LIB_THIRD_PARTY_PATH}libmct/src/extern_libmct/libmct/include) +include_directories( + ${PADDLE_LIB_THIRD_PARTY_PATH}libmct/src/extern_libmct/libmct/include) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") set(TABLE_SRC memory_dense_table.cc barrier_table.cc common_graph_table.cc) #set(EXTERN_DEP rocksdb) -cc_library(common_table SRCS ${TABLE_SRC} DEPS ${TABLE_DEPS} -${RPC_DEPS} graph_edge graph_node device_context string_helper -simple_threadpool xxhash generator) +cc_library( + common_table + SRCS ${TABLE_SRC} + DEPS ${TABLE_DEPS} + ${RPC_DEPS} + graph_edge + graph_node + device_context + string_helper + simple_threadpool + xxhash + generator) -set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(tensor_accessor SRCS tensor_accessor.cc DEPS ${TABLE_DEPS} eigen3 ps_framework_proto device_context) -cc_library(tensor_table SRCS DEPS eigen3 ps_framework_proto executor scope device_context tensor ${TABLE_DEPS}) -set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library( + tensor_accessor + SRCS tensor_accessor.cc + DEPS ${TABLE_DEPS} eigen3 ps_framework_proto device_context) +cc_library( + tensor_table + SRCS + DEPS eigen3 + ps_framework_proto + executor + scope + device_context + tensor + ${TABLE_DEPS}) +set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(ctr_double_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(sparse_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(ctr_dymf_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(memory_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(ssd_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + ctr_double_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + sparse_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + ctr_dymf_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + memory_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + ssd_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties( + memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto) -cc_library(ctr_accessor SRCS ctr_accessor.cc ctr_double_accessor.cc sparse_accessor.cc ctr_dymf_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule) -cc_library(sparse_table SRCS memory_sparse_table.cc ssd_sparse_table.cc memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table rocksdb) +cc_library( + sparse_sgd_rule + SRCS sparse_sgd_rule.cc + DEPS ${TABLE_DEPS} ps_framework_proto) +cc_library( + ctr_accessor + SRCS ctr_accessor.cc ctr_double_accessor.cc sparse_accessor.cc + ctr_dymf_accessor.cc + DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule) +cc_library( + sparse_table + SRCS memory_sparse_table.cc ssd_sparse_table.cc memory_sparse_geo_table.cc + DEPS ps_framework_proto + ${TABLE_DEPS} + fs + afs_wrapper + ctr_accessor + common_table + rocksdb) -cc_library(table SRCS table.cc DEPS sparse_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost) +cc_library( + table + SRCS table.cc + DEPS sparse_table + common_table + tensor_accessor + tensor_table + ps_framework_proto + string_helper + device_context + gflags + glog + boost) target_link_libraries(table -fopenmp) diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h index 7713c2bda295f..4db8ad0a55a5e 100644 --- a/paddle/fluid/distributed/ps/table/accessor.h +++ b/paddle/fluid/distributed/ps/table/accessor.h @@ -15,8 +15,10 @@ #pragma once #include #include + #include #include + #include "paddle/fluid/distributed/common/afs_warpper.h" #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 43dee275a3dc6..55a9c794e8ead 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -13,11 +13,14 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/common_graph_table.h" + #include + #include #include #include #include + #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/framework/generator.h" @@ -212,7 +215,6 @@ int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, for (size_t i = 0; i < bags.size(); i++) { if (bags[i].size() > 0) { tasks.push_back(_shards_task_pool[i]->enqueue([&, i, idx, this]() -> int { - char ch[sizeof(int) * 2 + sizeof(int64_t)]; memset(ch, 0, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int)); @@ -353,7 +355,6 @@ void GraphTable::export_partition_files(int idx, std::string file_path) { for (int i = 0; i < part_len; i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [&, i, idx, this]() -> int { - std::string output_path = file_path + "partition_" + std::to_string(i); diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 25bec5276e729..6dd24df921dc1 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -17,6 +17,7 @@ #include #include #include + #include #include #include @@ -36,6 +37,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/common_table.h" #include "paddle/fluid/distributed/ps/table/graph/class_macro.h" @@ -670,4 +672,4 @@ struct hash { return s.idx ^ s.node_key ^ s.sample_size; } }; -} +} // namespace std diff --git a/paddle/fluid/distributed/ps/table/common_table.h b/paddle/fluid/distributed/ps/table/common_table.h index f69d9ccbf1453..280573f71947e 100644 --- a/paddle/fluid/distributed/ps/table/common_table.h +++ b/paddle/fluid/distributed/ps/table/common_table.h @@ -19,9 +19,8 @@ #include // NOLINT #include -#include "paddle/fluid/distributed/ps/table/table.h" - #include "paddle/fluid/distributed/common/utils.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc index ef7311824faa6..254bbb96cad62 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/ctr_accessor.h" + #include + #include "glog/logging.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h index 327c4cea760eb..96ec5b8398d13 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h @@ -15,7 +15,9 @@ #pragma once #include #include + #include + #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc index 4b84b7e8c36c3..2bde5271a0c43 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/ctr_double_accessor.h" + #include + #include "glog/logging.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h index 5b781b2621c5b..3134b46960409 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h @@ -15,7 +15,9 @@ #pragma once #include #include + #include + #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc index 68f28640fc69e..6fb6675edde8d 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" + #include + #include "glog/logging.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h index 6a9f5d28f5e59..c4bcd2bb3c98a 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h @@ -15,7 +15,9 @@ #pragma once #include #include + #include + #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" diff --git a/paddle/fluid/distributed/ps/table/depends/dense.h b/paddle/fluid/distributed/ps/table/depends/dense.h index aea757e8d5959..5e7c1cd438de8 100644 --- a/paddle/fluid/distributed/ps/table/depends/dense.h +++ b/paddle/fluid/distributed/ps/table/depends/dense.h @@ -15,13 +15,14 @@ #pragma once #include // for sqrt in CPU and CUDA + #include #include #include #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/distributed/common/utils.h" namespace paddle { diff --git a/paddle/fluid/distributed/ps/table/depends/feature_value.h b/paddle/fluid/distributed/ps/table/depends/feature_value.h index 36dc34808bd27..e6ab278787d47 100644 --- a/paddle/fluid/distributed/ps/table/depends/feature_value.h +++ b/paddle/fluid/distributed/ps/table/depends/feature_value.h @@ -14,10 +14,10 @@ #pragma once +#include #include -#include "gflags/gflags.h" -#include +#include "gflags/gflags.h" #include "paddle/fluid/distributed/common/chunk_allocator.h" namespace paddle { diff --git a/paddle/fluid/distributed/ps/table/depends/geo_recorder.h b/paddle/fluid/distributed/ps/table/depends/geo_recorder.h index adab0ee344bca..99530f72b1f74 100644 --- a/paddle/fluid/distributed/ps/table/depends/geo_recorder.h +++ b/paddle/fluid/distributed/ps/table/depends/geo_recorder.h @@ -15,6 +15,7 @@ #pragma once #include + #include // NOLINT #include #include diff --git a/paddle/fluid/distributed/ps/table/depends/initializers.h b/paddle/fluid/distributed/ps/table/depends/initializers.h index f46e659a88bab..7c707feacecc5 100644 --- a/paddle/fluid/distributed/ps/table/depends/initializers.h +++ b/paddle/fluid/distributed/ps/table/depends/initializers.h @@ -20,10 +20,9 @@ #include #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/framework/generator.h" - #include "paddle/fluid/operators/truncated_gaussian_random_op.h" namespace paddle { diff --git a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h index 223c8fafd26ab..4ae3aa7459a17 100644 --- a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h +++ b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h @@ -20,6 +20,7 @@ #include #include #include + #include #include @@ -153,5 +154,5 @@ class RocksDBHandler { std::vector _handles; rocksdb::DB* _db; }; -} // distributed -} // paddle +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc index 004a536e8e56c..f2f346232d326 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/graph/graph_edge.h" + #include namespace paddle { namespace distributed { @@ -25,5 +26,5 @@ void WeightedGraphEdgeBlob::add_edge(int64_t id, float weight = 1) { id_arr.push_back(id); weight_arr.push_back(weight); } -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.h b/paddle/fluid/distributed/ps/table/graph/graph_edge.h index 5fc785fe25682..6b929af679e50 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.h @@ -43,5 +43,5 @@ class WeightedGraphEdgeBlob : public GraphEdgeBlob { protected: std::vector weight_arr; }; -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.cc b/paddle/fluid/distributed/ps/table/graph/graph_node.cc index 366e607261f0c..d966bd6965364 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" + #include namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h index c6c594036d4fc..13fdcf4c64e62 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc index 8186acec1be3d..4f5c86db3142b 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h" + #include #include #include + #include "paddle/fluid/framework/generator.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h index c10617022decb..cf83d27d7a2fd 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/table/graph/graph_edge.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/memory_dense_table.h b/paddle/fluid/distributed/ps/table/memory_dense_table.h index 73653fbc2eb57..87a3f8661ae93 100644 --- a/paddle/fluid/distributed/ps/table/memory_dense_table.h +++ b/paddle/fluid/distributed/ps/table/memory_dense_table.h @@ -17,7 +17,9 @@ #include #include #include + #include + #include "Eigen/Dense" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/common_table.h" diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h index 60ba5d9602e44..bce9c774f1203 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h @@ -17,6 +17,7 @@ #include // #include #include + #include #include // NOLINT #include diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index ee6a801fa9183..464f788b454e8 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -12,15 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" + #include -#include -#include "paddle/fluid/distributed/common/cost_timer.h" -#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" -#include "paddle/fluid/framework/io/fs.h" +#include #include "boost/lexical_cast.hpp" #include "glog/logging.h" +#include "paddle/fluid/distributed/common/cost_timer.h" +#include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/platform/enforce.h" DEFINE_bool(pserver_print_missed_key_num_every_push, false, @@ -272,9 +273,8 @@ int32_t MemorySparseTable::Save(const std::string& dirname, if (_value_accesor->Save(it.value().data(), save_param)) { std::string format_value = _value_accesor->ParseToString( it.value().data(), it.value().size()); - if (0 != - write_channel->write_line(paddle::string::format_string( - "%lu %s", it.key(), format_value.c_str()))) { + if (0 != write_channel->write_line(paddle::string::format_string( + "%lu %s", it.key(), format_value.c_str()))) { ++retry_num; is_write_failed = true; LOG(ERROR) diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h index 6516c75a5d696..7b7a47ff998b1 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h @@ -17,12 +17,14 @@ #include #include #include + #include #include // NOLINT #include #include #include #include + #include "Eigen/Dense" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/common_table.h" diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.cc b/paddle/fluid/distributed/ps/table/sparse_accessor.cc index bc537880f1c21..772ff5d1fc5cc 100644 --- a/paddle/fluid/distributed/ps/table/sparse_accessor.cc +++ b/paddle/fluid/distributed/ps/table/sparse_accessor.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/sparse_accessor.h" + #include + #include "glog/logging.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.h b/paddle/fluid/distributed/ps/table/sparse_accessor.h index 875904847b2ea..5e76365901c27 100644 --- a/paddle/fluid/distributed/ps/table/sparse_accessor.h +++ b/paddle/fluid/distributed/ps/table/sparse_accessor.h @@ -15,7 +15,9 @@ #pragma once #include #include + #include + #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc index 8471b93612828..a9a4c9beae22c 100644 --- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc +++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" + #include + #include "glog/logging.h" DEFINE_bool(enable_show_scale_gradient, true, "enable show scale gradient"); diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h index 55a37b5941921..0f7766e20a326 100644 --- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h +++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h @@ -14,8 +14,10 @@ #pragma once #include + #include #include + #include "glog/logging.h" // for CHECK #include "paddle/fluid/distributed/common/local_random.h" // for local_uniform_real_distribution #include "paddle/fluid/distributed/common/registerer.h" diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index b1359d1323d89..7e1128baa0cd6 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/ssd_sparse_table.h" + #include "paddle/fluid/distributed/common/cost_timer.h" #include "paddle/fluid/distributed/common/local_random.h" #include "paddle/fluid/distributed/common/topk_calculator.h" @@ -362,9 +363,8 @@ int32_t SSDSparseTable::Save(const std::string& path, if (_value_accesor->Save(it.value().data(), save_param)) { std::string format_value = _value_accesor->ParseToString( it.value().data(), it.value().size()); - if (0 != - write_channel->write_line(paddle::string::format_string( - "%lu %s", it.key(), format_value.c_str()))) { + if (0 != write_channel->write_line(paddle::string::format_string( + "%lu %s", it.key(), format_value.c_str()))) { ++retry_num; is_write_failed = true; LOG(ERROR) << "SSDSparseTable save failed, retry it! path:" @@ -597,9 +597,8 @@ int32_t SSDSparseTable::SaveCache( while (shuffled_channel->Read(data)) { for (auto& t : data) { ++feasign_size; - if (0 != - write_channel->write_line(paddle::string::format_string( - "%lu %s", t.first, t.second.c_str()))) { + if (0 != write_channel->write_line(paddle::string::format_string( + "%lu %s", t.first, t.second.c_str()))) { LOG(ERROR) << "Cache Table save failed, " "path:" << channel_config.path << ", retry it!"; diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index ef2eb3a746f66..cfa286f1c3f7f 100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -16,13 +16,11 @@ #include "glog/logging.h" #include "paddle/fluid/distributed/common/registerer.h" - #include "paddle/fluid/distributed/ps/table/common_graph_table.h" -#include "paddle/fluid/distributed/ps/table/memory_dense_table.h" - #include "paddle/fluid/distributed/ps/table/ctr_accessor.h" #include "paddle/fluid/distributed/ps/table/ctr_double_accessor.h" #include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include "paddle/fluid/distributed/ps/table/memory_dense_table.h" #include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h" #include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" #include "paddle/fluid/distributed/ps/table/sparse_accessor.h" diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h index 48fda782d489f..0c56b48a246d2 100644 --- a/paddle/fluid/distributed/ps/table/table.h +++ b/paddle/fluid/distributed/ps/table/table.h @@ -15,11 +15,13 @@ #pragma once #include + #include #include // NOLINT #include #include #include + #include "paddle/fluid/distributed/common/afs_warpper.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h" diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.cc b/paddle/fluid/distributed/ps/table/tensor_accessor.cc index 5d1f69b7463da..880583f36842d 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.cc +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/tensor_accessor.h" + #include "Eigen/Dense" namespace paddle { diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.h b/paddle/fluid/distributed/ps/table/tensor_accessor.h index fad31d5df7f47..a5225127534a0 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.h +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include #include diff --git a/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt index 6279b6aa95412..8b5457ef9eea5 100644 --- a/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt @@ -1,9 +1,18 @@ - get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) -set_source_files_properties(fleet.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(fleet - SRCS fleet.cc - DEPS framework_proto ps_framework_proto ps_service variable_helper scope op_registry fs shell ${RPC_DEPS}) +set_source_files_properties(fleet.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library( + fleet + SRCS fleet.cc + DEPS framework_proto + ps_framework_proto + ps_service + variable_helper + scope + op_registry + fs + shell + ${RPC_DEPS}) target_link_libraries(fleet z) diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index 955ba75e672d1..b9754d7b9debb 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" + #include #include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #include "paddle/fluid/distributed/ps/table/table.h" -#include "paddle/fluid/distributed/ps/wrapper/fleet.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h index ce109b63cce9c..f88c478724b8b 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.h +++ b/paddle/fluid/distributed/ps/wrapper/fleet.h @@ -49,8 +49,8 @@ class PSCore; using framework::LoDTensor; using framework::Scope; -using phi::SelectedRows; using framework::Variable; +using phi::SelectedRows; using RpcCtxMap = std::unordered_map; diff --git a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h old mode 100755 new mode 100644 index ca02ad31195ef..0156c0b42db05 --- a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h +++ b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h @@ -49,8 +49,8 @@ class PSCore; using framework::LoDTensor; using framework::Scope; -using phi::SelectedRows; using framework::Variable; +using phi::SelectedRows; using RpcCtxMap = std::unordered_map; diff --git a/paddle/fluid/distributed/store/CMakeLists.txt b/paddle/fluid/distributed/store/CMakeLists.txt index 1fde447d97dd9..cfab4aad5f795 100644 --- a/paddle/fluid/distributed/store/CMakeLists.txt +++ b/paddle/fluid/distributed/store/CMakeLists.txt @@ -1 +1,4 @@ -cc_library(tcp_store SRCS tcp_store.cc tcp_utils.cc DEPS enforce glog) +cc_library( + tcp_store + SRCS tcp_store.cc tcp_utils.cc + DEPS enforce glog) diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc index ec6f0e26a08fa..a46b4b32c9f18 100644 --- a/paddle/fluid/distributed/store/tcp_store.cc +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/distributed/store/tcp_store.h" + #include #include #include -#include "paddle/fluid/distributed/store/tcp_store.h" #include "paddle/fluid/distributed/store/tcp_utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/flags.h" diff --git a/paddle/fluid/distributed/store/tcp_utils.cc b/paddle/fluid/distributed/store/tcp_utils.cc index a28cba288333d..466cd11fa5d3d 100644 --- a/paddle/fluid/distributed/store/tcp_utils.cc +++ b/paddle/fluid/distributed/store/tcp_utils.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/distributed/store/tcp_utils.h" + #include #include #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -51,12 +53,13 @@ ::addrinfo* get_addr_info(const std::string host, const std::string port, int n; n = ::getaddrinfo(node, port_cstr, &hints, &res); const char* gai_err = ::gai_strerror(n); - const char* proto = - (family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : ""); - PADDLE_ENFORCE_EQ( - n, 0, platform::errors::InvalidArgument( - "%s network %s:%s cannot be obtained. Details: %s.", proto, - host, port, gai_err)); + const char* proto = (family == AF_INET ? "IPv4" + : family == AF_INET6 ? "IPv6" + : ""); + PADDLE_ENFORCE_EQ(n, 0, + platform::errors::InvalidArgument( + "%s network %s:%s cannot be obtained. Details: %s.", + proto, host, port, gai_err)); return res; } @@ -79,10 +82,11 @@ SocketType tcp_connect(const std::string host, const std::string port, do { for (::addrinfo* cur = res; cur != nullptr; cur = cur->ai_next) { sockfd = ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); - PADDLE_ENFORCE_GT(sockfd, 0, platform::errors::InvalidArgument( - "Create socket to connect %s:%s failed. " - "Details: %s. ", - host, port, socket_error().message())); + PADDLE_ENFORCE_GT(sockfd, 0, + platform::errors::InvalidArgument( + "Create socket to connect %s:%s failed. " + "Details: %s. ", + host, port, socket_error().message())); if (::connect(sockfd, cur->ai_addr, cur->ai_addrlen) == 0) { retry = false; diff --git a/paddle/fluid/distributed/store/tcp_utils.h b/paddle/fluid/distributed/store/tcp_utils.h index 60cb3de124da3..ec9f610a18c17 100644 --- a/paddle/fluid/distributed/store/tcp_utils.h +++ b/paddle/fluid/distributed/store/tcp_utils.h @@ -29,6 +29,7 @@ #include #include #include + #include "paddle/fluid/platform/enforce.h" // Utility functions for TCP socket. @@ -73,9 +74,10 @@ void send_bytes(SocketType socket, const T* buffer, size_t len) { while (to_send > 0) { auto byte_sent = ::send(socket, ptr, to_send, 0); - PADDLE_ENFORCE_GT(byte_sent, 0, platform::errors::InvalidArgument( - "TCP send error. Details: %s.", - socket_error().message())); + PADDLE_ENFORCE_GT( + byte_sent, 0, + platform::errors::InvalidArgument("TCP send error. Details: %s.", + socket_error().message())); to_send -= byte_sent; ptr += byte_sent; } @@ -91,9 +93,10 @@ void receive_bytes(SocketType socket, T* buffer, size_t len) { while (to_recv > 0) { auto byte_received = ::recv(socket, ptr, to_recv, 0); - PADDLE_ENFORCE_GT(byte_received, 0, platform::errors::InvalidArgument( - "TCP receive error. Details: %s.", - socket_error().message())); + PADDLE_ENFORCE_GT( + byte_received, 0, + platform::errors::InvalidArgument("TCP receive error. Details: %s.", + socket_error().message())); to_recv -= byte_received; ptr += byte_received; diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 9f339d7ee2c08..9b7a304b0a92a 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -1,46 +1,144 @@ -set_source_files_properties(table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor -ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) +set_source_files_properties( + table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + table_test + SRCS table_test.cc + DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS} + ${RPC_DEPS}) -set_source_files_properties(dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table -tensor_accessor ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) +set_source_files_properties( + dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + dense_table_test + SRCS dense_table_test.cc + DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS} + ${RPC_DEPS}) -set_source_files_properties(barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(barrier_table_test SRCS barrier_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties( + barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + barrier_table_test + SRCS barrier_table_test.cc + DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) -set_source_files_properties(brpc_service_dense_sgd_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(brpc_service_dense_sgd_test SRCS brpc_service_dense_sgd_test.cc DEPS scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties( + brpc_service_dense_sgd_test.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + brpc_service_dense_sgd_test + SRCS brpc_service_dense_sgd_test.cc + DEPS scope + server + client + communicator + ps_service + boost + table + ps_framework_proto + ${COMMON_DEPS}) -set_source_files_properties(brpc_service_sparse_sgd_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(brpc_service_sparse_sgd_test SRCS brpc_service_sparse_sgd_test.cc DEPS scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties( + brpc_service_sparse_sgd_test.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + brpc_service_sparse_sgd_test + SRCS brpc_service_sparse_sgd_test.cc + DEPS scope + server + client + communicator + ps_service + boost + table + ps_framework_proto + ${COMMON_DEPS}) -set_source_files_properties(brpc_utils_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_function ${COMMON_DEPS} ${RPC_DEPS}) +set_source_files_properties( + brpc_utils_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + brpc_utils_test + SRCS brpc_utils_test.cc + DEPS brpc_utils scope math_function ${COMMON_DEPS} ${RPC_DEPS}) -set_source_files_properties(graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties( + graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + graph_node_test + SRCS graph_node_test.cc + DEPS graph_py_service + scope + server + client + communicator + ps_service + boost + table + ps_framework_proto + ${COMMON_DEPS}) -set_source_files_properties(graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(graph_node_split_test SRCS graph_node_split_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties( + graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + graph_node_split_test + SRCS graph_node_split_test.cc + DEPS graph_py_service + scope + server + client + communicator + ps_service + boost + table + ps_framework_proto + ${COMMON_DEPS}) -set_source_files_properties(graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(graph_table_sample_test SRCS graph_table_sample_test.cc DEPS table ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties( + graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + graph_table_sample_test + SRCS graph_table_sample_test.cc + DEPS table ps_framework_proto ${COMMON_DEPS}) -set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table) +set_source_files_properties( + feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + feature_value_test + SRCS feature_value_test.cc + DEPS ${COMMON_DEPS} boost table) -set_source_files_properties(sparse_sgd_rule_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(sparse_sgd_rule_test SRCS sparse_sgd_rule_test.cc DEPS ${COMMON_DEPS} boost table) +set_source_files_properties( + sparse_sgd_rule_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + sparse_sgd_rule_test + SRCS sparse_sgd_rule_test.cc + DEPS ${COMMON_DEPS} boost table) -set_source_files_properties(ctr_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(ctr_accessor_test SRCS ctr_accessor_test.cc DEPS ${COMMON_DEPS} boost table) -set_source_files_properties(ctr_dymf_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(ctr_dymf_accessor_test SRCS ctr_dymf_accessor_test.cc DEPS ${COMMON_DEPS} boost table) +set_source_files_properties( + ctr_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + ctr_accessor_test + SRCS ctr_accessor_test.cc + DEPS ${COMMON_DEPS} boost table) +set_source_files_properties( + ctr_dymf_accessor_test.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + ctr_dymf_accessor_test + SRCS ctr_dymf_accessor_test.cc + DEPS ${COMMON_DEPS} boost table) +set_source_files_properties( + memory_sparse_table_test.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + memory_sparse_table_test + SRCS memory_sparse_table_test.cc + DEPS ${COMMON_DEPS} boost table) -set_source_files_properties(memory_sparse_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(memory_sparse_table_test SRCS memory_sparse_table_test.cc DEPS ${COMMON_DEPS} boost table) - -set_source_files_properties(memory_geo_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(memory_sparse_geo_table_test SRCS memory_geo_table_test.cc DEPS ${COMMON_DEPS} boost table) +set_source_files_properties( + memory_geo_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + memory_sparse_geo_table_test + SRCS memory_geo_table_test.cc + DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/barrier_table_test.cc b/paddle/fluid/distributed/test/barrier_table_test.cc index c4c5b22992804..f540939c6fd8f 100644 --- a/paddle/fluid/distributed/test/barrier_table_test.cc +++ b/paddle/fluid/distributed/test/barrier_table_test.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include + #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/common_table.h" diff --git a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc index f9d57be95affe..c1467dae9a7e2 100644 --- a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include // NOLINT diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc index 29195d9985728..bade56f239f65 100644 --- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include // NOLINT diff --git a/paddle/fluid/distributed/test/brpc_utils_test.cc b/paddle/fluid/distributed/test/brpc_utils_test.cc index 16ff9bd75840b..33367bf16b72e 100644 --- a/paddle/fluid/distributed/test/brpc_utils_test.cc +++ b/paddle/fluid/distributed/test/brpc_utils_test.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/distributed/ps/service/brpc_utils.h" + #include #include "gtest/gtest.h" - -#include "paddle/fluid/distributed/ps/service/brpc_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/distributed/test/ctr_accessor_test.cc b/paddle/fluid/distributed/test/ctr_accessor_test.cc index 27b6ddf722b70..51254391a4283 100644 --- a/paddle/fluid/distributed/test/ctr_accessor_test.cc +++ b/paddle/fluid/distributed/test/ctr_accessor_test.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/distributed/ps/table/ctr_accessor.h" + #include #include + #include "gtest/gtest.h" #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" diff --git a/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc b/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc index f6e773a414c7f..fbf179dbeeef0 100644 --- a/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc +++ b/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" + #include #include + #include "gtest/gtest.h" #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" diff --git a/paddle/fluid/distributed/test/dense_table_test.cc b/paddle/fluid/distributed/test/dense_table_test.cc index 9529c776c120e..185d9d3aed1d4 100644 --- a/paddle/fluid/distributed/test/dense_table_test.cc +++ b/paddle/fluid/distributed/test/dense_table_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include + #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/memory_dense_table.h" diff --git a/paddle/fluid/distributed/test/feature_value_test.cc b/paddle/fluid/distributed/test/feature_value_test.cc index 32e3944d35a1c..6e848c3e2f4e4 100644 --- a/paddle/fluid/distributed/test/feature_value_test.cc +++ b/paddle/fluid/distributed/test/feature_value_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" + #include + #include "gtest/gtest.h" namespace paddle { diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index 395d7c1eace82..fa9b89d75c83c 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include // NOLINT #include #include @@ -17,8 +18,8 @@ limitations under the License. */ #include // NOLINT #include #include -#include "google/protobuf/text_format.h" +#include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 3b43c2779ee4e..9cb244a9ec430 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -9,7 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" + #include + #include // NOLINT #include #include @@ -17,8 +20,8 @@ limitations under the License. */ #include // NOLINT #include #include -#include "google/protobuf/text_format.h" +#include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" @@ -30,7 +33,6 @@ limitations under the License. */ #include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" #include "paddle/fluid/distributed/ps/service/ps_service/service.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" -#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc index d7f6f2f34d77a..a3463162d276c 100644 --- a/paddle/fluid/distributed/test/graph_table_sample_test.cc +++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc @@ -13,6 +13,8 @@ // limitations under the License. #include + +#include #include // NOLINT #include #include @@ -20,9 +22,8 @@ #include // NOLINT #include #include -#include "google/protobuf/text_format.h" -#include +#include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" diff --git a/paddle/fluid/distributed/test/memory_geo_table_test.cc b/paddle/fluid/distributed/test/memory_geo_table_test.cc index ca3b51fade177..507211e69fa0f 100644 --- a/paddle/fluid/distributed/test/memory_geo_table_test.cc +++ b/paddle/fluid/distributed/test/memory_geo_table_test.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include - #include + #include #include // NOLINT diff --git a/paddle/fluid/distributed/test/memory_sparse_table_test.cc b/paddle/fluid/distributed/test/memory_sparse_table_test.cc index 68bc50373ffad..1689b7716bbc4 100644 --- a/paddle/fluid/distributed/test/memory_sparse_table_test.cc +++ b/paddle/fluid/distributed/test/memory_sparse_table_test.cc @@ -12,16 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" +#include #include + #include #include // NOLINT #include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" #include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { diff --git a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc index 1a4e16b926619..3a9a8d0b39ccd 100644 --- a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc +++ b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" + #include #include + #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" diff --git a/paddle/fluid/distributed/test/table_test.cc b/paddle/fluid/distributed/test/table_test.cc index 4f73519ef5e69..56809abad0c7c 100644 --- a/paddle/fluid/distributed/test/table_test.cc +++ b/paddle/fluid/distributed/test/table_test.cc @@ -30,4 +30,4 @@ TEST(Table, Initialize) { ASSERT_EQ(ret, -1); } } // namespace distributed -} // // namespace paddle +} // namespace paddle diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 11c98e5da9dde..73d8539329a75 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,29 +1,82 @@ -set(eager_deps phi_api phi_dygraph_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta eager_nan_inf_utils grad_node_info grad_tensor_holder accumulation_node custom_operator_node) +set(eager_deps + phi_api + phi_dygraph_api + hook_utils + tensor_utils + utils + global_utils + backward + phi_tensor + tracer + layer + autograd_meta + eager_nan_inf_utils + grad_node_info + grad_tensor_holder + accumulation_node + custom_operator_node) -set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) -set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node) +set(fluid_deps + tracer + layer + proto_desc + operator + op_registry + variable_helper + memcpy) +set(generated_deps final_dygraph_function final_dygraph_node dygraph_function + dygraph_node) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - message("Performing Eager Dygraph Auto Code Generation") - add_subdirectory(auto_code_generator) + message("Performing Eager Dygraph Auto Code Generation") + add_subdirectory(auto_code_generator) endif() add_subdirectory(api) add_subdirectory(accumulation) add_subdirectory(custom_operator) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - add_subdirectory(pylayer) - cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) - add_dependencies(grad_tensor_holder eager_final_state_codegen) - cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info switch_autotune) + add_subdirectory(pylayer) + cc_library( + grad_tensor_holder + SRCS grad_tensor_holder.cc + DEPS grad_node_info gradient_accumulator) + add_dependencies(grad_tensor_holder eager_final_state_codegen) + cc_library( + backward + SRCS backward.cc + DEPS grad_tensor_holder utils autograd_meta grad_node_info switch_autotune) endif() -cc_library(eager_nan_inf_utils SRCS nan_inf_utils.cc DEPS phi_tensor nan_inf_utils enforce) -cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) +cc_library( + eager_nan_inf_utils + SRCS nan_inf_utils.cc + DEPS phi_tensor nan_inf_utils enforce) +cc_library( + grad_node_info + SRCS grad_node_info.cc + DEPS phi_api phi_tensor) -cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi_api phi_tensor) -cc_library(utils SRCS utils.cc DEPS phi_api phi_tensor global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) +cc_library( + autograd_meta + SRCS autograd_meta.cc + DEPS phi_api phi_tensor) +cc_library( + utils + SRCS utils.cc + DEPS phi_api + phi_tensor + global_utils + layer + proto_desc + operator + op_registry + variable_helper + memcpy + scale_op + autograd_meta + hook_utils) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - add_subdirectory(tests) + add_subdirectory(tests) endif() diff --git a/paddle/fluid/eager/accumulation/CMakeLists.txt b/paddle/fluid/eager/accumulation/CMakeLists.txt index 0531aa5aab373..297e853947dfb 100644 --- a/paddle/fluid/eager/accumulation/CMakeLists.txt +++ b/paddle/fluid/eager/accumulation/CMakeLists.txt @@ -1 +1,4 @@ -cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulator phi_api grad_node_info) +cc_library( + accumulation_node + SRCS accumulation_node.cc + DEPS gradient_accumulator phi_api grad_node_info) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 544e7c8fe85d6..09db68399f332 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -13,17 +13,15 @@ // limitations under the License. #include "paddle/fluid/eager/accumulation/accumulation_node.h" + +#include "glog/logging.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/gradient_accumulator.h" - -#include "paddle/phi/api/all.h" -#include "paddle/phi/core/dense_tensor.h" - #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" - -#include "glog/logging.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/core/dense_tensor.h" namespace egr { @@ -72,8 +70,7 @@ paddle::small_vector, GradNodeAccumulation::operator()( paddle::small_vector, kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph, - bool is_new_grad) { + bool create_graph, bool is_new_grad) { VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 6374534578cb8..7694e290bab95 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -41,8 +41,7 @@ class GradNodeAccumulation : public GradNodeBase { kSlotSmallVectorSize> operator()(paddle::small_vector, kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph = false, - bool is_new_grad = false) override; + bool create_graph = false, bool is_new_grad = false) override; void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } diff --git a/paddle/fluid/eager/amp_utils.h b/paddle/fluid/eager/amp_utils.h index 2145f4a11965c..2834f7d5dc0b9 100644 --- a/paddle/fluid/eager/amp_utils.h +++ b/paddle/fluid/eager/amp_utils.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/imperative/amp_auto_cast.h" diff --git a/paddle/fluid/eager/api/CMakeLists.txt b/paddle/fluid/eager/api/CMakeLists.txt index 4c241fd5b721c..4525a58a44d48 100644 --- a/paddle/fluid/eager/api/CMakeLists.txt +++ b/paddle/fluid/eager/api/CMakeLists.txt @@ -1,4 +1,7 @@ add_subdirectory(utils) add_subdirectory(generated) -cc_library(eager_api SRCS all.cc DEPS tensor_utils hook_utils global_utils eager_scale) +cc_library( + eager_api + SRCS all.cc + DEPS tensor_utils hook_utils global_utils eager_scale) diff --git a/paddle/fluid/eager/api/generated/CMakeLists.txt b/paddle/fluid/eager/api/generated/CMakeLists.txt index 4f634c6884b45..3f6bb90d69baa 100644 --- a/paddle/fluid/eager/api/generated/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/CMakeLists.txt @@ -1,5 +1,5 @@ add_subdirectory(eager_generated) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - add_subdirectory(fluid_generated) + add_subdirectory(fluid_generated) endif() diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt index 81ff07b8963f9..f704d2a49184b 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt @@ -1,6 +1,12 @@ -cc_library(scale_node SRCS scale_node.cc DEPS global_utils phi phi_api grad_node_info) +cc_library( + scale_node + SRCS scale_node.cc + DEPS global_utils phi phi_api grad_node_info) if(NOT (NOT WITH_PYTHON AND ON_INFER)) -cc_library(final_dygraph_node SRCS nodes.cc DEPS ${eager_deps}) -add_dependencies(final_dygraph_node eager_final_state_codegen) + cc_library( + final_dygraph_node + SRCS nodes.cc + DEPS ${eager_deps}) + add_dependencies(final_dygraph_node eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 38f67cb5bdf2a..5adceb7e79af1 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -13,16 +13,14 @@ // limitations under the License. #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" + +#include "glog/logging.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/eager_tensor.h" - -#include "paddle/phi/kernels/scale_kernel.h" - #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" - -#include "glog/logging.h" +#include "paddle/phi/kernels/scale_kernel.h" namespace egr { @@ -147,8 +145,7 @@ paddle::small_vector, GradNodeScale::operator()( paddle::small_vector, kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph, - bool is_new_grad) { + bool create_graph, bool is_new_grad) { // 1. Check Output Size VLOG(6) << "grad size is: " << grads.size(); PADDLE_ENFORCE( diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index 04ff510944dd2..45872c97002aa 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -42,8 +42,7 @@ class GradNodeScale : public GradNodeBase { kSlotSmallVectorSize> operator()(paddle::small_vector, kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph = false, - bool is_new_grad = false) override; + bool create_graph = false, bool is_new_grad = false) override; void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt index c70bb80c35c78..8d6df647999bd 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt @@ -1,6 +1,12 @@ -cc_library(eager_scale SRCS scale.cc DEPS phi_api phi autograd_meta scale_node) +cc_library( + eager_scale + SRCS scale.cc + DEPS phi_api phi autograd_meta scale_node) if(NOT (NOT WITH_PYTHON AND ON_INFER)) -cc_library(final_dygraph_function SRCS dygraph_functions.cc DEPS ${eager_deps}) -add_dependencies(final_dygraph_function eager_final_state_codegen) + cc_library( + final_dygraph_function + SRCS dygraph_functions.cc + DEPS ${eager_deps}) + add_dependencies(final_dygraph_function eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc index 7a374d567d5d0..836216d64b009 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc @@ -23,11 +23,11 @@ * **/ #include "paddle/fluid/eager/api/generated/eager_generated/forwards/scale.h" + #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/utils.h" - #include "paddle/phi/api/all.h" namespace egr { diff --git a/paddle/fluid/eager/api/utils/CMakeLists.txt b/paddle/fluid/eager/api/utils/CMakeLists.txt index a2a380ebad6c5..1fd4905605ea8 100644 --- a/paddle/fluid/eager/api/utils/CMakeLists.txt +++ b/paddle/fluid/eager/api/utils/CMakeLists.txt @@ -1,3 +1,12 @@ -cc_library(tensor_utils SRCS tensor_utils.cc DEPS phi_api autograd_meta grad_node_info accumulation_node) -cc_library(hook_utils SRCS hook_utils.cc DEPS phi tensor_utils autograd_meta grad_node_info utils accumulation_node) -cc_library(global_utils SRCS global_utils.cc DEPS place tracer) +cc_library( + tensor_utils + SRCS tensor_utils.cc + DEPS phi_api autograd_meta grad_node_info accumulation_node) +cc_library( + hook_utils + SRCS hook_utils.cc + DEPS phi tensor_utils autograd_meta grad_node_info utils accumulation_node) +cc_library( + global_utils + SRCS global_utils.cc + DEPS place tracer) diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index 3c18efea20349..6a6a443f69333 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -17,6 +17,7 @@ #include #include + #include "paddle/fluid/eager/type_defs.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/api/ext/op_meta_info.h" @@ -73,8 +74,9 @@ class Controller { return op_meta_info_map_; } - void MergeOpMetaInfoMap(const std::unordered_map< - std::string, std::vector>& map) { + void MergeOpMetaInfoMap( + const std::unordered_map>& + map) { op_meta_info_map_.insert(map.begin(), map.end()); } diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index 8ee646b718c2f..6493135141f6a 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/eager/api/utils/hook_utils.h" + #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index 81ea92d1c3c48..84a9eb6dea6bb 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -13,17 +13,16 @@ // limitations under the License. #include "paddle/fluid/eager/api/utils/tensor_utils.h" + #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/utils.h" - -#include "paddle/phi/api/all.h" - #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/phi/api/all.h" namespace egr { namespace egr_utils_api { diff --git a/paddle/fluid/eager/api/utils/tensor_utils.h b/paddle/fluid/eager/api/utils/tensor_utils.h index ac6de72dbff39..158aa5c8d7dd0 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.h +++ b/paddle/fluid/eager/api/utils/tensor_utils.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/eager/eager_tensor.h" -#include "paddle/phi/api/all.h" +#include "paddle/phi/api/include/tensor.h" namespace egr { namespace egr_utils_api { diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index d673c64d9da3c..8c067074d6efd 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -1,99 +1,161 @@ add_subdirectory(final_state_generator) -set(EAGER_GENERETOR_DEPS ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag) +set(EAGER_GENERETOR_DEPS + ${GLOB_OP_LIB} + ${GLOB_OPERATOR_DEPS} + pybind + proto_desc + executor + layer + tracer + engine + imperative_profiler + imperative_flag) add_executable(eager_generator eager_generator.cc) target_link_libraries(eager_generator ${EAGER_GENERETOR_DEPS}) -get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) +get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(eager_generator ${os_dependency_modules}) if(WITH_ROCM) - target_link_libraries(eager_generator ${ROCM_HIPRTC_LIB}) + target_link_libraries(eager_generator ${ROCM_HIPRTC_LIB}) endif() # Prepare file structure -message("Generate dygraph file structure at path: ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/generated") -execute_process( - COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/generate_file_structures.py" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/" +message( + "Generate dygraph file structure at path: ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/generated" ) +execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/generate_file_structures.py" + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/") -set(tmp_dygraph_forward_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.tmp.h") -set(tmp_dygraph_forward_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions.tmp.cc") -set(tmp_dygraph_node_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.h") -set(tmp_dygraph_node_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.cc") -set(dygraph_forward_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h") -set(dygraph_forward_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions.cc") -set(dygraph_node_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h") -set(dygraph_node_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.cc") +set(tmp_dygraph_forward_h_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.tmp.h" +) +set(tmp_dygraph_forward_cc_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions.tmp.cc" +) +set(tmp_dygraph_node_h_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.h" +) +set(tmp_dygraph_node_cc_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.cc" +) +set(dygraph_forward_h_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" +) +set(dygraph_forward_cc_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions.cc" +) +set(dygraph_node_h_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h" +) +set(dygraph_node_cc_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.cc" +) if(WIN32) - set(EAGER_CODEGEN_DEPS eager_generator) - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}") - else() - set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") - endif() - - if(${CBLAS_PROVIDER} STREQUAL MKLML) - message("Copied libiomp5md.dll for Eager AutoCodeGen") - ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/libiomp5md.dll - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${eager_generator_path} - DEPENDS mklml) - list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/libiomp5md.dll) - else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) - message("Copied openblas.dll for Eager AutoCodeGen") - ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/openblas.dll - COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${eager_generator_path} - DEPENDS extern_openblas) - list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/openblas.dll) - endif() + set(EAGER_CODEGEN_DEPS eager_generator) + if("${CMAKE_GENERATOR}" STREQUAL "Ninja") + set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}") + else() + set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") + endif() + + if(${CBLAS_PROVIDER} STREQUAL MKLML) + message("Copied libiomp5md.dll for Eager AutoCodeGen") + add_custom_command( + OUTPUT ${eager_generator_path}/libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} + ${eager_generator_path} + DEPENDS mklml) + list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/libiomp5md.dll) + else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) + message("Copied openblas.dll for Eager AutoCodeGen") + add_custom_command( + OUTPUT ${eager_generator_path}/openblas.dll + COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} + ${eager_generator_path} + DEPENDS extern_openblas) + list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/openblas.dll) + endif() - if(WITH_MKLDNN) - message("Copied mkldnn.dll for Eager AutoCodeGen") - ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/mkldnn.dll - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${eager_generator_path} - DEPENDS mkldnn) - list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/mkldnn.dll) - endif() + if(WITH_MKLDNN) + message("Copied mkldnn.dll for Eager AutoCodeGen") + add_custom_command( + OUTPUT ${eager_generator_path}/mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} + ${eager_generator_path} + DEPENDS mkldnn) + list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/mkldnn.dll) + endif() - if(WITH_ONNXRUNTIME) - message("Copied onnxruntime for Eager AutoCodeGen") - ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/onnxruntime.dll - COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SHARED_LIB} ${eager_generator_path} - DEPENDS onnxruntime) - list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/onnxruntime.dll) - ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/paddle2onnx.dll - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_SHARED_LIB} ${eager_generator_path} - DEPENDS paddle2onnx) - list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/paddle2onnx.dll) - endif() + if(WITH_ONNXRUNTIME) + message("Copied onnxruntime for Eager AutoCodeGen") + add_custom_command( + OUTPUT ${eager_generator_path}/onnxruntime.dll + COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SHARED_LIB} + ${eager_generator_path} + DEPENDS onnxruntime) + list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/onnxruntime.dll) + add_custom_command( + OUTPUT ${eager_generator_path}/paddle2onnx.dll + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_SHARED_LIB} + ${eager_generator_path} + DEPENDS paddle2onnx) + list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/paddle2onnx.dll) + endif() - add_custom_target(eager_codegen - COMMAND "${eager_generator_path}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_h_path} ${dygraph_forward_h_path} - COMMENT "copy_if_different ${tmp_dygraph_forward_h_path} to ${dygraph_forward_h_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_cc_path} ${dygraph_forward_cc_path} - COMMENT "copy_if_different ${tmp_dygraph_forward_cc_path} to ${dygraph_forward_cc_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_h_path} ${dygraph_node_h_path} - COMMENT "copy_if_different ${tmp_dygraph_node_h_path} to ${dygraph_node_h_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_cc_path} ${dygraph_node_cc_path} - COMMENT "copy_if_different ${tmp_dygraph_node_cc_path} to ${dygraph_node_cc_path}" - DEPENDS ${EAGER_CODEGEN_DEPS} - VERBATIM) + add_custom_target( + eager_codegen + COMMAND + "${eager_generator_path}/eager_generator.exe" + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_h_path} + ${dygraph_forward_h_path} + COMMENT + "copy_if_different ${tmp_dygraph_forward_h_path} to ${dygraph_forward_h_path}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_cc_path} + ${dygraph_forward_cc_path} + COMMENT + "copy_if_different ${tmp_dygraph_forward_cc_path} to ${dygraph_forward_cc_path}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_h_path} + ${dygraph_node_h_path} + COMMENT + "copy_if_different ${tmp_dygraph_node_h_path} to ${dygraph_node_h_path}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_cc_path} + ${dygraph_node_cc_path} + COMMENT + "copy_if_different ${tmp_dygraph_node_cc_path} to ${dygraph_node_cc_path}" + DEPENDS ${EAGER_CODEGEN_DEPS} + VERBATIM) else() - add_custom_target(eager_codegen - COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${CMAKE_CURRENT_BINARY_DIR}/../../pybind" - "${CMAKE_CURRENT_BINARY_DIR}/eager_generator" - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_h_path} ${dygraph_forward_h_path} - COMMENT "copy_if_different ${tmp_dygraph_forward_h_path} to ${dygraph_forward_h_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_cc_path} ${dygraph_forward_cc_path} - COMMENT "copy_if_different ${tmp_dygraph_forward_cc_path} to ${dygraph_forward_cc_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_h_path} ${dygraph_node_h_path} - COMMENT "copy_if_different ${tmp_dygraph_node_h_path} to ${dygraph_node_h_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_cc_path} ${dygraph_node_cc_path} - COMMENT "copy_if_different ${tmp_dygraph_node_cc_path} to ${dygraph_node_cc_path}" - DEPENDS eager_generator - VERBATIM) + add_custom_target( + eager_codegen + COMMAND + ${CMAKE_COMMAND} -E env + "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${CMAKE_CURRENT_BINARY_DIR}/../../pybind" + "${CMAKE_CURRENT_BINARY_DIR}/eager_generator" + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_h_path} + ${dygraph_forward_h_path} + COMMENT + "copy_if_different ${tmp_dygraph_forward_h_path} to ${dygraph_forward_h_path}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_cc_path} + ${dygraph_forward_cc_path} + COMMENT + "copy_if_different ${tmp_dygraph_forward_cc_path} to ${dygraph_forward_cc_path}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_h_path} + ${dygraph_node_h_path} + COMMENT + "copy_if_different ${tmp_dygraph_node_h_path} to ${dygraph_node_h_path}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_cc_path} + ${dygraph_node_cc_path} + COMMENT + "copy_if_different ${tmp_dygraph_node_cc_path} to ${dygraph_node_cc_path}" + DEPENDS eager_generator + VERBATIM) endif() diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index 50dab6ce840a5..06668fa736570 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -1,39 +1,72 @@ -set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/new_api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml") -set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/new_backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml") -set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc") -set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h") -set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc") -set(tmp_nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.h") -set(forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.cc") -set(forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h") -set(nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.cc") -set(nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h") +set(api_yaml_path + "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/new_api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml" +) +set(backward_yaml_path + "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/new_backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml" +) +set(tmp_forwards_cc_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc" +) +set(tmp_forwards_h_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h" +) +set(tmp_nodes_cc_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc" +) +set(tmp_nodes_h_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.h" +) +set(forwards_cc_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.cc" +) +set(forwards_h_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +) +set(nodes_cc_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.cc" +) +set(nodes_h_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +) # StringTensor only needs forward api -set(fwd_api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api.yaml") +set(fwd_api_yaml_path + "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api.yaml") message("Final State Eager CodeGen") -add_custom_target(eager_final_state_codegen - COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py" - "--api_yaml_path=${api_yaml_path}" - "--backward_yaml_path=${backward_yaml_path}" - "--forwards_cc_path=${tmp_forwards_cc_path}" - "--forwards_h_path=${tmp_forwards_h_path}" - "--nodes_cc_path=${tmp_nodes_cc_path}" - "--nodes_h_path=${tmp_nodes_h_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_forwards_cc_path} ${forwards_cc_path} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_forwards_h_path} ${forwards_h_path} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_nodes_cc_path} ${nodes_cc_path} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_nodes_h_path} ${nodes_h_path} - VERBATIM -) +add_custom_target( + eager_final_state_codegen + COMMAND + "${PYTHON_EXECUTABLE}" + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py" + "--api_yaml_path=${api_yaml_path}" + "--backward_yaml_path=${backward_yaml_path}" + "--forwards_cc_path=${tmp_forwards_cc_path}" + "--forwards_h_path=${tmp_forwards_h_path}" + "--nodes_cc_path=${tmp_nodes_cc_path}" "--nodes_h_path=${tmp_nodes_h_path}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_forwards_cc_path} + ${forwards_cc_path} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_forwards_h_path} + ${forwards_h_path} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_nodes_cc_path} + ${nodes_cc_path} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_nodes_h_path} + ${nodes_h_path} + VERBATIM) -set(tmp_python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h") -set(python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function_impl.h") - -add_custom_target(eager_final_state_python_c_codegen - COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py" - "--api_yaml_path=${api_yaml_path},${fwd_api_yaml_path}" - "--output_path=${tmp_python_c_output_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_python_c_output_path} ${python_c_output_path} - VERBATIM +set(tmp_python_c_output_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h" +) +set(python_c_output_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function_impl.h" ) + +add_custom_target( + eager_final_state_python_c_codegen + COMMAND + "${PYTHON_EXECUTABLE}" + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py" + "--api_yaml_path=${api_yaml_path},${fwd_api_yaml_path}" + "--output_path=${tmp_python_c_output_path}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_python_c_output_path} + ${python_c_output_path} + VERBATIM) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 57681be58ae47..87b2ff986dc92 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -31,7 +31,8 @@ "leaky_relu_double_grad", "sqrt_double_grad", "rsqrt_double_grad", "square_double_grad", "celu_double_grad", "pad_double_grad", "pad3d_double_grad", "squeeze_double_grad", "unsqueeze_double_grad", - "conv3d_double_grad", "depthwise_conv2d_grad_grad" + "instance_norm_double_grad", "conv3d_double_grad", + "depthwise_conv2d_grad_grad" ]) # For API dispatch used at python-level diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index d8b909c3bacc1..d23d71b07626d 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -1404,7 +1404,7 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, const auto& out_metas = OutputMeta(); paddle::small_vector, egr::kSlotSmallVectorSize> returns({slot_num_bwd_outputs}); for (int i = 0; i < {slot_num_bwd_outputs}; ++i) {{ - returns[i].resize(out_metas[i].size()); + out_metas[i].size() == 0 ? returns[i].resize(1) : returns[i].resize(out_metas[i].size()); }} """ diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 63b899f6d6b62..36cfb4db1137a 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -13,27 +13,28 @@ // limitations under the License. #include "paddle/fluid/eager/backward.h" -#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/fluid/eager/utils.h" -#include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -#include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" namespace egr { /* -* GeneralGrad is Helpper class to implement custom grad operation between -* outputs and inputs. -* -* **/ + * GeneralGrad is Helpper class to implement custom grad operation between + * outputs and inputs. + * + * **/ class GeneralGrad { public: static GeneralGrad& Instance() { return *general_grad_; } @@ -100,19 +101,19 @@ class GeneralGrad { // make sure the path from root to target_node is ok std::unordered_set startup_ops; VLOG(6) << "Running in UpdateGraphInfo"; - std::queue queue; + std::deque queue; for (auto& target_nodes_inputmeta_pair : input_target_nodes_inputmeta_map_) { - queue.emplace(target_nodes_inputmeta_pair.first); + queue.push_back(target_nodes_inputmeta_pair.first); } while (!queue.empty()) { auto* target_node = queue.front(); - queue.pop(); + queue.pop_front(); if (!(depending_nodes_)[target_node].empty()) { auto precedding_nodes = (depending_nodes_)[target_node]; for (auto pre_nodes : precedding_nodes) { - queue.emplace(pre_nodes); + queue.push_back(pre_nodes); if (potential_stop_nodes_.find(pre_nodes) != potential_stop_nodes_.end()) { potential_stop_nodes_.erase(pre_nodes); @@ -144,20 +145,20 @@ class GeneralGrad { // Get Graph Info Betweent input target GradNode and outputs, // record depending_nodes_、potential_stop_nodes_、potential_startup_nodes_ - void GetGraphInfoBetweenTargets(const std::queue& init_queue) { + void GetGraphInfoBetweenTargets(const std::deque& init_queue) { VLOG(6) << "Runing In GetGraphInfoBetweenTargets"; // Calculate in_degree for each node std::unordered_map node_in_degree_map; // Copy nodes - std::queue queue = init_queue; + std::deque queue = init_queue; std::unordered_set visited; // Visit each node exactly once in any order while (!queue.empty()) { GradNodeBase* node = queue.front(); - queue.pop(); + queue.pop_front(); if (visited.count(node)) { continue; @@ -198,7 +199,7 @@ class GeneralGrad { // Record depending relationship (depending_nodes_)[next_node].emplace(node); - queue.push(next_node); + queue.push_back(next_node); } } } @@ -207,10 +208,10 @@ class GeneralGrad { UpdateGraphInfo(); } - void ModifyReadyQueue(std::queue* queue) { - std::queue tmp_queue; + void ModifyReadyQueue(std::deque* queue) { + std::deque tmp_queue; for (auto nodes : potential_startup_nodes_) { - tmp_queue.emplace(nodes); + tmp_queue.push_back(nodes); } tmp_queue.swap(*queue); } @@ -297,7 +298,7 @@ class GeneralGrad { void PreparedForGeneralGrad( const std::vector& inputs, const std::vector& no_grad_vars, - std::queue* queue, + std::deque* queue, const std::unordered_map>& node_input_buffers_dict) { @@ -366,14 +367,14 @@ class GeneralGrad { } void ReconstructBackwardGraph( - const std::queue& orig_init_queue) { - std::queue queue = orig_init_queue; + const std::deque& orig_init_queue) { + std::deque queue = orig_init_queue; std::unordered_set visited; // BFS and recursively copy the grad nodes while (!queue.empty()) { GradNodeBase* orig_node = queue.front(); - queue.pop(); + queue.pop_front(); if (visited.count(orig_node)) { continue; } @@ -417,7 +418,7 @@ class GeneralGrad { copied_edge.SetGradNode(copied_next_node); // Update BFS queue - queue.push(orig_next_node.get()); + queue.push_back(orig_next_node.get()); } } } @@ -449,20 +450,20 @@ class GeneralGrad { }; std::unordered_map getInDegreeMap( - const std::queue& init_queue) { + const std::deque& init_queue) { // Calculate in_degree for each node // We can completely remove this pass, if in_degree were set during forward // pass std::unordered_map node_in_degree_map; // Copy nodes - std::queue queue = init_queue; + std::deque queue = init_queue; std::unordered_set visited; // Visit each node exactly once in any order while (!queue.empty()) { GradNodeBase* node = queue.front(); - queue.pop(); + queue.pop_front(); if (visited.count(node)) { continue; @@ -490,7 +491,7 @@ std::unordered_map getInDegreeMap( if (!node_in_degree_map.count(next_node)) node_in_degree_map[next_node] = 0; node_in_degree_map[next_node]++; - queue.push(next_node); + queue.push_back(next_node); } } } @@ -548,8 +549,8 @@ std::vector RunBackward( /* --- Initialization --- */ // 1. Init queue with starting nodes // 2. Prepare initial input buffers - std::queue queue; - std::queue orig_queue; + std::deque queue; + std::deque orig_queue; std::unordered_map> node_input_buffers_dict; for (size_t i = 0; i < tensors.size(); i++) { @@ -582,7 +583,7 @@ std::vector RunBackward( GradNodeBase* grad_node = shared_grad_node.get(); if (is_general_grad) { // Save orig grad node - orig_queue.push(grad_node); + orig_queue.push_back(grad_node); // Replace grad_node with copied grad_node grad_node = GeneralGrad::Instance().CopyGradNode(shared_grad_node); @@ -625,7 +626,7 @@ std::vector RunBackward( } // Prepare queue, potential startup_nodes - queue.push(grad_node); + queue.push_back(grad_node); } if (is_general_grad) { @@ -663,10 +664,10 @@ std::vector RunBackward( paddle::platform::TracerEventType::Operator, 1); if (queue.size() > 1 && node_in_degree_map[node] != 0) { - queue.pop(); + queue.pop_front(); continue; } - queue.pop(); + queue.pop_front(); // Run node: This is where Hook happens auto node_input_buffer_iter = node_input_buffers_dict.find(node); @@ -798,11 +799,19 @@ std::vector RunBackward( bool is_potential_stop_node = GeneralGrad::Instance().GetPotentialStopNodes()->count(next_node); if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) { - queue.emplace(std::move(next_node)); + if (dynamic_cast(next_node)) { + queue.push_front(std::move(next_node)); + } else { + queue.push_back(std::move(next_node)); + } } } else { if (node_in_degree_map[next_node] == 0) { - queue.emplace(std::move(next_node)); + if (dynamic_cast(next_node)) { + queue.push_front(std::move(next_node)); + } else { + queue.push_back(std::move(next_node)); + } } } } diff --git a/paddle/fluid/eager/custom_operator/CMakeLists.txt b/paddle/fluid/eager/custom_operator/CMakeLists.txt index ccc9a03a55660..424194557dd84 100644 --- a/paddle/fluid/eager/custom_operator/CMakeLists.txt +++ b/paddle/fluid/eager/custom_operator/CMakeLists.txt @@ -1 +1,4 @@ -cc_library(custom_operator_node SRCS custom_operator_node.cc DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info) +cc_library( + custom_operator_node + SRCS custom_operator_node.cc + DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info) diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index abdd8cadeed4c..3efcf3b21a4e3 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/eager/custom_operator/custom_operator_node.h" + #include "paddle/fluid/framework/custom_operator.h" #include "paddle/fluid/framework/op_meta_info_helper.h" #include "paddle/fluid/platform/profiler/event_tracing.h" diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index af387bb3238d1..71ccb072ce917 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -13,27 +13,24 @@ // limitations under the License. #include "paddle/fluid/eager/grad_node_info.h" + +#include "glog/logging.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/utils.h" - -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" - #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/var_type.h" - #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" - -#include "glog/logging.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" /** * Implementation of GradNodeBase, Edge and GradTensorHolder. -**/ + **/ namespace egr { static void CheckTensor(const paddle::experimental::Tensor& pre, diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 747e98b846616..9070ac9e5b652 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -179,14 +179,13 @@ class GradNodeBase { kSlotSmallVectorSize> operator()(paddle::small_vector, kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph = false, - bool is_new_grad = false) = 0; + bool create_graph = false, bool is_new_grad = false) = 0; virtual void ClearTensorWrappers() = 0; /** - * Self-Copy interface designed for use in DoubleGrad - * **/ + * Self-Copy interface designed for use in DoubleGrad + * **/ virtual std::shared_ptr Copy() const = 0; // adj_edges were moved inside OutputMeta(), so no available direct access @@ -230,8 +229,8 @@ class GradNodeBase { std::shared_ptr&& hook); /** - * Remove GradientHook - * **/ + * Remove GradientHook + * **/ bool RemoveGradientHook(const int64_t& hook_id) { auto remove_cnt = gradient_hooks_.erase(hook_id); if (remove_cnt == 0) { @@ -252,8 +251,8 @@ class GradNodeBase { kSlotSmallVectorSize>& tensors); /** - * Handle Complex - Real Type Promotion - * **/ + * Handle Complex - Real Type Promotion + * **/ void HandleComplexGradToRealGrad( paddle::small_vector, kSlotSmallVectorSize>* out_grads); @@ -262,8 +261,8 @@ class GradNodeBase { virtual std::string name() { return "GradNodeBase"; } /** - * The following interfaces are designed for no_need_buffer - * **/ + * The following interfaces are designed for no_need_buffer + * **/ bool IsTensorWrappersCleared() { return is_tensor_wrappers_cleared_; } void SetIsTensorWrappersCleared(bool is_tensor_wrappers_cleared) { diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 64fb8b53b473c..6abf759cdba7a 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -13,11 +13,11 @@ // limitations under the License. #include "paddle/fluid/eager/grad_tensor_holder.h" -#include "paddle/fluid/imperative/gradient_accumulator.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/imperative/gradient_accumulator.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace egr { diff --git a/paddle/fluid/eager/hooks.h b/paddle/fluid/eager/hooks.h index 097150cf5ed59..a98b3d9f8e4df 100644 --- a/paddle/fluid/eager/hooks.h +++ b/paddle/fluid/eager/hooks.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/phi/api/include/tensor.h" namespace egr { diff --git a/paddle/fluid/eager/pylayer/CMakeLists.txt b/paddle/fluid/eager/pylayer/CMakeLists.txt index 59030342eccad..4b0ad071117bc 100644 --- a/paddle/fluid/eager/pylayer/CMakeLists.txt +++ b/paddle/fluid/eager/pylayer/CMakeLists.txt @@ -1 +1,4 @@ -cc_library(py_layer_node SRCS py_layer_node.cc DEPS pybind phi_api grad_node_info) +cc_library( + py_layer_node + SRCS py_layer_node.cc + DEPS pybind phi_api grad_node_info) diff --git a/paddle/fluid/eager/pylayer/py_layer_node.cc b/paddle/fluid/eager/pylayer/py_layer_node.cc index a00b292fe0915..ec17a324b1ec9 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.cc +++ b/paddle/fluid/eager/pylayer/py_layer_node.cc @@ -13,18 +13,16 @@ // limitations under the License. #include "paddle/fluid/eager/pylayer/py_layer_node.h" -#include "paddle/fluid/eager/eager_tensor.h" - -#include "paddle/phi/api/all.h" -#include "paddle/phi/core/dense_tensor.h" +#include "glog/logging.h" +#include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" - -#include "glog/logging.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/core/dense_tensor.h" #pragma GCC diagnostic ignored "-Wattributes" #include "pybind11/pytypes.h" @@ -34,8 +32,7 @@ paddle::small_vector, GradNodePyLayer::operator()( paddle::small_vector, kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph, - bool is_new_grad) { + bool create_graph, bool is_new_grad) { VLOG(3) << "Running Eager Backward Node: " << name(); paddle::small_vector, diff --git a/paddle/fluid/eager/pylayer/py_layer_node.h b/paddle/fluid/eager/pylayer/py_layer_node.h index c1a8c6e626b4f..998480bbfebfa 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.h +++ b/paddle/fluid/eager/pylayer/py_layer_node.h @@ -38,8 +38,7 @@ class GradNodePyLayer : public GradNodeBase { kSlotSmallVectorSize> operator()(paddle::small_vector, kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph = false, - bool is_new_grad = false) override; + bool create_graph = false, bool is_new_grad = false) override; void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } diff --git a/paddle/fluid/eager/tests/data_structure_tests/CMakeLists.txt b/paddle/fluid/eager/tests/data_structure_tests/CMakeLists.txt index 76c59561fc0bb..90159e9b8c32e 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/data_structure_tests/CMakeLists.txt @@ -1,9 +1,27 @@ -cc_test(test_egr_ds_eager_tensor SRCS eager_tensor_test.cc DEPS ${eager_deps}) -cc_test(test_egr_ds_auotgrad_meta SRCS autograd_meta_test.cc DEPS ${eager_deps}) -cc_test(test_egr_ds_grad_node_info SRCS grad_node_info_test.cc DEPS ${eager_deps}) -cc_test(test_egr_ds_accumulation_node SRCS accumulation_node_test.cc DEPS ${eager_deps}) -cc_test(test_egr_ds_tensor_wrapper SRCS tensor_wrapper_test.cc DEPS ${eager_deps}) +cc_test( + test_egr_ds_eager_tensor + SRCS eager_tensor_test.cc + DEPS ${eager_deps}) +cc_test( + test_egr_ds_auotgrad_meta + SRCS autograd_meta_test.cc + DEPS ${eager_deps}) +cc_test( + test_egr_ds_grad_node_info + SRCS grad_node_info_test.cc + DEPS ${eager_deps}) +cc_test( + test_egr_ds_accumulation_node + SRCS accumulation_node_test.cc + DEPS ${eager_deps}) +cc_test( + test_egr_ds_tensor_wrapper + SRCS tensor_wrapper_test.cc + DEPS ${eager_deps}) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - cc_test(test_egr_ds_grad_tensor_holder SRCS grad_tensor_holder_test.cc DEPS ${eager_deps} ${generated_deps}) + cc_test( + test_egr_ds_grad_tensor_holder + SRCS grad_tensor_holder_test.cc + DEPS ${eager_deps} ${generated_deps}) endif() diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index c159084d683e8..c53ffe823abba 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/eager/accumulation/accumulation_node.h" + #include #include "gtest/gtest.h" - -#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" diff --git a/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc b/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc index 48b4b9c57487a..f7415dd1f713d 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/eager/autograd_meta.h" + #include "glog/logging.h" #include "gtest/gtest.h" - -#include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index edbb441f27a08..a82965303af14 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/eager/eager_tensor.h" + #include "glog/logging.h" #include "gtest/gtest.h" - -#include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/var_helper.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/common/layout.h" @@ -35,7 +35,7 @@ class AutogradMetaTest : public AbstractAutogradMeta { explicit AutogradMetaTest(int val) : val_(val) {} int val_ = 0; }; -} +} // namespace eager_test TEST(Tensor, Constructor) { paddle::experimental::Tensor et1 = paddle::experimental::Tensor(); paddle::experimental::Tensor et2 = paddle::experimental::Tensor("et2"); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index 6687b6621ad54..63a4a72b631d6 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/eager/grad_node_info.h" + #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" -#include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" #include "paddle/phi/api/lib/utils/allocator.h" @@ -85,8 +85,8 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { CHECK_EQ(grad_test_node2->OutputMeta()[0].size(), size_t(1)); VLOG(6) << "Test Gradient Hook"; - auto gradient_hook = []( - const paddle::experimental::Tensor& et) -> paddle::experimental::Tensor { + auto gradient_hook = [](const paddle::experimental::Tensor& et) + -> paddle::experimental::Tensor { paddle::experimental::Tensor res; phi::DenseTensorMeta meta = phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index a00e629d1029a..eb9bd6007bf8a 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -14,7 +14,6 @@ #pragma once #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" @@ -35,8 +34,7 @@ class GradTestNode : public egr::GradNodeBase { egr::kSlotSmallVectorSize> operator()(paddle::small_vector, egr::kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph = false, - bool is_new_grad = false) override { + bool create_graph = false, bool is_new_grad = false) override { val_ = std::dynamic_pointer_cast(grads[0][0].impl()) ->data()[0]; phi::DenseTensorMeta meta = diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 0fe349294b438..17f593e24905d 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -12,17 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/eager/grad_tensor_holder.h" + #include #include "gtest/gtest.h" - #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" -#include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/core/selected_rows.h" - #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/selected_rows.h" PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc index 28c3472f90d03..8813f364840e0 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/eager/tensor_wrapper.h" + #include "glog/logging.h" #include "gtest/gtest.h" - -#include "paddle/fluid/eager/tensor_wrapper.h" #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" #include "paddle/fluid/eager/utils.h" diff --git a/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt b/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt index 516789cbb8cf7..7b6dfae729f38 100644 --- a/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt @@ -1,7 +1,29 @@ -cc_library(performance_benchmark_utils SRCS benchmark_utils.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node scale_op matmul_v2_op dygraph_function) +cc_library( + performance_benchmark_utils + SRCS benchmark_utils.cc + DEPS ${eager_deps} + ${fluid_deps} + ${generated_deps} + eager_scale + scale_node + scale_op + matmul_v2_op + dygraph_function) -cc_test(test_egr_performance_benchmark_eager_cpu SRCS benchmark_eager_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) -cc_test(test_egr_performance_benchmark_fluid_cpu SRCS benchmark_fluid_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) +cc_test( + test_egr_performance_benchmark_eager_cpu + SRCS benchmark_eager_cpu.cc + DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) +cc_test( + test_egr_performance_benchmark_fluid_cpu + SRCS benchmark_fluid_cpu.cc + DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) -cc_test(test_egr_performance_benchmark_eager_cuda SRCS benchmark_eager_cuda.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) -cc_test(test_egr_performance_benchmark_fluid_cuda SRCS benchmark_fluid_cuda.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) +cc_test( + test_egr_performance_benchmark_eager_cuda + SRCS benchmark_eager_cuda.cc + DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) +cc_test( + test_egr_performance_benchmark_fluid_cuda + SRCS benchmark_fluid_cuda.cc + DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 056c7102f663b..3b0e6a3fdb6e1 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -15,19 +15,17 @@ // Eager Dygraph #include + #include #include "gtest/gtest.h" -#include "paddle/fluid/platform/flags.h" - #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" - -#include "paddle/fluid/imperative/tracer.h" - #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/platform/flags.h" #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 287d6e770dea2..5dd5cde548fc0 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -14,19 +14,17 @@ // Eager Dygraph #include + #include #include "gtest/gtest.h" -#include "paddle/fluid/platform/flags.h" - #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" - -#include "paddle/fluid/imperative/tracer.h" - #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/platform/flags.h" #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index b4b47a85f6666..bf1d955b9000f 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -23,7 +23,6 @@ #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/basic_engine.h" diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index d9afd7cc96523..0cd33a72e1a9a 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -23,7 +23,6 @@ #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/basic_engine.h" diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h index 86bf13707ed40..5b37e973f1dc6 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/phi/api/all.h" diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index 719ef6673c07d..2f57489999ff8 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -1,14 +1,47 @@ -cc_test(test_egr_task_tensor_utils SRCS tensor_utils_test.cc DEPS ${eager_deps}) -cc_test(test_egr_task_eager_utils SRCS eager_utils_test.cc DEPS ${eager_deps}) -cc_test(test_egr_task_forward_autograd SRCS forward_autograd_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) -cc_test(test_egr_task_nan_inf_utils SRCS nan_inf_utils_test.cc DEPS eager_nan_inf_utils) +cc_test( + test_egr_task_tensor_utils + SRCS tensor_utils_test.cc + DEPS ${eager_deps}) +cc_test( + test_egr_task_eager_utils + SRCS eager_utils_test.cc + DEPS ${eager_deps}) +cc_test( + test_egr_task_forward_autograd + SRCS forward_autograd_test.cc + DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) +cc_test( + test_egr_task_nan_inf_utils + SRCS nan_inf_utils_test.cc + DEPS eager_nan_inf_utils) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node) - cc_test(test_egr_task_backward SRCS backward_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node) - cc_test(test_egr_task_grad SRCS grad_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node) - cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node) - cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node) - cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node) - cc_test(test_egr_task_autocodegen SRCS generated_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps}) + cc_test( + test_egr_task_hook + SRCS hook_test.cc + DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node) + cc_test( + test_egr_task_backward + SRCS backward_test.cc + DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node) + cc_test( + test_egr_task_grad + SRCS grad_test.cc + DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node) + cc_test( + test_egr_task_fwd_bwd_joint + SRCS fwd_bwd_joint_test.cc + DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node) + cc_test( + test_egr_task_cross_batch + SRCS cross_batch_accumulation_test.cc + DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node) + cc_test( + test_egr_task_hook_intermidiate + SRCS hook_test_intermidiate.cc + DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node) + cc_test( + test_egr_task_autocodegen + SRCS generated_test.cc + DEPS ${eager_deps} ${fluid_deps} ${generated_deps}) endif() diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 7552ad83fa20f..c6d4514fa8e33 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -12,25 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/eager/backward.h" + #include #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" -#include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tests/test_utils.h" - -#include "paddle/fluid/eager/api/all.h" - #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/tensor_meta.h" - #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 4337c0d092ca0..847c082a30173 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -16,22 +16,17 @@ #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" - -#include "paddle/fluid/eager/api/all.h" - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/tensor_meta.h" - #include "paddle/fluid/eager/tests/test_utils.h" - +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index 551262d259e08..e4ca8dd164b8f 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -15,14 +15,12 @@ #include #include "gtest/gtest.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/eager/utils.h" - #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc index 4cb316380aade..ebf396bebfab0 100644 --- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc +++ b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc @@ -16,18 +16,15 @@ #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tests/test_utils.h" - #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/tensor_meta.h" - #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 1f8fdb7de0c17..a4da315f44a7a 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -16,21 +16,17 @@ #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/tensor_meta.h" - #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" - +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 3c237b76e64b0..b53cdf55d4306 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -17,17 +17,14 @@ #include #include "gtest/gtest.h" - #include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" -#include "paddle/fluid/eager/utils.h" - #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/fluid/eager/utils.h" #include "paddle/fluid/imperative/tracer.h" - -#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/grad_test.cc b/paddle/fluid/eager/tests/task_tests/grad_test.cc index 72a94b40ed753..8d6c4d7843fb2 100644 --- a/paddle/fluid/eager/tests/task_tests/grad_test.cc +++ b/paddle/fluid/eager/tests/task_tests/grad_test.cc @@ -16,17 +16,14 @@ #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tests/test_utils.h" - -#include "paddle/fluid/eager/api/all.h" - #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index d7b887b28bde8..badbe87159785 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -16,22 +16,17 @@ #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" - -#include "paddle/fluid/eager/api/all.h" - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/tensor_meta.h" - #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" - +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index c4d4ff9110682..dbe2c13894566 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -15,16 +15,14 @@ #include #include "gtest/gtest.h" - #include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/core/dense_tensor.h" - -#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" -#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc b/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc index be0563fbeedb4..73d213f71148f 100644 --- a/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/eager/nan_inf_utils.h" + #include #include #include #include "gtest/gtest.h" - -#include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/api/include/api.h" diff --git a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc index 24e5da060111f..aeddeb6fae7f2 100644 --- a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc @@ -12,17 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/eager/api/utils/tensor_utils.h" + #include #include "gtest/gtest.h" - -#include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/phi/api/lib/utils/allocator.h" - #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/test_utils.h b/paddle/fluid/eager/tests/test_utils.h index 47bfe9a7cabd5..cb1e531d82d63 100644 --- a/paddle/fluid/eager/tests/test_utils.h +++ b/paddle/fluid/eager/tests/test_utils.h @@ -18,14 +18,12 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/utils.h" - -#include "paddle/phi/api/all.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/tensor_meta.h" - #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/tensor_meta.h" namespace eager_test { diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 5a730e4dbf164..3254b3bf89262 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -17,7 +17,6 @@ #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tensor_wrapper.h" - #include "paddle/fluid/operators/run_program_op.h" #include "paddle/fluid/platform/enforce.h" @@ -273,7 +272,7 @@ inline void RunProgramGradAPI( const paddle::framework::AttributeMap &attrs, std::vector &x_grad, // NOLINT std::vector ¶ms_grad // NOLINT - ) { +) { // if all output vars are set to stop_gradient, grad op no need to executed if (x_grad.empty() && params_grad.empty()) return; @@ -368,8 +367,7 @@ class GradNodeRunProgram : public egr::GradNodeBase { egr::kSlotSmallVectorSize> operator()(paddle::small_vector, egr::kSlotSmallVectorSize> &grads, // NOLINT - bool create_graph, - bool is_new_grad) override { + bool create_graph, bool is_new_grad) override { VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; paddle::small_vector, egr::kSlotSmallVectorSize> diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 7d9554c52eb6c..4d7d1aa2d8a3d 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -27,7 +27,7 @@ #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/variable.h" -PADDLE_DEFINE_EXPORTED_bool(retain_grad_for_all_tensor, true, +PADDLE_DEFINE_EXPORTED_bool(retain_grad_for_all_tensor, false, "retain grad for all tensor"); namespace egr { diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index c6389e998315c..783afcc1e2c73 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -18,7 +18,6 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" - #include "paddle/phi/api/all.h" namespace egr { @@ -161,10 +160,11 @@ class EagerUtils { if (require_any_grad && autograd_meta) { PADDLE_ENFORCE_EQ(!autograd_meta->StopGradient() && egr::egr_utils_api::IsLeafTensor(target), - false, paddle::platform::errors::InvalidArgument( - "Leaf Var (%s) that doesn't stop gradient " - "can't use inplace strategy.", - target.name())); + false, + paddle::platform::errors::InvalidArgument( + "Leaf Var (%s) that doesn't stop gradient " + "can't use inplace strategy.", + target.name())); } } @@ -234,8 +234,8 @@ class EagerUtils { const paddle::experimental::Tensor& tensor); /** - * Fill Zero - * **/ + * Fill Zero + * **/ static void FillZeroForEmptyOptionalGradInput( std::vector* in_grads, const std::vector& grad_in_metas); diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index bb7f3f26463d4..5402beb49e69d 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,22 +1,30 @@ - #windows treat symbolic file as a real file, which is different with unix #We create a hidden file and compile it instead of origin source file. function(windows_symbolic TARGET) set(oneValueArgs "") set(multiValueArgs SRCS PATH) - cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) foreach(src ${windows_symbolic_SRCS}) get_filename_component(src ${src} NAME_WE) - if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu) - message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") + if(NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu) + message( + FATAL + " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file." + ) endif() - file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) + file( + GENERATE + OUTPUT ${final_path}/.${src}.cu + INPUT ${final_path}/${src}.cc) - add_custom_command(OUTPUT ${final_path}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" - COMMENT "create hidden file of ${src}.cu") + add_custom_command( + OUTPUT ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" + "${final_path}/.${src}.cu" + COMMENT "create hidden file of ${src}.cu") add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu) endforeach() endfunction() @@ -26,7 +34,7 @@ add_subdirectory(details) add_subdirectory(fleet) add_subdirectory(io) add_subdirectory(new_executor) -if (WITH_CINN) +if(WITH_CINN) add_subdirectory(paddle2cinn) endif() #ddim lib @@ -34,420 +42,1101 @@ proto_library(framework_proto SRCS framework.proto) proto_library(pass_desc_proto SRCS pass_desc.proto DEPS framework_proto) proto_library(op_def_proto SRCS op_def.proto DEPS framework_proto) -cc_library(op_def_api SRCS op_def_api.cc DEPS op_def_proto boost) - -FILE(GLOB OP_DEF_FILES ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/compat/*.pbtxt) -FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt +cc_library( + op_def_api + SRCS op_def_api.cc + DEPS op_def_proto boost) + +file(GLOB OP_DEF_FILES + ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/compat/*.pbtxt) +file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "namespace { \n" "const std::unordered_map op_def_map = { \n") foreach(OP_DEF_FILE ${OP_DEF_FILES}) - FILE(READ ${OP_DEF_FILE} OP_DEF_CONTENT) - get_filename_component(OP_NAME ${OP_DEF_FILE} NAME_WE) - FILE(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt - "{\"${OP_NAME}\",R\"(${OP_DEF_CONTENT})\"},\n") + file(READ ${OP_DEF_FILE} OP_DEF_CONTENT) + get_filename_component(OP_NAME ${OP_DEF_FILE} NAME_WE) + file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt + "{\"${OP_NAME}\",R\"(${OP_DEF_CONTENT})\"},\n") endforeach(OP_DEF_FILE) -FILE(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "{\"\",\"\"}};\n}") +file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "{\"\",\"\"}};\n}") proto_library(heter_service_proto SRCS heter_service.proto) proto_library(data_feed_proto SRCS data_feed.proto) proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto - data_feed_proto) - -cc_library(string_array SRCS string_array.cc DEPS utf8proc) - -cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) -cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) + data_feed_proto) + +cc_library( + string_array + SRCS string_array.cc + DEPS utf8proc) + +cc_library( + data_type + SRCS data_type.cc + DEPS framework_proto ddim device_context) +cc_test( + data_type_test + SRCS data_type_test.cc + DEPS data_type place tensor) if(WITH_GPU) - if (WIN32) + if(WIN32) windows_symbolic(tensor_util SRCS tensor_util.cu) - nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context dense_tensor) + nv_library( + tensor + SRCS tensor.cc .tensor_util.cu + DEPS place memory data_type device_context dense_tensor) add_dependencies(tensor tensor_util) else() - nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler dense_tensor) + nv_library( + tensor + SRCS tensor.cc tensor_util.cu + DEPS place memory data_type device_context profiler dense_tensor) endif(WIN32) elseif(WITH_ROCM) - hip_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler dense_tensor) + hip_library( + tensor + SRCS tensor.cc tensor_util.cu + DEPS place memory data_type device_context profiler dense_tensor) else() - cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler dense_tensor) + cc_library( + tensor + SRCS tensor.cc tensor_util.cc + DEPS place memory data_type device_context profiler dense_tensor) endif() -cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) +cc_test( + tensor_test + SRCS tensor_test.cc + DEPS tensor) if(WITH_GPU) - nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor) + nv_test( + tensor_util_test + SRCS tensor_util_test.cc tensor_util_test.cu + DEPS tensor dlpack_tensor) elseif(WITH_ROCM) - hip_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor) + hip_test( + tensor_util_test + SRCS tensor_util_test.cc tensor_util_test.cu + DEPS tensor dlpack_tensor) else() - cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor dlpack_tensor) + cc_test( + tensor_util_test + SRCS tensor_util_test.cc + DEPS tensor dlpack_tensor) endif() -cc_test(copy_same_tensor_test SRCS copy_same_tensor_test.cc DEPS tensor) +cc_test( + copy_same_tensor_test + SRCS copy_same_tensor_test.cc + DEPS tensor) -cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) -cc_library(mixed_vector SRCS mixed_vector.cc DEPS device_context place memory) +cc_test( + eigen_test + SRCS eigen_test.cc + DEPS tensor) +cc_library( + mixed_vector + SRCS mixed_vector.cc + DEPS device_context place memory) if(WITH_GPU) - nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor) + nv_test( + mixed_vector_test + SRCS mixed_vector_test.cc mixed_vector_test.cu + DEPS mixed_vector place memory device_context tensor) elseif(WITH_ROCM) - hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor) + hip_test( + mixed_vector_test + SRCS mixed_vector_test.cc mixed_vector_test.cu + DEPS mixed_vector place memory device_context tensor) else() - cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS mixed_vector place memory device_context tensor) + cc_test( + mixed_vector_test + SRCS mixed_vector_test.cc + DEPS mixed_vector place memory device_context tensor) endif() -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim mixed_vector place tensor framework_proto version) +cc_library( + lod_tensor + SRCS lod_tensor.cc + DEPS ddim mixed_vector place tensor framework_proto version) -cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_utils lod_tensor memory) +cc_test( + lod_tensor_test + SRCS lod_tensor_test.cc + DEPS lod_utils lod_tensor memory) if(WITH_GPU) - nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) + nv_test( + lod_tensor_gpu_test + SRCS lod_tensor_test.cu + DEPS lod_tensor) elseif(WITH_ROCM) - hip_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) + hip_test( + lod_tensor_gpu_test + SRCS lod_tensor_test.cu + DEPS lod_tensor) endif() -cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog) - -cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) -cc_test(reader_test SRCS reader_test.cc DEPS reader) - -cc_library(threadpool SRCS threadpool.cc DEPS enforce) -cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) - -cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows_utils framework_proto scope) -if (WITH_GPU) +cc_library( + garbage_collector + SRCS garbage_collector.cc + DEPS device_context memory gflags glog) + +cc_library( + reader + SRCS reader.cc + DEPS lod_tensor ddim) +cc_test( + reader_test + SRCS reader_test.cc + DEPS reader) + +cc_library( + threadpool + SRCS threadpool.cc + DEPS enforce) +cc_test( + threadpool_test + SRCS threadpool_test.cc + DEPS threadpool) + +cc_library( + var_type_traits + SRCS var_type_traits.cc + DEPS lod_tensor selected_rows_utils framework_proto scope) +if(WITH_GPU) target_link_libraries(var_type_traits dynload_cuda) endif() -cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) +cc_test( + var_type_traits_test + SRCS var_type_traits_test.cc + DEPS var_type_traits) set(BRPC_DEPS "") if(WITH_PSCORE) - set(BRPC_DEPS brpc ssl crypto) + set(BRPC_DEPS brpc ssl crypto) endif() if(WITH_PSLIB) - if(WITH_PSLIB_BRPC) - set(BRPC_DEPS pslib_brpc) - elseif(NOT WITH_HETERPS) - set(BRPC_DEPS brpc ssl crypto) - endif() - if (WITH_ARM_BRPC) - set(BRPC_DEPS arm_brpc) - endif() + if(WITH_PSLIB_BRPC) + set(BRPC_DEPS pslib_brpc) + elseif(NOT WITH_HETERPS) + set(BRPC_DEPS brpc ssl crypto) + endif() + if(WITH_ARM_BRPC) + set(BRPC_DEPS arm_brpc) + endif() endif() -cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits) -cc_library(device_worker SRCS device_worker.cc DEPS trainer_desc_proto lod_tensor scope ${BRPC_DEPS}) -cc_test(device_worker_test SRCS device_worker_test.cc DEPS device_worker) - -cc_library(scope_pool SRCS scope_pool.cc DEPS scope) -cc_test(scope_test SRCS scope_test.cc DEPS scope) -cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits) - -cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) +cc_library( + scope + SRCS scope.cc + DEPS glog threadpool xxhash var_type_traits) +cc_library( + device_worker + SRCS device_worker.cc + DEPS trainer_desc_proto lod_tensor scope ${BRPC_DEPS}) +cc_test( + device_worker_test + SRCS device_worker_test.cc + DEPS device_worker) + +cc_library( + scope_pool + SRCS scope_pool.cc + DEPS scope) +cc_test( + scope_test + SRCS scope_test.cc + DEPS scope) +cc_test( + variable_test + SRCS variable_test.cc + DEPS tensor var_type_traits) + +cc_library( + data_device_transform + SRCS data_device_transform.cc + DEPS tensor) if(WITH_GPU) - nv_test(data_device_transform_test SRCS data_device_transform_test.cu - DEPS operator op_registry device_context math_function scope) + nv_test( + data_device_transform_test + SRCS data_device_transform_test.cu + DEPS operator op_registry device_context math_function scope) elseif(WITH_ROCM) - hip_test(data_device_transform_test SRCS data_device_transform_test.cu - DEPS operator op_registry device_context math_function scope) + hip_test( + data_device_transform_test + SRCS data_device_transform_test.cu + DEPS operator op_registry device_context math_function scope) endif() if(WITH_GPU) - if (WIN32) -#windows treat symbolic file as a real file, which is different with unix -#We create a hidden file and compile it instead of origin source file. - windows_symbolic(hidden_file SRCS data_type_transform.cu) - nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor) - add_dependencies(data_type_transform hidden_file) + if(WIN32) + #windows treat symbolic file as a real file, which is different with unix + #We create a hidden file and compile it instead of origin source file. + windows_symbolic(hidden_file SRCS data_type_transform.cu) + nv_library( + data_type_transform + SRCS .data_type_transform.cu + DEPS tensor) + add_dependencies(data_type_transform hidden_file) else() - nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) + nv_library( + data_type_transform + SRCS data_type_transform.cu + DEPS tensor) endif(WIN32) - nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform) + nv_test( + data_type_transform_test + SRCS data_type_transform_test.cc data_type_transform_test.cu + DEPS data_type_transform) elseif(WITH_ROCM) - hip_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) - hip_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform) + hip_library( + data_type_transform + SRCS data_type_transform.cu + DEPS tensor) + hip_test( + data_type_transform_test + SRCS data_type_transform_test.cc data_type_transform_test.cu + DEPS data_type_transform) else() - cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) - cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform) + cc_library( + data_type_transform + SRCS data_type_transform.cc + DEPS tensor) + cc_test( + data_type_transform_test + SRCS data_type_transform_test.cc + DEPS data_type_transform) endif() -cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function) -cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform) - -cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor - framework_proto selected_rows_utils data_device_transform data_type_transform data_layout_transform) - -cc_library(attribute SRCS attribute.cc DEPS framework_proto boost enforce) -cc_test(attribute_test SRCS attribute_test.cc DEPS attribute framework_proto proto_desc) -cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc -device_context) - -cc_library(op_version_proto SRCS op_version_proto.cc DEPS framework_proto boost) - -cc_library(op_version_registry SRCS op_version_registry.cc DEPS op_version_proto framework_proto boost) -cc_test(op_version_registry_test SRCS op_version_registry_test.cc DEPS op_version_registry) - -cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog) -cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) -cc_library(no_need_buffer_vars_inference SRCS no_need_buffer_vars_inference.cc DEPS attribute device_context) -cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto no_need_buffer_vars_inference) -cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) - -cc_test(no_need_buffer_vars_inference_test SRCS no_need_buffer_vars_inference_test.cc DEPS no_need_buffer_vars_inference layer) - -cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context) - -cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_vars_inference) - -cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place) - -IF(WITH_XPU) -cc_library(phi_utils SRCS phi_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info xpu_op_list) -ELSE() -cc_library(phi_utils SRCS phi_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info) -ENDIF() +cc_library( + data_layout_transform + SRCS data_layout_transform.cc + DEPS tensor math_function) +cc_test( + data_layout_transform_test + SRCS data_layout_transform_test.cc + DEPS data_layout_transform) + +cc_library( + data_transform + SRCS data_transform.cc + DEPS math_function + tensor + framework_proto + selected_rows_utils + data_device_transform + data_type_transform + data_layout_transform) + +cc_library( + attribute + SRCS attribute.cc + DEPS framework_proto boost enforce) +cc_test( + attribute_test + SRCS attribute_test.cc + DEPS attribute framework_proto proto_desc) +cc_test( + program_desc_test + SRCS program_desc_test.cc + DEPS proto_desc device_context) + +cc_library( + op_version_proto + SRCS op_version_proto.cc + DEPS framework_proto boost) + +cc_library( + op_version_registry + SRCS op_version_registry.cc + DEPS op_version_proto framework_proto boost) +cc_test( + op_version_registry_test + SRCS op_version_registry_test.cc + DEPS op_version_registry) + +cc_library( + op_proto_maker + SRCS op_proto_maker.cc + DEPS framework_proto attribute glog) +cc_test( + op_proto_maker_test + SRCS op_proto_maker_test.cc + DEPS op_proto_maker) +cc_library( + no_need_buffer_vars_inference + SRCS no_need_buffer_vars_inference.cc + DEPS attribute device_context) +cc_library( + op_info + SRCS op_info.cc + DEPS attribute framework_proto no_need_buffer_vars_inference) +cc_library( + shape_inference + SRCS shape_inference.cc + DEPS ddim attribute device_context) + +cc_test( + no_need_buffer_vars_inference_test + SRCS no_need_buffer_vars_inference_test.cc + DEPS no_need_buffer_vars_inference layer) + +cc_library( + transfer_scope_cache + SRCS transfer_scope_cache.cc + DEPS scope framework_proto device_context) + +cc_library( + unused_var_check + SRCS unused_var_check.cc + DEPS glog no_need_buffer_vars_inference) + +cc_library( + op_kernel_type + SRCS op_kernel_type.cc + DEPS device_context place) + +if(WITH_XPU) + cc_library( + phi_utils + SRCS phi_utils.cc + DEPS lod_tensor + selected_rows_utils + place + phi + var_type_traits + phi_api_utils + op_info + xpu_op_list) +else() + cc_library( + phi_utils + SRCS phi_utils.cc + DEPS lod_tensor + selected_rows_utils + place + phi + var_type_traits + phi_api_utils + op_info) +endif() -IF(WITH_XPU) -cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto - shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils - phi_utils kernel_factory infershape_utils op_utils) -ELSE() -cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto - shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils - phi_utils kernel_factory infershape_utils op_utils) -ENDIF() +if(WITH_XPU) + cc_library( + operator + SRCS operator.cc + DEPS xpu_op_list + op_info + device_context + tensor + scope + glog + trainer_desc_proto + data_feed_proto + shape_inference + data_transform + lod_tensor + profiler + transfer_scope_cache + op_kernel_type + op_call_stack + unused_var_check + nan_inf_utils + phi_utils + kernel_factory + infershape_utils + op_utils) +else() + cc_library( + operator + SRCS operator.cc + DEPS op_info + device_context + tensor + scope + glog + trainer_desc_proto + data_feed_proto + shape_inference + data_transform + lod_tensor + profiler + transfer_scope_cache + op_kernel_type + op_call_stack + unused_var_check + nan_inf_utils + phi_utils + kernel_factory + infershape_utils + op_utils) +endif() -cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) -cc_test(operator_exception_test SRCS operator_exception_test.cc DEPS operator op_registry device_context) +cc_test( + operator_test + SRCS operator_test.cc + DEPS operator op_registry device_context) +cc_test( + operator_exception_test + SRCS operator_exception_test.cc + DEPS operator op_registry device_context) cc_library(version SRCS version.cc) -cc_test(version_test SRCS version_test.cc DEPS version) - -cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc process_mesh_desc.cc DEPS attribute shape_inference op_info operator glog version) - -cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) - -cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce) -cc_test(op_call_stack_test SRCS op_call_stack_test.cc DEPS op_call_stack) - -cc_library(program_processing SRCS program_processing.cc DEPS boost proto_desc) -cc_test(program_processing_test SRCS program_processing_test.cc DEPS proto_desc program_processing) +cc_test( + version_test + SRCS version_test.cc + DEPS version) + +cc_library( + proto_desc + SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc process_mesh_desc.cc + DEPS attribute shape_inference op_info operator glog version) + +cc_library( + op_registry + SRCS op_registry.cc + DEPS op_proto_maker op_info operator glog proto_desc) + +cc_library( + op_call_stack + SRCS op_call_stack.cc + DEPS op_proto_maker enforce) +cc_test( + op_call_stack_test + SRCS op_call_stack_test.cc + DEPS op_call_stack) + +cc_library( + program_processing + SRCS program_processing.cc + DEPS boost proto_desc) +cc_test( + program_processing_test + SRCS program_processing_test.cc + DEPS proto_desc program_processing) if(WITH_GPU) - nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) + nv_test( + op_registry_test + SRCS op_registry_test.cc + DEPS op_registry) elseif(WITH_ROCM) - hip_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) + hip_test( + op_registry_test + SRCS op_registry_test.cc + DEPS op_registry) endif() if(WITH_PYTHON) py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto) - py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) + py_proto_compile(distributed_strategy_py_proto SRCS + distributed_strategy.proto) py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto) -#Generate an empty \ - #__init__.py to make framework_py_proto as a valid python module. - add_custom_target(fleet_proto_init ALL - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py - ) - add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto ps_py_proto ps_py_proto_init) - if (NOT WIN32) - add_custom_command(TARGET framework_py_proto POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto + #Generate an empty \ + #__init__.py to make framework_py_proto as a valid python module. + add_custom_target( + fleet_proto_init ALL + COMMAND ${CMAKE_COMMAND} -E make_directory + ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMAND + ${CMAKE_COMMAND} -E touch + ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py) + add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E + touch __init__.py) + add_dependencies( + framework_py_proto + framework_py_proto_init + trainer_py_proto + distributed_strategy_py_proto + fleet_proto_init + pass_desc_py_proto + ps_py_proto + ps_py_proto_init) + if(NOT WIN32) + add_custom_command( + TARGET framework_py_proto + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory + ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ - COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMAND cp distributed_strategy_*.py + ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto - COMMAND cp ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/fleet_executor/fleet_executor_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto.") + add_custom_target( + fleet_executor_proto_init ALL + DEPENDS fleet_proto_init fleet_executor_desc_py_proto + COMMAND + cp + ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/fleet_executor/fleet_executor_*.py + ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMENT + "Copy generated python proto into directory paddle/distributed/fleet/proto." + ) else(NOT WIN32) - string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/") - string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/") - add_custom_command(TARGET framework_py_proto POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND copy /Y *.py ${proto_dstpath} - COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} - COMMENT "Copy generated python proto into directory paddle/fluid/proto." - COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto." - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + string(REPLACE "/" "\\" proto_dstpath + "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/") + string( + REPLACE "/" "\\" fleet_proto_dstpath + "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/") + add_custom_command( + TARGET framework_py_proto + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory + ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto + COMMAND copy /Y *.py ${proto_dstpath} + COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} + COMMENT "Copy generated python proto into directory paddle/fluid/proto." + COMMENT + "Copy generated python proto into directory paddle/distributed/fleet/proto." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif(NOT WIN32) endif() -if (WITH_PSCORE) - add_custom_target(index_dataset_proto_init ALL DEPENDS fleet_proto_init index_dataset_py_proto - COMMAND cp ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/index_dataset/index_dataset_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto.") +if(WITH_PSCORE) + add_custom_target( + index_dataset_proto_init ALL + DEPENDS fleet_proto_init index_dataset_py_proto + COMMAND + cp + ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/index_dataset/index_dataset_*.py + ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMENT + "Copy generated python proto into directory paddle/distributed/fleet/proto." + ) endif(WITH_PSCORE) -cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) - -cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) -cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) - -if (TENSORRT_FOUND) -cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper tensorrt_engine_op) +cc_library( + lod_rank_table + SRCS lod_rank_table.cc + DEPS lod_tensor) + +cc_library( + feed_fetch_method + SRCS feed_fetch_method.cc + DEPS lod_tensor scope glog) +cc_library( + variable_helper + SRCS variable_helper.cc + DEPS lod_tensor) + +if(TENSORRT_FOUND) + cc_library( + naive_executor + SRCS naive_executor.cc + DEPS op_registry + denormal + device_context + scope + framework_proto + glog + lod_rank_table + feed_fetch_method + graph_to_program_pass + variable_helper + tensorrt_engine_op) else() -cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) + cc_library( + naive_executor + SRCS naive_executor.cc + DEPS op_registry + denormal + device_context + scope + framework_proto + glog + lod_rank_table + feed_fetch_method + graph_to_program_pass + variable_helper) endif(TENSORRT_FOUND) -cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector op_registry while_op_helper recurrent_op_helper conditional_block_op_helper) +cc_library( + executor_gc_helper + SRCS executor_gc_helper.cc + DEPS scope + proto_desc + operator + garbage_collector + op_registry + while_op_helper + recurrent_op_helper + conditional_block_op_helper) if(WITH_DISTRIBUTE) if(WITH_PSLIB) - cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc - dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc - heterxpu_trainer.cc - data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc - ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc data_feed.cu - pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry - device_context scope framework_proto trainer_desc_proto glog fs shell - fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper metrics lodtensor_printer - lod_rank_table feed_fetch_method collective_helper ${GLOB_DISTRIBUTE_DEPS} - graph_to_program_pass variable_helper data_feed_proto timer monitor - heter_service_proto fleet_executor ${BRPC_DEP}) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses") - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) - set(DISTRIBUTE_COMPILE_FLAGS - "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + cc_library( + executor + SRCS executor.cc + multi_trainer.cc + pipeline_trainer.cc + dataset_factory.cc + dist_multi_trainer.cc + trainer_factory.cc + trainer.cc + data_feed_factory.cc + heterxpu_trainer.cc + data_feed.cc + device_worker.cc + hogwild_worker.cc + hetercpu_worker.cc + ps_gpu_worker.cc + ps_gpu_trainer.cc + downpour_worker.cc + downpour_worker_opt.cc + data_feed.cu + pull_dense_worker.cc + section_worker.cc + device_worker_factory.cc + data_set.cc + DEPS op_registry + device_context + scope + framework_proto + trainer_desc_proto + glog + fs + shell + fleet_wrapper + heter_wrapper + ps_gpu_wrapper + box_wrapper + metrics + lodtensor_printer + lod_rank_table + feed_fetch_method + collective_helper + ${GLOB_DISTRIBUTE_DEPS} + graph_to_program_pass + variable_helper + data_feed_proto + timer + monitor + heter_service_proto + fleet_executor + ${BRPC_DEP}) + set(DISTRIBUTE_COMPILE_FLAGS + "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses" + ) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") endif() - set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) elseif(WITH_PSCORE) - cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc - dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc - heterxpu_trainer.cc heter_pipeline_trainer.cc - data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc - downpour_worker.cc downpour_lite_worker.cc downpour_worker_opt.cc data_feed.cu - pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry - device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog - index_sampler index_wrapper sampler index_dataset_proto - lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method - graph_to_program_pass variable_helper timer monitor heter_service_proto fleet heter_server brpc fleet_executor) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses") - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) - set(DISTRIBUTE_COMPILE_FLAGS - "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + cc_library( + executor + SRCS executor.cc + multi_trainer.cc + pipeline_trainer.cc + dataset_factory.cc + dist_multi_trainer.cc + trainer_factory.cc + trainer.cc + data_feed_factory.cc + heterxpu_trainer.cc + heter_pipeline_trainer.cc + data_feed.cc + device_worker.cc + hogwild_worker.cc + hetercpu_worker.cc + downpour_worker.cc + downpour_lite_worker.cc + downpour_worker_opt.cc + data_feed.cu + pull_dense_worker.cc + section_worker.cc + heter_section_worker.cc + device_worker_factory.cc + data_set.cc + DEPS op_registry + device_context + scope + framework_proto + data_feed_proto + heter_service_proto + trainer_desc_proto + glog + index_sampler + index_wrapper + sampler + index_dataset_proto + lod_rank_table + fs + shell + fleet_wrapper + heter_wrapper + box_wrapper + metrics + lodtensor_printer + feed_fetch_method + graph_to_program_pass + variable_helper + timer + monitor + heter_service_proto + fleet + heter_server + brpc + fleet_executor) + set(DISTRIBUTE_COMPILE_FLAGS + "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses" + ) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") endif() - set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(multi_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(hogwild_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(downpour_lite_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(heter_section_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(heter_pipeline_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + multi_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + hogwild_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + downpour_lite_worker.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + heter_section_worker.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + heter_pipeline_trainer.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) else() - cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc - dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc - heterxpu_trainer.cc - data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc - ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc data_feed.cu - pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry - device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog - lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method - graph_to_program_pass variable_helper timer monitor fleet_executor) + cc_library( + executor + SRCS executor.cc + multi_trainer.cc + pipeline_trainer.cc + dataset_factory.cc + dist_multi_trainer.cc + trainer_factory.cc + trainer.cc + data_feed_factory.cc + heterxpu_trainer.cc + data_feed.cc + device_worker.cc + hogwild_worker.cc + hetercpu_worker.cc + ps_gpu_worker.cc + ps_gpu_trainer.cc + downpour_worker.cc + downpour_worker_opt.cc + data_feed.cu + pull_dense_worker.cc + section_worker.cc + device_worker_factory.cc + data_set.cc + DEPS op_registry + device_context + scope + framework_proto + data_feed_proto + heter_service_proto + trainer_desc_proto + glog + lod_rank_table + fs + shell + fleet_wrapper + heter_wrapper + ps_gpu_wrapper + box_wrapper + metrics + lodtensor_printer + feed_fetch_method + graph_to_program_pass + variable_helper + timer + monitor + fleet_executor) endif() elseif(WITH_PSLIB) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) - set(DISTRIBUTE_COMPILE_FLAGS - "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + set(DISTRIBUTE_COMPILE_FLAGS + "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor" + ) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") endif() - set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc - dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc - heterxpu_trainer.cc - data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc - ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc data_feed.cu - pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry - device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog - lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method - graph_to_program_pass variable_helper timer monitor fleet_executor ${BRPC_DEP}) + set_source_files_properties( + executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_library( + executor + SRCS executor.cc + multi_trainer.cc + pipeline_trainer.cc + dataset_factory.cc + dist_multi_trainer.cc + trainer_factory.cc + trainer.cc + data_feed_factory.cc + heterxpu_trainer.cc + data_feed.cc + device_worker.cc + hogwild_worker.cc + hetercpu_worker.cc + ps_gpu_worker.cc + ps_gpu_trainer.cc + downpour_worker.cc + downpour_worker_opt.cc + data_feed.cu + pull_dense_worker.cc + section_worker.cc + device_worker_factory.cc + data_set.cc + DEPS op_registry + device_context + scope + framework_proto + data_feed_proto + heter_service_proto + trainer_desc_proto + glog + lod_rank_table + fs + shell + fleet_wrapper + heter_wrapper + ps_gpu_wrapper + box_wrapper + lodtensor_printer + feed_fetch_method + graph_to_program_pass + variable_helper + timer + monitor + fleet_executor + ${BRPC_DEP}) else() - cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc - dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc - heterxpu_trainer.cc - data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc - ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc data_feed.cu - pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry - device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog - lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method - graph_to_program_pass variable_helper timer monitor fleet_executor) + cc_library( + executor + SRCS executor.cc + multi_trainer.cc + pipeline_trainer.cc + dataset_factory.cc + dist_multi_trainer.cc + trainer_factory.cc + trainer.cc + data_feed_factory.cc + heterxpu_trainer.cc + data_feed.cc + device_worker.cc + hogwild_worker.cc + hetercpu_worker.cc + ps_gpu_worker.cc + ps_gpu_trainer.cc + downpour_worker.cc + downpour_worker_opt.cc + data_feed.cu + pull_dense_worker.cc + section_worker.cc + device_worker_factory.cc + data_set.cc + DEPS op_registry + device_context + scope + framework_proto + data_feed_proto + heter_service_proto + trainer_desc_proto + glog + lod_rank_table + fs + shell + fleet_wrapper + heter_wrapper + ps_gpu_wrapper + box_wrapper + lodtensor_printer + feed_fetch_method + graph_to_program_pass + variable_helper + timer + monitor + fleet_executor) endif() -target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper conditional_block_op_helper) - -cc_library(parallel_executor SRCS parallel_executor.cc DEPS - threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor - graph build_strategy bind_threaded_ssa_graph_executor collective_helper - fast_threaded_ssa_graph_executor variable_helper) - -cc_library(executor_cache SRCS executor_cache.cc DEPS parallel_executor) +target_link_libraries(executor while_op_helper executor_gc_helper + recurrent_op_helper conditional_block_op_helper) + +cc_library( + parallel_executor + SRCS parallel_executor.cc + DEPS threaded_ssa_graph_executor + scope_buffered_ssa_graph_executor + parallel_ssa_graph_executor + async_ssa_graph_executor + graph + build_strategy + bind_threaded_ssa_graph_executor + collective_helper + fast_threaded_ssa_graph_executor + variable_helper) + +cc_library( + executor_cache + SRCS executor_cache.cc + DEPS parallel_executor) if(WITH_PSCORE) - get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) - cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS - conditional_block_op executor gloo_wrapper ${RPC_DEPS}) - cc_test(heter_pipeline_trainer_test SRCS heter_pipeline_trainer_test.cc DEPS - conditional_block_op scale_op heter_listen_and_serv_op executor heter_server gloo_wrapper eigen_function ${RPC_DEPS}) + get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + cc_test( + dist_multi_trainer_test + SRCS dist_multi_trainer_test.cc + DEPS conditional_block_op executor gloo_wrapper ${RPC_DEPS}) + cc_test( + heter_pipeline_trainer_test + SRCS heter_pipeline_trainer_test.cc + DEPS conditional_block_op + scale_op + heter_listen_and_serv_op + executor + heter_server + gloo_wrapper + eigen_function + ${RPC_DEPS}) else() - cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS - conditional_block_op executor gloo_wrapper) + cc_test( + dist_multi_trainer_test + SRCS dist_multi_trainer_test.cc + DEPS conditional_block_op executor gloo_wrapper) endif() -cc_library(prune SRCS prune.cc DEPS framework_proto boost) -cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) -cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry - proto_desc) -cc_library(selected_rows_utils SRCS selected_rows_utils.cc DEPS selected_rows) -cc_test(selected_rows_utils_test SRCS selected_rows_utils_test.cc DEPS selected_rows_utils) - -cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type) +cc_library( + prune + SRCS prune.cc + DEPS framework_proto boost) +cc_test( + prune_test + SRCS prune_test.cc + DEPS op_info prune recurrent_op device_context) +cc_test( + var_type_inference_test + SRCS var_type_inference_test.cc + DEPS op_registry proto_desc) +cc_library( + selected_rows_utils + SRCS selected_rows_utils.cc + DEPS selected_rows) +cc_test( + selected_rows_utils_test + SRCS selected_rows_utils_test.cc + DEPS selected_rows_utils) + +cc_test( + op_kernel_type_test + SRCS op_kernel_type_test.cc + DEPS place device_context framework_proto op_kernel_type) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) -cc_test(tuple_test SRCS tuple_test.cc ) +cc_test(tuple_test SRCS tuple_test.cc) cc_test(inlined_vector_test SRCS inlined_vector_test.cc) -cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) -cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) - -cc_library(op_compatible_info SRCS op_compatible_info.cc DEPS string_helper proto_desc) -cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatible_info proto_desc string_helper glog) - -cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer) -cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer) -cc_library(generator SRCS generator.cc DEPS enforce place) - -cc_library(infershape_utils SRCS infershape_utils.cc DEPS lod_tensor selected_rows_utils attribute place var_type_traits phi phi_api_utils op_info shape_inference) -cc_test(infershape_utils_test SRCS infershape_utils_test.cc DEPS infershape_utils infermeta_utils meta_tensor) +cc_library( + dlpack_tensor + SRCS dlpack_tensor.cc + DEPS tensor dlpack) +cc_test( + dlpack_tensor_test + SRCS dlpack_tensor_test.cc + DEPS dlpack_tensor glog) + +cc_library( + op_compatible_info + SRCS op_compatible_info.cc + DEPS string_helper proto_desc) +cc_test( + op_compatible_info_test + SRCS op_compatible_info_test.cc + DEPS op_compatible_info proto_desc string_helper glog) + +cc_library( + save_load_util + SRCS save_load_util.cc + DEPS tensor scope layer) +cc_test( + save_load_util_test + SRCS save_load_util_test.cc + DEPS save_load_util tensor scope layer) +cc_library( + generator + SRCS generator.cc + DEPS enforce place) + +cc_library( + infershape_utils + SRCS infershape_utils.cc + DEPS lod_tensor + selected_rows_utils + attribute + place + var_type_traits + phi + phi_api_utils + op_info + shape_inference) +cc_test( + infershape_utils_test + SRCS infershape_utils_test.cc + DEPS infershape_utils infermeta_utils meta_tensor) # Get the current working branch execute_process( COMMAND git rev-parse --abbrev-ref HEAD - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - OUTPUT_VARIABLE PADDLE_BRANCH - OUTPUT_STRIP_TRAILING_WHITESPACE - ) + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_BRANCH + OUTPUT_STRIP_TRAILING_WHITESPACE) # Get the latest abbreviated commit hash of the working branch execute_process( COMMAND git log -1 --format=%h - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - OUTPUT_VARIABLE PADDLE_COMMIT - OUTPUT_STRIP_TRAILING_WHITESPACE - ) + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE) message(STATUS "commit: ${PADDLE_COMMIT}") message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) -cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api) +cc_library( + custom_operator + SRCS custom_operator.cc + DEPS tensor + attribute + framework_proto + op_registry + operator + dynamic_loader + string_helper + phi_tensor + op_meta_info + phi_api) #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) -set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator) +set(FLUID_FRAMEWORK_MODULES + proto_desc + memory + lod_tensor + executor + data_feed_proto + layer + dynamic_loader + custom_operator) cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) @@ -456,11 +1145,23 @@ if(WITH_TESTING AND TEST selected_rows_utils_test) endif() cc_test(scope_guard_test SRCS scope_guard_test.cc) -cc_test(phi_utils_test SRCS phi_utils_test.cc DEPS phi_utils) +cc_test( + phi_utils_test + SRCS phi_utils_test.cc + DEPS phi_utils) if(WITH_GPU OR WITH_ROCM) - cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info) + cc_library( + fluid_convert_utils + SRCS convert_utils.cc + DEPS data_type place gpu_info) else() - cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place) + cc_library( + fluid_convert_utils + SRCS convert_utils.cc + DEPS data_type place) endif() -cc_test(convert_utils_test SRCS convert_utils_test.cc DEPS fluid_convert_utils) +cc_test( + convert_utils_test + SRCS convert_utils_test.cc + DEPS fluid_convert_utils) diff --git a/paddle/fluid/framework/archive.h b/paddle/fluid/framework/archive.h index d058938386343..6a8f4ff47f35d 100644 --- a/paddle/fluid/framework/archive.h +++ b/paddle/fluid/framework/archive.h @@ -20,6 +20,7 @@ #endif #include + #include #include #include @@ -31,6 +32,7 @@ #include #include #include + #include "paddle/fluid/framework/expect.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index ae3d8379bdbf7..d6cc5dc639fe6 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/async_executor.h" + +#include "gflags/gflags.h" #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/message.h" #include "google/protobuf/text_format.h" - -#include "gflags/gflags.h" #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/executor_thread_worker.h" #include "paddle/fluid/framework/feed_fetch_method.h" diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index b0c6c8a01648f..01daf3c11187b 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include // NOLINT @@ -24,6 +25,7 @@ limitations under the License. */ #include // NOLINT #include #include + #include "paddle/fluid/framework/data_feed.pb.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/executor.h" diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h index 2164a21f3f892..b2c5bfde3aa56 100644 --- a/paddle/fluid/framework/attribute.h +++ b/paddle/fluid/framework/attribute.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/framework/attribute_test.cc b/paddle/fluid/framework/attribute_test.cc index 27a6afb49f5e8..8a47e41d38359 100644 --- a/paddle/fluid/framework/attribute_test.cc +++ b/paddle/fluid/framework/attribute_test.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/attribute.h" + #include #include -#include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/program_desc.h" - #include "gtest/gtest.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/utils/any.h" TEST(Attribute, GetAttrValueToAny) { diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index 80fee94f1c85d..1eb3585fa3339 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -20,6 +20,7 @@ #endif #include + #include #include // NOLINT #include @@ -28,6 +29,7 @@ #include // NOLINT #include #include + #include "paddle/fluid/framework/expect.h" namespace paddle { diff --git a/paddle/fluid/framework/convert_utils_test.cc b/paddle/fluid/framework/convert_utils_test.cc index 140806dfd7c5e..e3f5a4a8dcda1 100644 --- a/paddle/fluid/framework/convert_utils_test.cc +++ b/paddle/fluid/framework/convert_utils_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" + #include "gtest/gtest.h" namespace phi { diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc index d8c27ad280d18..d4f36be5e87e7 100644 --- a/paddle/fluid/framework/copy_same_tensor_test.cc +++ b/paddle/fluid/framework/copy_same_tensor_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include #include "gflags/gflags.h" diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 65c41e19ac423..0130fd4b57ffa 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -867,43 +867,43 @@ void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, bool is_double_grad = (i == 2); // GradOpDescMaker - info.grad_op_maker_ = [grad_op_name, grad_op_inputs, grad_op_outputs, - is_double_grad]( - const OpDesc& fwd_op, - const std::unordered_set& no_grad_set, - std::unordered_map* grad_to_var, - const std::vector& grad_block) { - CustomGradOpMaker maker( - fwd_op, no_grad_set, grad_to_var, grad_block, grad_op_name, - grad_op_inputs, grad_op_outputs, is_double_grad); - return maker(); - }; + info.grad_op_maker_ = + [grad_op_name, grad_op_inputs, grad_op_outputs, is_double_grad]( + const OpDesc& fwd_op, + const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var, + const std::vector& grad_block) { + CustomGradOpMaker maker( + fwd_op, no_grad_set, grad_to_var, grad_block, grad_op_name, + grad_op_inputs, grad_op_outputs, is_double_grad); + return maker(); + }; // GradOpBaseMaker - info.dygraph_grad_op_maker_ = [grad_op_name, grad_op_inputs, - grad_op_outputs, is_double_grad]( - const std::string& type, - const imperative::NameVarBaseMap& var_base_map_in, - const imperative::NameVarBaseMap& var_base_map_out, - const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs, - const std::map& inplace_map) { - CustomGradOpMaker maker( - type, var_base_map_in, var_base_map_out, attrs, inplace_map, - grad_op_name, grad_op_inputs, grad_op_outputs, is_double_grad); - maker.SetDygraphDefaultAttrsMap(default_attrs); - return maker(); - }; + info.dygraph_grad_op_maker_ = + [grad_op_name, grad_op_inputs, grad_op_outputs, is_double_grad]( + const std::string& type, + const imperative::NameVarBaseMap& var_base_map_in, + const imperative::NameVarBaseMap& var_base_map_out, + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, + const std::map& inplace_map) { + CustomGradOpMaker maker( + type, var_base_map_in, var_base_map_out, attrs, inplace_map, + grad_op_name, grad_op_inputs, grad_op_outputs, is_double_grad); + maker.SetDygraphDefaultAttrsMap(default_attrs); + return maker(); + }; /* Grad op register */ OpInfo grad_info; // Grad Op - grad_info.creator_ = []( - const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, const AttributeMap& attrs) { - return new CustomOperator(type, inputs, outputs, attrs); - }; + grad_info.creator_ = + [](const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) { + return new CustomOperator(type, inputs, outputs, attrs); + }; // Grad InferShape if (grad_infer_shape_fn == nullptr) { diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index 4757eb60f4361..d51707970ffe4 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -13,18 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "gtest/gtest.h" - #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/framework/phi_utils.h" - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index b63f317aae893..1808caddabccd 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -18,6 +18,7 @@ limitations under the License. */ #endif #include "paddle/fluid/framework/data_feed.h" + #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #ifdef _LINUX #include @@ -220,6 +221,7 @@ bool DataFeed::PickOneFile(std::string* filename) { file_idx_, platform::errors::PreconditionNotMet( "You should call SetFileListIndex before PickOneFile")); std::unique_lock lock(*mutex_for_pick_file_); + VLOG(4) << "filelist_ size: " << filelist_.size(); if (*file_idx_ == filelist_.size()) { VLOG(3) << "DataFeed::PickOneFile no more file to pick"; return false; @@ -230,8 +232,9 @@ bool DataFeed::PickOneFile(std::string* filename) { } void DataFeed::CheckInit() { - PADDLE_ENFORCE_EQ(finish_init_, true, platform::errors::PreconditionNotMet( - "DataFeed initialization failed.")); + PADDLE_ENFORCE_EQ( + finish_init_, true, + platform::errors::PreconditionNotMet("DataFeed initialization failed.")); } void DataFeed::CheckSetFileList() { @@ -284,6 +287,7 @@ void PrivateQueueDataFeed::SetQueueSize(int queue_size) { template bool PrivateQueueDataFeed::Start() { + VLOG(4) << "entering PrivateQueueDataFeed::Start()"; CheckSetFileList(); read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this); read_thread_.detach(); @@ -295,6 +299,7 @@ bool PrivateQueueDataFeed::Start() { template void PrivateQueueDataFeed::ReadThread() { #ifdef _LINUX + VLOG(4) << "entering PrivateQueueDataFeed::ReadThread()"; std::string filename; while (PickOneFile(&filename)) { int err_no = 0; @@ -356,6 +361,7 @@ InMemoryDataFeed::InMemoryDataFeed() { template bool InMemoryDataFeed::Start() { #ifdef _LINUX + VLOG(4) << "entering InMemoryDataFeed::Start()"; this->CheckSetFileList(); if (output_channel_->Size() == 0 && input_channel_->Size() != 0) { std::vector data; @@ -664,6 +670,7 @@ void MultiSlotDataFeed::Init( void MultiSlotDataFeed::ReadThread() { #ifdef _LINUX + VLOG(4) << "entering MultiSlotDataFeed::ReadThread()"; std::string filename; while (PickOneFile(&filename)) { int err_no = 0; @@ -831,7 +838,6 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe( } else { int use_slots_num = use_slots_.size(); instance->resize(use_slots_num); - const char* str = reader.get(); std::string line = std::string(str); @@ -971,10 +977,13 @@ void MultiSlotDataFeed::PutToFeedVec( if (feed_vec_[i] == nullptr) { continue; } + VLOG(4) << "MultiSlotDataFeed::PutToFeedVec i: " << i; const auto& type = ins_vec[i].GetType(); const auto& offset = ins_vec[i].GetOffset(); int total_instance = static_cast(offset.back()); - + VLOG(4) << "total_instance: " << total_instance; + // platform::CPUPlace() + VLOG(4) << "this->place_: " << this->place_; if (type[0] == 'f') { // float const auto& feasign = ins_vec[i].GetFloatData(); float* tensor_ptr = @@ -1612,9 +1621,10 @@ template class PrivateInstantDataFeed>; bool MultiSlotFileInstantDataFeed::Preprocess(const std::string& filename) { fd_ = open(filename.c_str(), O_RDONLY); PADDLE_ENFORCE_NE( - fd_, -1, platform::errors::Unavailable( - "Fail to open file: %s in MultiSlotFileInstantDataFeed.", - filename.c_str())); + fd_, -1, + platform::errors::Unavailable( + "Fail to open file: %s in MultiSlotFileInstantDataFeed.", + filename.c_str())); struct stat sb; fstat(fd_, &sb); @@ -2175,7 +2185,7 @@ void SlotRecordInMemoryDataFeed::LoadIntoMemoryByLine(void) { SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); // get slotrecord object function auto record_func = [this, &offset, &record_vec, &old_offset]( - std::vector& vec, int num) { + std::vector& vec, int num) { vec.resize(num); if (offset + num > OBJPOOL_BLOCK_SIZE) { input_channel_->WriteMove(offset, &record_vec[0]); @@ -2573,6 +2583,7 @@ void SlotRecordInMemoryDataFeed::ExpandSlotRecord(SlotRecord* rec) { } bool SlotRecordInMemoryDataFeed::Start() { + VLOG(4) << "entering SlotRecordInMemoryDataFeed::Start"; #ifdef _LINUX this->CheckSetFileList(); if (input_channel_->Size() != 0) { @@ -2667,8 +2678,8 @@ void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) { size_t* off_start_ptr = &offsets[j * offset_cols_size]; int total_instance = static_cast(off_start_ptr[offset_cols_size - 1]); - CHECK(total_instance >= 0) << "slot idx:" << j - << ", total instance:" << total_instance; + CHECK(total_instance >= 0) + << "slot idx:" << j << ", total instance:" << total_instance; auto& info = used_slots_info_[j]; // fill slot value with default value 0 diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index e46e4aeb0124c..e058b19469000 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_feed_factory.h" #include + #include #include diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc index 2cc441bbd34cb..8375ed80e8319 100644 --- a/paddle/fluid/framework/data_feed_test.cc +++ b/paddle/fluid/framework/data_feed_test.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/data_feed.h" + #include + #include // NOLINT #include #include @@ -23,6 +25,7 @@ #include // NOLINT #include #include + #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/text_format.h" #include "gtest/gtest.h" diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 0c762ab2e77e5..f89d0f969abb2 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/framework/data_set.h" + #include "google/protobuf/text_format.h" #if (defined PADDLE_WITH_DISTRIBUTE) && (defined PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/index_dataset/index_sampler.h" diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 3d096eaebe344..5d961841a250b 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -15,6 +15,7 @@ #pragma once #include + #include #include #include // NOLINT @@ -26,6 +27,7 @@ #include #ifdef PADDLE_WITH_GLOO #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 15cf30c1cf352..01802c11d5219 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -44,8 +44,8 @@ TEST(DataType, float16) { TEST(DataType, bfloat16) { using paddle::framework::Tensor; - using paddle::platform::CPUPlace; using paddle::platform::bfloat16; + using paddle::platform::CPUPlace; namespace f = paddle::framework; f::proto::VarType::Type dtype = f::proto::VarType::BF16; diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu index 4fab3a7845489..3420298297b3f 100644 --- a/paddle/fluid/framework/data_type_transform_test.cu +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "gtest/gtest.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/tensor_util.h" -#include "gtest/gtest.h" - TEST(DataTypeTransform, GPUTransform) { auto cpu_place = paddle::platform::CPUPlace(); auto gpu_place = paddle::platform::CUDAPlace(0); diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 948eaab40b4f6..e193274ff2137 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -1,96 +1,284 @@ -cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto node) -cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor) +cc_library( + var_handle + SRCS var_handle.cc + DEPS place framework_proto node) +cc_library( + op_handle_base + SRCS op_handle_base.cc + DEPS var_handle device_context lod_tensor) -cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) -cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) -cc_library(fetch_async_op_handle SRCS fetch_async_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) +cc_library( + scale_loss_grad_op_handle + SRCS scale_loss_grad_op_handle.cc + DEPS op_handle_base scope lod_tensor ddim memory) +cc_library( + fetch_op_handle + SRCS fetch_op_handle.cc + DEPS op_handle_base scope lod_tensor ddim memory) +cc_library( + fetch_async_op_handle + SRCS fetch_async_op_handle.cc + DEPS op_handle_base scope lod_tensor ddim memory) -cc_library(share_tensor_buffer_functor SRCS share_tensor_buffer_functor.cc DEPS framework_proto scope place operator op_registry) -cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) -cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope computation_op_handle share_tensor_buffer_functor) -cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry) -cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry) -cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) +cc_library( + share_tensor_buffer_functor + SRCS share_tensor_buffer_functor.cc + DEPS framework_proto scope place operator op_registry) +cc_library( + computation_op_handle + SRCS computation_op_handle.cc + DEPS framework_proto scope place operator op_registry) +cc_library( + share_tensor_buffer_op_handle + SRCS share_tensor_buffer_op_handle.cc + DEPS op_handle_base scope computation_op_handle share_tensor_buffer_functor) +cc_library( + rpc_op_handle + SRCS rpc_op_handle.cc + DEPS framework_proto scope place operator op_registry) +cc_library( + fetch_barrier_op_handle + SRCS fetch_barrier_op_handle.cc + DEPS framework_proto scope place operator op_registry) +cc_library( + multi_devices_helper + SRCS multi_devices_helper.cc + DEPS graph graph_helper) -cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows_utils) +cc_library( + variable_visitor + SRCS variable_visitor.cc + DEPS lod_tensor selected_rows_utils) if(WITH_PSCORE) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(threaded_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(async_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set(DISTRIBUTE_COMPILE_FLAGS + "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor" + ) + set_source_files_properties( + reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + threaded_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + async_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) endif() - if(WITH_GPU) - nv_library(nan_inf_utils SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu DEPS framework_proto scope place) - nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory - dynload_cuda variable_visitor) - nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory - dynload_cuda variable_visitor place device_memory_aligment) - nv_library(grad_merge_all_reduce_op_handle SRCS grad_merge_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor - ddim memory dynload_cuda variable_visitor place device_memory_aligment all_reduce_op_handle fused_all_reduce_op_handle) - - if(WITH_DGC) - nv_library(sparse_all_reduce_op_handle SRCS sparse_all_reduce_op_handle.cc DEPS op_handle_base scope - lod_tensor ddim memory dynload_cuda variable_visitor dgc all_reduce_op_handle) - endif() - - if(WITH_DISTRIBUTE) - nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim dynload_cuda selected_rows_functor) - else() - nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim dynload_cuda selected_rows_functor) - endif() - nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) - nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) + nv_library( + nan_inf_utils + SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu + DEPS framework_proto scope place) + nv_library( + all_reduce_op_handle + SRCS all_reduce_op_handle.cc + DEPS op_handle_base + scope + lod_tensor + ddim + memory + dynload_cuda + variable_visitor) + nv_library( + fused_all_reduce_op_handle + SRCS fused_all_reduce_op_handle.cc + DEPS op_handle_base + scope + lod_tensor + ddim + memory + dynload_cuda + variable_visitor + place + device_memory_aligment) + nv_library( + grad_merge_all_reduce_op_handle + SRCS grad_merge_all_reduce_op_handle.cc + DEPS op_handle_base + scope + lod_tensor + ddim + memory + dynload_cuda + variable_visitor + place + device_memory_aligment + all_reduce_op_handle + fused_all_reduce_op_handle) + + if(WITH_DGC) + nv_library( + sparse_all_reduce_op_handle + SRCS sparse_all_reduce_op_handle.cc + DEPS op_handle_base + scope + lod_tensor + ddim + memory + dynload_cuda + variable_visitor + dgc + all_reduce_op_handle) + endif() + + if(WITH_DISTRIBUTE) + nv_library( + reduce_op_handle + SRCS reduce_op_handle.cc + DEPS op_handle_base variable_visitor scope ddim dynload_cuda + selected_rows_functor) + else() + nv_library( + reduce_op_handle + SRCS reduce_op_handle.cc + DEPS op_handle_base variable_visitor scope ddim dynload_cuda + selected_rows_functor) + endif() + nv_library( + broadcast_op_handle + SRCS broadcast_op_handle.cc + DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) + nv_library( + fused_broadcast_op_handle + SRCS fused_broadcast_op_handle.cc + DEPS broadcast_op_handle) elseif(WITH_ROCM) - hip_library(nan_inf_utils SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu DEPS framework_proto scope place) - hip_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory - dynload_cuda variable_visitor) - hip_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory - dynload_cuda variable_visitor place device_memory_aligment) - hip_library(grad_merge_all_reduce_op_handle SRCS grad_merge_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor - ddim memory dynload_cuda variable_visitor place device_memory_aligment all_reduce_op_handle fused_all_reduce_op_handle) - - if(WITH_DISTRIBUTE) - hip_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim dynload_cuda selected_rows_functor) - else() - hip_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim dynload_cuda selected_rows_functor) - endif() - hip_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) - hip_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) + hip_library( + nan_inf_utils + SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu + DEPS framework_proto scope place) + hip_library( + all_reduce_op_handle + SRCS all_reduce_op_handle.cc + DEPS op_handle_base + scope + lod_tensor + ddim + memory + dynload_cuda + variable_visitor) + hip_library( + fused_all_reduce_op_handle + SRCS fused_all_reduce_op_handle.cc + DEPS op_handle_base + scope + lod_tensor + ddim + memory + dynload_cuda + variable_visitor + place + device_memory_aligment) + hip_library( + grad_merge_all_reduce_op_handle + SRCS grad_merge_all_reduce_op_handle.cc + DEPS op_handle_base + scope + lod_tensor + ddim + memory + dynload_cuda + variable_visitor + place + device_memory_aligment + all_reduce_op_handle + fused_all_reduce_op_handle) + + if(WITH_DISTRIBUTE) + hip_library( + reduce_op_handle + SRCS reduce_op_handle.cc + DEPS op_handle_base variable_visitor scope ddim dynload_cuda + selected_rows_functor) + else() + hip_library( + reduce_op_handle + SRCS reduce_op_handle.cc + DEPS op_handle_base variable_visitor scope ddim dynload_cuda + selected_rows_functor) + endif() + hip_library( + broadcast_op_handle + SRCS broadcast_op_handle.cc + DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) + hip_library( + fused_broadcast_op_handle + SRCS fused_broadcast_op_handle.cc + DEPS broadcast_op_handle) else() - if (WITH_ASCEND_CL) - cc_library(nan_inf_utils SRCS nan_inf_utils_detail.cc DEPS npu_op_runner framework_proto scope place) - else() - cc_library(nan_inf_utils SRCS nan_inf_utils_detail.cc DEPS framework_proto scope place) - endif() - cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory - variable_visitor) - cc_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory - variable_visitor place device_memory_aligment) - cc_library(grad_merge_all_reduce_op_handle SRCS grad_merge_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor - ddim memory variable_visitor place device_memory_aligment all_reduce_op_handle fused_all_reduce_op_handle) - if(WITH_DISTRIBUTE) - cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim selected_rows_functor) - else() - cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim selected_rows_functor) - endif() - cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) - cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) + if(WITH_ASCEND_CL) + cc_library( + nan_inf_utils + SRCS nan_inf_utils_detail.cc + DEPS npu_op_runner framework_proto scope place) + else() + cc_library( + nan_inf_utils + SRCS nan_inf_utils_detail.cc + DEPS framework_proto scope place) + endif() + cc_library( + all_reduce_op_handle + SRCS all_reduce_op_handle.cc + DEPS op_handle_base scope lod_tensor ddim memory variable_visitor) + cc_library( + fused_all_reduce_op_handle + SRCS fused_all_reduce_op_handle.cc + DEPS op_handle_base + scope + lod_tensor + ddim + memory + variable_visitor + place + device_memory_aligment) + cc_library( + grad_merge_all_reduce_op_handle + SRCS grad_merge_all_reduce_op_handle.cc + DEPS op_handle_base + scope + lod_tensor + ddim + memory + variable_visitor + place + device_memory_aligment + all_reduce_op_handle + fused_all_reduce_op_handle) + if(WITH_DISTRIBUTE) + cc_library( + reduce_op_handle + SRCS reduce_op_handle.cc + DEPS op_handle_base variable_visitor scope ddim selected_rows_functor) + else() + cc_library( + reduce_op_handle + SRCS reduce_op_handle.cc + DEPS op_handle_base variable_visitor scope ddim selected_rows_functor) + endif() + cc_library( + broadcast_op_handle + SRCS broadcast_op_handle.cc + DEPS op_handle_base scope ddim memory variable_visitor) + cc_library( + fused_broadcast_op_handle + SRCS fused_broadcast_op_handle.cc + DEPS broadcast_op_handle) endif() -cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) +cc_library( + gather_op_handle + SRCS gather_op_handle.cc + DEPS op_handle_base scope ddim memory variable_visitor) -cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows_utils reference_count_pass_helper) +cc_library( + eager_deletion_op_handle + SRCS eager_deletion_op_handle.cc + DEPS lod_tensor selected_rows_utils reference_count_pass_helper) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto +set(SSA_GRAPH_EXECUTOR_DEPS + graph + framework_proto multi_devices_helper reference_count_pass eager_deletion_pass @@ -98,60 +286,122 @@ set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto buffer_shared_cross_op_memory_reuse_pass inplace_addto_op_pass set_reader_device_info_utils) -cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) +cc_library( + ssa_graph_executor + SRCS ssa_graph_executor.cc + DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) -cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope - simple_threadpool device_context) +cc_library( + threaded_ssa_graph_executor + SRCS threaded_ssa_graph_executor.cc + DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool + device_context) -cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) +cc_library( + parallel_ssa_graph_executor + SRCS parallel_ssa_graph_executor.cc + DEPS threaded_ssa_graph_executor) set(ASYNC_SSA_GRAPH_EXECUTOR_DEPS threaded_ssa_graph_executor) -cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS ${ASYNC_SSA_GRAPH_EXECUTOR_DEPS}) +cc_library( + async_ssa_graph_executor + SRCS async_ssa_graph_executor.cc + DEPS ${ASYNC_SSA_GRAPH_EXECUTOR_DEPS}) -cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory - device_context broadcast_op_handle) -cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory - device_context gather_op_handle) +cc_test( + broadcast_op_test + SRCS broadcast_op_handle_test.cc + DEPS var_handle + op_handle_base + scope + ddim + memory + device_context + broadcast_op_handle) +cc_test( + gather_op_test + SRCS gather_op_handle_test.cc + DEPS var_handle + op_handle_base + scope + ddim + memory + device_context + gather_op_handle) -cc_library(scope_buffered_monitor SRCS scope_buffered_monitor.cc DEPS scope profiler selected_rows_utils) -cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor scope_buffered_monitor) +cc_library( + scope_buffered_monitor + SRCS scope_buffered_monitor.cc + DEPS scope profiler selected_rows_utils) +cc_library( + scope_buffered_ssa_graph_executor + SRCS scope_buffered_ssa_graph_executor.cc + DEPS ssa_graph_executor scope_buffered_monitor) #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory # device_context reduce_op_handle ) -cc_library(bind_threaded_ssa_graph_executor SRCS bind_threaded_ssa_graph_executor.cc - DEPS fetch_op_handle gflags ssa_graph_executor scope simple_threadpool device_context) -cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc - DEPS fetch_async_op_handle ssa_graph_executor scope simple_threadpool device_context) -cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle) - -cc_test(exception_holder_test SRCS exception_holder_test.cc ) - -set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass - multi_devices_graph_print_pass multi_devices_graph_check_pass - fuse_elewise_add_act_pass fuse_bn_act_pass fuse_bn_add_act_pass - multi_batch_merge_pass +cc_library( + bind_threaded_ssa_graph_executor + SRCS bind_threaded_ssa_graph_executor.cc + DEPS fetch_op_handle gflags ssa_graph_executor scope simple_threadpool + device_context) +cc_library( + fast_threaded_ssa_graph_executor + SRCS fast_threaded_ssa_graph_executor.cc + DEPS fetch_async_op_handle ssa_graph_executor scope simple_threadpool + device_context) +cc_test( + fused_broadcast_op_test + SRCS fused_broadcast_op_handle_test.cc + DEPS fused_broadcast_op_handle) + +cc_test(exception_holder_test SRCS exception_holder_test.cc) + +set(IR_PASS_DEPS + graph_viz_pass + multi_devices_graph_pass + multi_devices_graph_print_pass + multi_devices_graph_check_pass + fuse_elewise_add_act_pass + fuse_bn_act_pass + fuse_bn_add_act_pass + multi_batch_merge_pass fuse_relu_depthwise_conv_pass lock_free_optimize_pass sequential_execution_pass all_reduce_deps_pass add_reader_dependency_pass modify_op_lock_and_record_event_pass - coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass - fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass - sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - fix_op_run_order_pass fuse_gemm_epilogue_pass) + coalesce_grad_tensor_pass + fuse_all_reduce_op_pass + backward_optimizer_op_deps_pass + fuse_adam_op_pass + fuse_sgd_op_pass + fuse_momentum_op_pass + sync_batch_norm_pass + runtime_context_cache_pass + graph_to_program_pass + fix_op_run_order_pass + fuse_gemm_epilogue_pass) -if (WITH_CINN) +if(WITH_CINN) set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass) endif() -if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM)) +if(NOT APPLE + AND NOT WIN32 + AND (WITH_GPU OR WITH_ROCM)) set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) endif() -cc_library(build_strategy SRCS build_strategy.cc DEPS pass_builder ${IR_PASS_DEPS}) -cc_test(build_strategy_test SRCS build_strategy_test.cc - DEPS build_strategy op_registry op_proto_maker graph string_helper) +cc_library( + build_strategy + SRCS build_strategy.cc + DEPS pass_builder ${IR_PASS_DEPS}) +cc_test( + build_strategy_test + SRCS build_strategy_test.cc + DEPS build_strategy op_registry op_proto_maker graph string_helper) -if (WITH_MKLDNN) +if(WITH_MKLDNN) target_link_libraries(build_strategy mkldnn_placement_pass) endif() diff --git a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc index 75baf15dc5ec9..ebdf66cdde131 100644 --- a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h" + #include #include #include #include #include #include + #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" diff --git a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h index 5e973f13cc618..c907a4b4afc7c 100644 --- a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h @@ -14,12 +14,14 @@ #pragma once #include + #include // NOLINT #include #include // NOLINT #include #include #include + #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" diff --git a/paddle/fluid/framework/details/bkcl_op_handle.h b/paddle/fluid/framework/details/bkcl_op_handle.h index 1a098f06f08f9..b0c2275b3a52b 100644 --- a/paddle/fluid/framework/details/bkcl_op_handle.h +++ b/paddle/fluid/framework/details/bkcl_op_handle.h @@ -14,8 +14,6 @@ #pragma once -#include "xpu/bkcl.h" - #include #include #include @@ -24,6 +22,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" +#include "xpu/bkcl.h" DECLARE_bool(sync_bkcl_allreduce); diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index fdf74d2f769fc..9ed76c87d846c 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/details/build_strategy.h" #include + #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/ir/graph_printer.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" diff --git a/paddle/fluid/framework/details/build_strategy_test.cc b/paddle/fluid/framework/details/build_strategy_test.cc index 69af77d23fbf4..1914c1d33de01 100644 --- a/paddle/fluid/framework/details/build_strategy_test.cc +++ b/paddle/fluid/framework/details/build_strategy_test.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/details/build_strategy.h" + #include #include #include @@ -23,8 +25,6 @@ #include "gtest/gtest-test-part.h" #include "gtest/gtest.h" #include "gtest/gtest_pred_impl.h" - -#include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type_inference.h" diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc index 5b055d7cb4d12..b440da9f1dfb4 100644 --- a/paddle/fluid/framework/details/cow_ptr_test.cc +++ b/paddle/fluid/framework/details/cow_ptr_test.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/fluid/framework/details/cow_ptr.h" + #include "gtest/gtest.h" namespace paddle { diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 7f51de435ba6c..57440ed9aa2f4 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -14,6 +14,7 @@ #pragma once #include // for size_t + #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index ce471d55b24a1..8b5c3c1798780 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -46,6 +46,12 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( VLOG(10) << "Change thread number to 1 because the toposort order is unique"; strategy_.num_threads_ = 1; + traced_ops_.clear(); + for (auto *op_node : TopologySortOperations(*graph_)) { + if (op_node->IsWrappedBy()) { + traced_ops_.emplace_back(&(op_node->Wrapper())); + } + } } pool_.reset(new ::ThreadPool(strategy.num_threads_)); for (auto &op : ir::FilterByNodeWrapper(*graph_)) { diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index 4477702900a8d..19b0061571596 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -14,10 +14,12 @@ #pragma once #include + #include #include #include #include + #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index f4ca4907d48d0..7f44e68af6b0b 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -325,9 +325,10 @@ void FusedAllReduceOpHandle::GetGradLoDTensor( PADDLE_ENFORCE_EQ( platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)), - true, platform::errors::InvalidArgument( - "The variable '%s' at scope %d is not in the right place.", - var_name, scope_idx)); + true, + platform::errors::InvalidArgument( + "The variable '%s' at scope %d is not in the right place.", + var_name, scope_idx)); grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor)); } } @@ -356,10 +357,11 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel( // Get element number int64_t len = grad_tensor.at(i).second->numel(); PADDLE_ENFORCE_GT( - len, 0, platform::errors::InvalidArgument( - "The size of grad tensors of fused_all_reduce_op_handle " - "must be > 0, but got %d.", - len)); + len, 0, + platform::errors::InvalidArgument( + "The size of grad tensors of fused_all_reduce_op_handle " + "must be > 0, but got %d.", + len)); *numel += platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype; } diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc index 44b9ca90fc540..18de9f443a72f 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h" + #include "paddle/fluid/platform/profiler/event_tracing.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) diff --git a/paddle/fluid/framework/details/graph_test_base.h b/paddle/fluid/framework/details/graph_test_base.h index d139f8488309e..08d9c999a8a5d 100644 --- a/paddle/fluid/framework/details/graph_test_base.h +++ b/paddle/fluid/framework/details/graph_test_base.h @@ -18,6 +18,7 @@ #include #include #include + #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph.h" diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index e6790de92d054..7b93baddb4af6 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/nan_inf_utils_detail.h" + +#include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/scope.h" @@ -261,7 +262,7 @@ void CheckNanInf>( } template <> - void CheckNanInf>> + void CheckNanInf < paddle::platform::complex < double >>> (const paddle::platform::complex* value, const size_t numel, int print_num, const std::string& op_type, const std::string& var_name) { double real_sum = 0.0; @@ -563,8 +564,9 @@ static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op, if (sum >= 1.0) PrintNPUOpValueInfo(op, scope, place); - PADDLE_ENFORCE_LT(sum, 1.0, platform::errors::PreconditionNotMet( - "Operator %s contains Nan/Inf.", op.Type())); + PADDLE_ENFORCE_LT(sum, 1.0, + platform::errors::PreconditionNotMet( + "Operator %s contains Nan/Inf.", op.Type())); } #endif diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 7cf11f7829da9..b8b5537c93cca 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -12,15 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/nan_inf_utils.h" -#include "paddle/fluid/framework/details/nan_inf_utils_detail.h" - #include #include #include #include #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/details/nan_inf_utils.h" +#include "paddle/fluid/framework/details/nan_inf_utils_detail.h" #include "paddle/fluid/framework/scope.h" namespace paddle { diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 427b981e7cda2..213d70337648a 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -213,14 +213,14 @@ struct OpInfoFiller { platform::errors::AlreadyExists( "GradOpDescMaker of %s has been registered", op_type)); - info->grad_op_maker_ = []( - const OpDesc& fwd_op, - const std::unordered_set& no_grad_set, - std::unordered_map* grad_to_var, - const std::vector& grad_block) { - T maker(fwd_op, no_grad_set, grad_to_var, grad_block); - return maker(); - }; + info->grad_op_maker_ = + [](const OpDesc& fwd_op, + const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var, + const std::vector& grad_block) { + T maker(fwd_op, no_grad_set, grad_to_var, grad_block); + return maker(); + }; info->use_default_grad_op_desc_maker_ = std::is_base_of, T>::value || @@ -244,17 +244,17 @@ struct OpInfoFiller { platform::errors::AlreadyExists( "GradOpBaseMaker of %s has been registered", op_type)); - info->dygraph_grad_op_maker_ = []( - const std::string& type, - const imperative::NameVarBaseMap& var_base_map_in, - const imperative::NameVarBaseMap& var_base_map_out, - const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs, - const std::map& inplace_map) { - T maker(type, var_base_map_in, var_base_map_out, attrs, inplace_map); - maker.SetDygraphDefaultAttrsMap(default_attrs); - return maker(); - }; + info->dygraph_grad_op_maker_ = + [](const std::string& type, + const imperative::NameVarBaseMap& var_base_map_in, + const imperative::NameVarBaseMap& var_base_map_out, + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, + const std::map& inplace_map) { + T maker(type, var_base_map_in, var_base_map_out, attrs, inplace_map); + maker.SetDygraphDefaultAttrsMap(default_attrs); + return maker(); + }; } }; diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 936e84a6c82b9..22c27fe86f1ae 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -90,10 +90,9 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( const std::vector &places, ir::Graph *graph) // TODO(Yancey1989): Copying graphs is not safely since it deleted the // attrs. - : ParallelSSAGraphExecutor(strategy, local_scopes, local_exec_scopes, - places, - SeparateMultiDevicesGraph(graph, - places.size())) {} + : ParallelSSAGraphExecutor( + strategy, local_scopes, local_exec_scopes, places, + SeparateMultiDevicesGraph(graph, places.size())) {} ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h index d9d83efcb8e9b..88c8b1cbfb294 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -17,6 +17,7 @@ #include #include #include + #include "ThreadPool.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 2ae3880ab3c2c..799005e4b09bb 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -245,14 +245,15 @@ void ReduceOpHandle::RunImpl() { int type = platform::ToBKCLDataType( framework::TransToProtoVarType(lod_tensor.dtype())); size_t numel = static_cast(lod_tensor.numel()); - all_reduce_calls.emplace_back([buffer, recvbuffer, type, numel, root_id, - &bkcl_ctx] { - PADDLE_ENFORCE_EQ(bkcl_reduce(bkcl_ctx.comm(), buffer, recvbuffer, - numel, static_cast(type), - BKCL_ADD, root_id, nullptr), - BKCL_SUCCESS, platform::errors::Unavailable( - "bkcl_all_reduce failed")); - }); + all_reduce_calls.emplace_back( + [buffer, recvbuffer, type, numel, root_id, &bkcl_ctx] { + PADDLE_ENFORCE_EQ( + bkcl_reduce(bkcl_ctx.comm(), buffer, recvbuffer, numel, + static_cast(type), BKCL_ADD, + root_id, nullptr), + BKCL_SUCCESS, + platform::errors::Unavailable("bkcl_all_reduce failed")); + }); } WaitInputVarGenerated(); diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index 39bcf1d0f385f..35373e1a7090b 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/rpc_op_handle.h" + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/platform/profiler/event_tracing.h" diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc b/paddle/fluid/framework/details/scope_buffered_monitor.cc index 57faf0e75ba99..bd1a4378f0729 100644 --- a/paddle/fluid/framework/details/scope_buffered_monitor.cc +++ b/paddle/fluid/framework/details/scope_buffered_monitor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/scope_buffered_monitor.h" + #include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index ea5a3c07957bf..091224f1e59bc 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -14,6 +14,7 @@ #pragma once #include + #include #include #include @@ -21,6 +22,7 @@ #include #include #include + #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/scope_buffered_monitor.h" diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index 7e63c5ffb9a44..28a5c31f6440f 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -41,8 +41,9 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle( is_encoded_(is_encoded), nranks_(nranks) { // TODO(gongwb) :polish them! - PADDLE_ENFORCE_EQ(is_encoded, true, platform::errors::InvalidArgument( - "The argument is_encoded is false.")); + PADDLE_ENFORCE_EQ( + is_encoded, true, + platform::errors::InvalidArgument("The argument is_encoded is false.")); VLOG(1) << "Use dgc allreduce mode" << ", nranks:" << nranks_; @@ -193,11 +194,12 @@ void SparseAllReduceOpHandle::RunImplEncoded() { sparse_reduce_calls.emplace_back([=] { platform::CUDADeviceGuard guard(dev_id); - PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce( - gather_buff, k, out_tensor_buf, - static_cast(out_numel), nranks_, stream), - true, platform::errors::Unavailable( - "Calling sparseReduce() failed.")); + PADDLE_ENFORCE_EQ( + paddle::communication::dgc::sparseReduce( + gather_buff, k, out_tensor_buf, static_cast(out_numel), + nranks_, stream), + true, + platform::errors::Unavailable("Calling sparseReduce() failed.")); }); } diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc index 880261436831d..56cd12f500168 100644 --- a/paddle/fluid/framework/device_worker.cc +++ b/paddle/fluid/framework/device_worker.cc @@ -190,9 +190,10 @@ void DeviceWorker::DumpField(const Scope& scope, int dump_mode, tensor = &cpu_tensor; } if (!CheckValidOutput(tensor, batch_size)) { - VLOG(0) << "Note: field[" << field << "] cannot pass check, so it was " - "skipped. Maybe the dimension is " - "wrong "; + VLOG(0) << "Note: field[" << field + << "] cannot pass check, so it was " + "skipped. Maybe the dimension is " + "wrong "; continue; } for (size_t i = 0; i < batch_size; ++i) { diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index e6635a2f941cd..c973afd156085 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker_factory.h" #include + #include #include diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto old mode 100644 new mode 100755 index fff78dd872c99..b3a01ae169e4e --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -120,6 +120,7 @@ message BuildStrategy { optional bool fix_op_run_order = 13 [ default = false ]; optional bool allow_cuda_graph_capture = 14 [ default = false ]; optional int32 reduce_strategy = 15 [ default = 0 ]; + optional bool fuse_gemm_epilogue = 16 [ default = false ]; } message ExecutionStrategy { @@ -314,6 +315,7 @@ message DistributedStrategy { optional bool adam_d2sum = 36 [ default = false ]; optional bool auto_search = 37 [ default = false ]; optional bool heter_ccl_mode = 38 [ default = false ]; + optional bool is_fl_ps_mode = 39 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 20d08ef18aeb3..7e1f740bcc2cf 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/dlpack_tensor.h" + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 829908bd98228..6c19cf3450dbd 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/dlpack_tensor.h" + #include #include @@ -39,7 +40,7 @@ constexpr uint8_t GetDLDataTypeCode() { : (std::is_integral::value ? static_cast(kDLInt) : static_cast(-1))); } -} // NOLINT +} // namespace template void TestMain(const platform::Place &place, uint16_t lanes) { diff --git a/paddle/fluid/framework/downpour_lite_worker.cc b/paddle/fluid/framework/downpour_lite_worker.cc index 7344c93ef0679..8ceffe58dcf42 100644 --- a/paddle/fluid/framework/downpour_lite_worker.cc +++ b/paddle/fluid/framework/downpour_lite_worker.cc @@ -202,15 +202,15 @@ void DownpourLiteWorker::CopyDenseVars() { Variable* src_var = thread_scope_->FindVar(src_var_name); CHECK(src_var != nullptr) << src_var_name << " not found"; // NOLINT LoDTensor* src_tensor = src_var->GetMutable(); - CHECK(src_tensor != nullptr) << src_var_name - << " tensor is null"; // NOLINT + CHECK(src_tensor != nullptr) + << src_var_name << " tensor is null"; // NOLINT float* src_data = src_tensor->data(); Variable* dest_var = thread_scope_->FindVar(dest_var_name); CHECK(dest_var != nullptr) << dest_var_name << " not found"; // NOLINT LoDTensor* dest_tensor = dest_var->GetMutable(); - CHECK(dest_tensor != nullptr) << dest_var_name - << " tensor is null"; // NOLINT + CHECK(dest_tensor != nullptr) + << dest_var_name << " tensor is null"; // NOLINT float* dest_data = dest_tensor->data(); CHECK(src_tensor->numel() == dest_tensor->numel()) diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 06c3d18af84ae..c14b48ef8a72f 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -155,8 +155,8 @@ void DownpourWorker::CollectLabelInfo(size_t table_idx) { continue; } LoDTensor* tensor = fea_var->GetMutable(); - CHECK(tensor != nullptr) << "tensor of var " - << sparse_key_names_[table_id][i] << " is null"; + CHECK(tensor != nullptr) + << "tensor of var " << sparse_key_names_[table_id][i] << " is null"; // skip slots which do not have embedding Variable* emb_var = @@ -309,9 +309,9 @@ void DownpourWorker::AdjustInsWeight() { float* ins_weights = ins_weight_tensor->data(); size_t len = ins_weight_tensor->numel(); // len = batch size // here we assume nid_show slot only has one feasign in each instance - CHECK(len == nid_show_.size()) << "ins_weight size should be equal to " - << "nid_show size, " << len << " vs " - << nid_show_.size(); + CHECK(len == nid_show_.size()) + << "ins_weight size should be equal to " + << "nid_show size, " << len << " vs " << nid_show_.size(); float nid_adjw_threshold = adjust_ins_weight_config_.nid_adjw_threshold(); float nid_adjw_ratio = adjust_ins_weight_config_.nid_adjw_ratio(); int64_t nid_adjw_num = 0; @@ -326,9 +326,8 @@ void DownpourWorker::AdjustInsWeight() { } float ins_weight = 1.0; if (nid_show >= 0 && nid_show < nid_adjw_threshold) { - ins_weight = log(M_E + - (nid_adjw_threshold - nid_show) / nid_adjw_threshold * - nid_adjw_ratio); + ins_weight = log(M_E + (nid_adjw_threshold - nid_show) / + nid_adjw_threshold * nid_adjw_ratio); // count nid adjw insnum and weight ++nid_adjw_num; nid_adjw_weight += ins_weight; @@ -423,15 +422,15 @@ void DownpourWorker::CopyDenseVars() { Variable* src_var = thread_scope_->FindVar(src_var_name); CHECK(src_var != nullptr) << src_var_name << " not found"; // NOLINT LoDTensor* src_tensor = src_var->GetMutable(); - CHECK(src_tensor != nullptr) << src_var_name - << " tensor is null"; // NOLINT + CHECK(src_tensor != nullptr) + << src_var_name << " tensor is null"; // NOLINT float* src_data = src_tensor->data(); Variable* dest_var = thread_scope_->FindVar(dest_var_name); CHECK(dest_var != nullptr) << dest_var_name << " not found"; // NOLINT LoDTensor* dest_tensor = dest_var->GetMutable(); - CHECK(dest_tensor != nullptr) << dest_var_name - << " tensor is null"; // NOLINT + CHECK(dest_tensor != nullptr) + << dest_var_name << " tensor is null"; // NOLINT float* dest_data = dest_tensor->data(); CHECK(src_tensor->numel() == dest_tensor->numel()) diff --git a/paddle/fluid/framework/eigen_test.cc b/paddle/fluid/framework/eigen_test.cc index 43d5f9ea0e8db..4e214bd36f33a 100644 --- a/paddle/fluid/framework/eigen_test.cc +++ b/paddle/fluid/framework/eigen_test.cc @@ -13,10 +13,11 @@ // limitations under the License. #include "paddle/fluid/framework/eigen.h" -#include "paddle/phi/core/ddim.h" #include +#include "paddle/phi/core/ddim.h" + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 06ce9712f5c52..830bbacb6398c 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/executor.h" + #include + #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/trainer_desc.pb.h" #include "paddle/fluid/framework/trainer_factory.h" @@ -585,8 +587,9 @@ void Executor::RunPreparedContext( "Program in ExecutorPrepareContext should has feed_ops.")); PADDLE_ENFORCE_EQ( has_fetch_operators(global_block, *fetch_targets, fetch_holder_name), - true, platform::errors::PreconditionNotMet( - "Program in the prepared context should has fetch_ops.")); + true, + platform::errors::PreconditionNotMet( + "Program in the prepared context should has fetch_ops.")); // map the data of feed_targets to feed_holder for (auto* op : global_block.AllOps()) { diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index 0ab4bd5a12b06..468b3bc680af3 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/executor_cache.h" + #include "paddle/fluid/framework/op_info.h" namespace paddle { @@ -137,6 +138,31 @@ ExecutorInfoCache &ExecutorInfoCache::Instance() { return g_exe_cache_info_map; } +static PEAndGraphPair CreateExecutorInfo( + const ProgramDesc &program_desc, const platform::Place &place, + int64_t start_op_index, int64_t end_op_index, framework::Scope *scope, + const details::BuildStrategy &build_strategy) { + auto execution_strategy = details::GetExecutionStrategy(place); + auto graph = std::make_shared( + program_desc, start_op_index, end_op_index); + auto parallel_executor = std::make_shared( + place, scope, execution_strategy, build_strategy, graph.get()); + parallel_executor->PrepareVariables(scope); + return std::make_pair(parallel_executor, graph); +} + +PEAndGraphPair CreateFixOrderExecutorInfo(const ProgramDesc &program_desc, + const platform::Place &place, + int64_t start_op_index, + int64_t end_op_index, + framework::Scope *scope) { + details::BuildStrategy build_strategy; + build_strategy.fix_op_run_order_ = true; + auto pe_and_graph = CreateExecutorInfo(program_desc, place, start_op_index, + end_op_index, scope, build_strategy); + return pe_and_graph; +} + CacheInfo GetExecutorInfoFromCache(const ProgramDesc &program_desc, const platform::Place &place, int64_t start_op_index, int64_t end_op_index, @@ -153,21 +179,17 @@ CacheInfo GetExecutorInfoFromCache(const ProgramDesc &program_desc, } VLOG(1) << "create exe_info for " << program_id << " is_grad: " << is_grad; - auto execution_strategy = details::GetExecutionStrategy(place); auto &build_strategy = cached_exe_info.GetBuildStrategy(program_id); // 2. Construct Graph and ParallelExecutor. - auto graph = std::make_shared( - program_desc, start_op_index, end_op_index); - auto parallel_executor = std::make_shared( - place, scope, execution_strategy, build_strategy, graph.get()); - parallel_executor->PrepareVariables(scope); + auto pe_and_graph = CreateExecutorInfo(program_desc, place, start_op_index, + end_op_index, scope, build_strategy); // 3. Insert value into cached map. auto &cached_value = cached_exe_info.GetMutable(program_id, is_grad); - cached_value.executor_ = parallel_executor; - cached_value.graph_ = std::move(graph); - return std::make_pair(parallel_executor, /*is_new_created=*/true); + cached_value.executor_ = pe_and_graph.first; + cached_value.graph_ = pe_and_graph.second; + return std::make_pair(pe_and_graph.first, /*is_new_created=*/true); } else { VLOG(1) << "get exe_info from cache by: " << program_id << " is_grad: " << is_grad; diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h index 8207b56fc04f1..25c0bfab90c4a 100644 --- a/paddle/fluid/framework/executor_cache.h +++ b/paddle/fluid/framework/executor_cache.h @@ -127,11 +127,20 @@ class ExecutorInfoCache { using CacheInfo = std::pair, bool /*is_new_created*/>; +using PEAndGraphPair = + std::pair, std::shared_ptr>; + CacheInfo GetExecutorInfoFromCache(const ProgramDesc& program_desc, const platform::Place& place, int64_t start_op_index, int64_t end_op_index, bool is_grad, int64_t program_id, framework::Scope* scope); +PEAndGraphPair CreateFixOrderExecutorInfo(const ProgramDesc& program_desc, + const platform::Place& place, + int64_t start_op_index, + int64_t end_op_index, + framework::Scope* scope); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 06019372a7323..c6ccc2adc659f 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -13,13 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/executor_thread_worker.h" + #include #include + +#include "gflags/gflags.h" #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/message.h" #include "google/protobuf/text_format.h" - -#include "gflags/gflags.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" @@ -616,8 +617,8 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { int len = tensor->numel(); CHECK(slot_dim * len == g_tensor->numel()) << "len:" << len << " g_numel:" << g_tensor->numel(); - CHECK(len == tensor->numel()) << "len:" << len - << "t_numel:" << tensor->numel(); + CHECK(len == tensor->numel()) + << "len:" << len << "t_numel:" << tensor->numel(); int64_t* ids = tensor->data(); for (auto id_idx = 0u; id_idx < len; ++id_idx) { if (ids[id_idx] == 0) { @@ -626,15 +627,15 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { } memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim); push_g[fea_idx][0] = 1.0f; - CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx - << " size:" << fea_info.size(); + CHECK(fea_idx < fea_info.size()) + << "fea_idx:" << fea_idx << " size:" << fea_info.size(); push_g[fea_idx][1] = static_cast(fea_info[fea_idx].label); g += slot_dim; fea_idx++; } } - CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx - << " features size:" << features.size(); + CHECK(fea_idx == features.size()) + << "fea_idx:" << fea_idx << " features size:" << features.size(); CHECK_GT(features.size(), 0); std::vector push_g_vec; @@ -701,5 +702,5 @@ void AsyncExecutorThreadWorker::check_pull_push_memory( } #endif -} // einit_modelnd namespace framework +} // namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 524922b0322e5..f4fa54d2c3a7b 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -21,6 +21,7 @@ limitations under the License. */ #include #include // NOLINT #include + #include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 096134e852833..ec3fdc49fdf1f 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" +#include #include -#include #include "glog/logging.h" namespace phi { diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 2e9104f40cc60..3b22a4b0d5d7a 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -1,71 +1,125 @@ if(WITH_PSLIB) - if(WITH_PSLIB_BRPC) - set(BRPC_DEPS pslib_brpc) - else() - if(NOT WITH_HETERPS) - set(BRPC_DEPS brpc) - endif() - endif(WITH_PSLIB_BRPC) - cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto proto_desc op_registry variable_helper scope ${BRPC_DEPS} pslib) + if(WITH_PSLIB_BRPC) + set(BRPC_DEPS pslib_brpc) + else() + if(NOT WITH_HETERPS) + set(BRPC_DEPS brpc) + endif() + endif(WITH_PSLIB_BRPC) + cc_library( + fleet_wrapper + SRCS fleet_wrapper.cc + DEPS framework_proto + proto_desc + op_registry + variable_helper + scope + ${BRPC_DEPS} + pslib) else() - cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) + cc_library( + fleet_wrapper + SRCS fleet_wrapper.cc + DEPS framework_proto variable_helper scope) endif(WITH_PSLIB) if(WITH_HETERPS) - if(WITH_NCCL AND WITH_GPU) - nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) - add_subdirectory(heter_ps) - elseif(WITH_XPU_KP) - xpu_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.kps ps_gpu_wrapper.cc - DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) - add_subdirectory(heter_ps) - elseif(WITH_RCCL) - hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) - add_subdirectory(heter_ps) - endif() + if(WITH_NCCL AND WITH_GPU) + nv_library( + ps_gpu_wrapper + SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc + DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) + add_subdirectory(heter_ps) + elseif(WITH_XPU_KP) + xpu_library( + ps_gpu_wrapper + SRCS ps_gpu_wrapper.kps ps_gpu_wrapper.cc + DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) + add_subdirectory(heter_ps) + elseif(WITH_RCCL) + hip_library( + ps_gpu_wrapper + SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc + DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) + add_subdirectory(heter_ps) + endif() else() - cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc DEPS gloo_wrapper) + cc_library( + ps_gpu_wrapper + SRCS ps_gpu_wrapper.cc + DEPS gloo_wrapper) endif(WITH_HETERPS) if(WITH_NCCL OR WITH_RCCL) - cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope) + cc_library( + nccl_wrapper + SRCS nccl_wrapper.cc + DEPS framework_proto variable_helper scope) endif() if(WITH_BOX_PS) - if(WITH_GPU) - nv_library(box_wrapper SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps) - endif() - if(WITH_ROCM) - hip_library(box_wrapper SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps) - endif() + if(WITH_GPU) + nv_library( + box_wrapper + SRCS box_wrapper.cc box_wrapper.cu + DEPS framework_proto lod_tensor box_ps) + endif() + if(WITH_ROCM) + hip_library( + box_wrapper + SRCS box_wrapper.cc box_wrapper.cu + DEPS framework_proto lod_tensor box_ps) + endif() else() - cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor) + cc_library( + box_wrapper + SRCS box_wrapper.cc + DEPS framework_proto lod_tensor) endif(WITH_BOX_PS) - if(WITH_GLOO) - cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope gloo) - cc_library(metrics SRCS metrics.cc DEPS gloo_wrapper) + cc_library( + gloo_wrapper + SRCS gloo_wrapper.cc + DEPS framework_proto variable_helper scope gloo) + cc_library( + metrics + SRCS metrics.cc + DEPS gloo_wrapper) else() - cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope) - cc_library(metrics SRCS metrics.cc DEPS gloo_wrapper) + cc_library( + gloo_wrapper + SRCS gloo_wrapper.cc + DEPS framework_proto variable_helper scope) + cc_library( + metrics + SRCS metrics.cc + DEPS gloo_wrapper) endif(WITH_GLOO) if(WITH_PSLIB) -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") -if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) - set(DISTRIBUTE_COMPILE_FLAGS - "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") -endif() -set_source_files_properties(heter_wrapper.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set(DISTRIBUTE_COMPILE_FLAGS + "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor" + ) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + endif() + set_source_files_properties( + heter_wrapper.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) endif() -cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto -device_context heter_service_proto ${BRPC_DEPS}) +cc_library( + heter_wrapper + SRCS heter_wrapper.cc + DEPS framework_proto device_context heter_service_proto ${BRPC_DEPS}) -cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) +cc_test( + test_fleet_cc + SRCS test_fleet.cc + DEPS fleet_wrapper gloo_wrapper fs shell) if(WITH_ASCEND OR WITH_ASCEND_CL) - cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph) + cc_library( + ascend_wrapper + SRCS ascend_wrapper.cc + DEPS framework_proto lod_tensor ascend_ge ascend_graph) endif() diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index d55862120116d..a4bd208959e43 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -22,6 +22,10 @@ limitations under the License. */ #include #include +#include "ge/ge_api.h" +#include "graph/attr_value.h" +#include "graph/tensor.h" +#include "graph/types.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -29,11 +33,6 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/timer.h" -#include "ge/ge_api.h" -#include "graph/attr_value.h" -#include "graph/tensor.h" -#include "graph/types.h" - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc index 8564a42165961..1bb432a791e2c 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cc +++ b/paddle/fluid/framework/fleet/box_wrapper.cc @@ -14,10 +14,12 @@ #ifdef PADDLE_WITH_BOX_PS #include "paddle/fluid/framework/fleet/box_wrapper.h" + #include #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -186,26 +188,30 @@ void BasicAucCalculator::calculate_bucket_error() { void BoxWrapper::FeedPass(int date, const std::vector& feasgin_to_box) const { int ret = boxps_ptr_->FeedPass(date, feasgin_to_box); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "FeedPass failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("FeedPass failed in BoxPS.")); } void BoxWrapper::BeginFeedPass(int date, boxps::PSAgentBase** agent) const { int ret = boxps_ptr_->BeginFeedPass(date, *agent); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "BeginFeedPass failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("BeginFeedPass failed in BoxPS.")); } void BoxWrapper::EndFeedPass(boxps::PSAgentBase* agent) const { int ret = boxps_ptr_->EndFeedPass(agent); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "EndFeedPass failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("EndFeedPass failed in BoxPS.")); } void BoxWrapper::BeginPass() const { int ret = boxps_ptr_->BeginPass(); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "BeginPass failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("BeginPass failed in BoxPS.")); } void BoxWrapper::SetTestMode(bool is_test) const { diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index aea479ed0b214..17e59ac9104f6 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/fleet/box_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -175,13 +176,13 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, #define EXPAND_EMBED_PULL_CASE(i, ...) \ case i: { \ constexpr size_t ExpandDim = i; \ - PullCopy<<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast*>( \ - total_values_gpu), \ - gpu_len, hidden_size, expand_embed_dim, slot_num, total_length, \ - gpu_keys); \ + PullCopy \ + <<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \ + gpu_values, \ + reinterpret_cast*>( \ + total_values_gpu), \ + gpu_len, hidden_size, expand_embed_dim, slot_num, total_length, \ + gpu_keys); \ } break #endif diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index b043edca138a8..dc01df221e966 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -24,6 +24,7 @@ limitations under the License. */ #include #endif #include + #include #include #include @@ -36,6 +37,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -65,10 +67,12 @@ class BasicAucCalculator { _local_pred = 0; } void add_data(double pred, int label) { - PADDLE_ENFORCE_GE(pred, 0.0, platform::errors::PreconditionNotMet( - "pred should be greater than 0")); - PADDLE_ENFORCE_LE(pred, 1.0, platform::errors::PreconditionNotMet( - "pred should be lower than 1")); + PADDLE_ENFORCE_GE( + pred, 0.0, + platform::errors::PreconditionNotMet("pred should be greater than 0")); + PADDLE_ENFORCE_LE( + pred, 1.0, + platform::errors::PreconditionNotMet("pred should be lower than 1")); PADDLE_ENFORCE_EQ( label * label, label, platform::errors::PreconditionNotMet( @@ -172,13 +176,15 @@ class AfsManager { pwd.c_str(), conf_path.c_str()); VLOG(0) << "AFSAPI Init: user: " << user << ", pwd: " << pwd; int ret = _afshandler->Init(true, (com_logstatus() == 0)); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Called AFSAPI Init Interface Failed.")); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Called AFSAPI Init Interface Failed.")); // Too high level will hurt the performance comlog_set_log_level(4); ret = _afshandler->Connect(); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Called AFSAPI Connect Interface Failed")); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Called AFSAPI Connect Interface Failed")); } virtual ~AfsManager() { if (_afshandler != NULL) { @@ -294,8 +300,9 @@ class AfsManager { int ret = PopenBidirectionalInternal(cmd.c_str(), rfp, wfp, pid, true, true); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Called PopenBidirectionalInternal Failed")); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Called PopenBidirectionalInternal Failed")); std::string filename(path); if (strncmp(filename.c_str(), "afs:", 4) == 0) { filename = filename.substr(4); @@ -451,8 +458,9 @@ class BoxWrapper { std::string ret_str; int ret = boxps_ptr_->SaveBase(batch_model_path, xbox_model_path, ret_str, seconds_from_1970 / 86400); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "SaveBase failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("SaveBase failed in BoxPS.")); return ret_str; } @@ -460,8 +468,9 @@ class BoxWrapper { VLOG(3) << "Begin SaveDelta"; std::string ret_str; int ret = boxps_ptr_->SaveDelta(xbox_model_path, ret_str); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "SaveDelta failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("SaveDelta failed in BoxPS.")); return ret_str; } diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index 6f7009f4d5143..f6f1cbfc2a08d 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -79,8 +79,9 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, int ret = boxps_ptr_->PullSparseGPU( total_keys, reinterpret_cast(total_values_gpu), static_cast(total_length), device_id); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "PullSparseGPU failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("PullSparseGPU failed in BoxPS.")); pull_boxps_timer.Pause(); VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length @@ -144,8 +145,9 @@ void BoxWrapper::PushSparseGradCase( int ret = boxps_ptr_->PushSparseGPU( total_keys, reinterpret_cast(total_grad_values_gpu), static_cast(total_length), place.GetDeviceId()); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "PushSparseGPU failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("PushSparseGPU failed in BoxPS.")); push_boxps_timer.Pause(); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index deb2b90c93353..5c2be1e55f9ef 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #endif #include + #include #include #include diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc index d850d05d87f5c..56d0e1ec47e7e 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.cc +++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/fleet/gloo_wrapper.h" + #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h b/paddle/fluid/framework/fleet/gloo_wrapper.h index 42ae73f9b13f1..1ecaf1318b01b 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.h +++ b/paddle/fluid/framework/fleet/gloo_wrapper.h @@ -214,8 +214,9 @@ class GlooWrapper { static_cast( &gloo::min)); } else { - PADDLE_ENFORCE_EQ(0, 1, paddle::platform::errors::InvalidArgument( - "AllReduce mode not known: " + mode)); + PADDLE_ENFORCE_EQ(0, 1, + paddle::platform::errors::InvalidArgument( + "AllReduce mode not known: " + mode)); } gloo::allreduce(opts); #else diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 823b60c5ef1f2..560607bd160a1 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -17,6 +17,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS #include + #include #include #include diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index d62fc1c084962..7540c6147f4b7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -1,38 +1,96 @@ -IF(WITH_GPU) - SET(HETERPS_DEPS device_context) - if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) - SET(HETERPS_DEPS ${HETERPS_DEPS} cub) - endif() - if(WITH_PSCORE) - get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) - SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS}) - endif() - nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu feature_value.h DEPS ${HETERPS_DEPS}) - nv_library(hashtable_kernel SRCS hashtable_kernel.cu feature_value.h DEPS ${HETERPS_DEPS}) - nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) - nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) - nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) - if(WITH_PSCORE) - nv_library(graph_gpu_ps SRCS graph_gpu_ps_table_inl.cu DEPS heter_comm table hashtable_kernel) - nv_library(graph_sampler SRCS graph_sampler_inl.h DEPS graph_gpu_ps) - nv_library(graph_gpu_wrapper SRCS graph_gpu_wrapper.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS} graph_gpu_ps) - nv_test(test_cpu_query SRCS test_cpu_query.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS} graph_gpu_ps graph_gpu_wrapper) - #ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu) - #target_link_libraries(test_sample_rate heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) - #nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) - #ADD_EXECUTABLE(test_cpu_query test_cpu_query.cu) - #target_link_libraries(test_cpu_query graph_gpu_ps) - endif() -ENDIF() -IF(WITH_XPU_KP) - SET(HETERPS_DEPS device_context) - xpu_library(heter_comm_kernel SRCS heter_comm_kernel.h heter_comm_kernel.kps feature_value.h) - xpu_library(hashtable_kernel SRCS hashtable.h hashtable_kernel.kps) - cc_library(heter_comm SRCS heter_comm.h heter_resource.cc DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) - cc_library(heter_ps SRCS heter_ps.cc DEPS heter_comm) -ENDIF() -IF(WITH_ROCM) - hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) - hip_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) - hip_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) -ENDIF() +if(WITH_GPU) + set(HETERPS_DEPS device_context) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + set(HETERPS_DEPS ${HETERPS_DEPS} cub) + endif() + if(WITH_PSCORE) + get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + set(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS}) + endif() + nv_library( + heter_comm_kernel + SRCS heter_comm_kernel.cu feature_value.h + DEPS ${HETERPS_DEPS}) + nv_library( + hashtable_kernel + SRCS hashtable_kernel.cu feature_value.h + DEPS ${HETERPS_DEPS}) + nv_library( + heter_comm + SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h + mem_pool.h + DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) + nv_test( + test_heter_comm + SRCS feature_value.h + DEPS heter_comm) + nv_library( + heter_ps + SRCS heter_ps.cu + DEPS heter_comm) + if(WITH_PSCORE) + nv_library( + graph_gpu_ps + SRCS graph_gpu_ps_table_inl.cu + DEPS heter_comm table hashtable_kernel) + nv_library( + graph_sampler + SRCS graph_sampler_inl.h + DEPS graph_gpu_ps) + nv_library( + graph_gpu_wrapper + SRCS graph_gpu_wrapper.cu + DEPS heter_comm + table + heter_comm_kernel + hashtable_kernel + heter_ps + ${HETERPS_DEPS} + graph_gpu_ps) + nv_test( + test_cpu_query + SRCS test_cpu_query.cu + DEPS heter_comm + table + heter_comm_kernel + hashtable_kernel + heter_ps + ${HETERPS_DEPS} + graph_gpu_ps + graph_gpu_wrapper) + #ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu) + #target_link_libraries(test_sample_rate heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) + #nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) + #ADD_EXECUTABLE(test_cpu_query test_cpu_query.cu) + #target_link_libraries(test_cpu_query graph_gpu_ps) + endif() +endif() +if(WITH_XPU_KP) + set(HETERPS_DEPS device_context) + xpu_library(heter_comm_kernel SRCS heter_comm_kernel.h heter_comm_kernel.kps + feature_value.h) + xpu_library(hashtable_kernel SRCS hashtable.h hashtable_kernel.kps) + cc_library( + heter_comm + SRCS heter_comm.h heter_resource.cc + DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) + cc_library( + heter_ps + SRCS heter_ps.cc + DEPS heter_comm) +endif() +if(WITH_ROCM) + hip_library( + heter_comm + SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h + hashtable.h + DEPS cub device_context) + hip_test( + test_heter_comm + SRCS feature_value.h + DEPS heter_comm) + hip_library( + heter_ps + SRCS heter_ps.cu + DEPS heter_comm) +endif() diff --git a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h index 4ad32d1714f7d..da65cccb435d1 100644 --- a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h @@ -22,6 +22,7 @@ #define CONCURRENT_UNORDERED_MAP_CUH #include + #include #include #include @@ -258,7 +259,7 @@ class cycle_iterator_adapter { return old; } - __host__ __device__ const cycle_iterator_adapter& operator++(int)const { + __host__ __device__ const cycle_iterator_adapter& operator++(int) const { cycle_iterator_adapter old(m_begin, m_end, m_current); if (m_end == (m_current + 1)) m_current = m_begin; diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index 19c355c671a38..2e7588d0ac48c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" @@ -284,6 +285,6 @@ struct NodeQueryResult { }; ~NodeQueryResult() {} }; -} -}; +} // namespace framework +}; // namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index ae57c2ebe932f..5831863f7f5c3 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -14,7 +14,9 @@ #pragma once #include + #include + #include "heter_comm.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" @@ -123,7 +125,7 @@ class GpuPsGraphTable : public HeterComm { std::condition_variable cv_; int cpu_table_status; }; -} -}; +} // namespace framework +}; // namespace paddle //#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h" #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu index 72b9cae41c0fd..ab33d2a9c05bf 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu @@ -15,6 +15,7 @@ #include #include #include + #include #pragma once #ifdef PADDLE_WITH_HETERPS @@ -859,11 +860,10 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( constexpr int TILE_SIZE = BLOCK_WARPS * 16; const dim3 block(WARP_SIZE, BLOCK_WARPS); const dim3 grid((shard_len + TILE_SIZE - 1) / TILE_SIZE); - neighbor_sample_example_v2< - WARP_SIZE, BLOCK_WARPS, - TILE_SIZE><<remote_stream(i, gpu_id)>>>( - graph, id_array, actual_size_array, sample_array, sample_size, - shard_len, default_value); + neighbor_sample_example_v2 + <<remote_stream(i, gpu_id)>>>( + graph, id_array, actual_size_array, sample_array, sample_size, + shard_len, default_value); } for (int i = 0; i < total_gpu; ++i) { @@ -946,12 +946,12 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( constexpr int TILE_SIZE_ = BLOCK_WARPS_ * 16; const dim3 block2(WARP_SIZE_, BLOCK_WARPS_); const dim3 grid2((number_on_cpu + TILE_SIZE_ - 1) / TILE_SIZE_); - copy_buffer_ac_to_final_place<<>>( - gpu_buffers_ptr, gpu_ac_ptr, val, actual_sample_size, - thrust::raw_pointer_cast(t_index.data()) + 1, - thrust::raw_pointer_cast(cumsum_gpu_ac.data()), number_on_cpu, - sample_size); + copy_buffer_ac_to_final_place + <<>>( + gpu_buffers_ptr, gpu_ac_ptr, val, actual_sample_size, + thrust::raw_pointer_cast(t_index.data()) + 1, + thrust::raw_pointer_cast(cumsum_gpu_ac.data()), number_on_cpu, + sample_size); delete[] merge_buffers; delete[] cpu_keys; @@ -1027,13 +1027,13 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, local_begin_pos = [0,3] sample_size = [2,3] */ - std::function range_check = []( - int x, int y, int x1, int y1, int& x2, int& y2) { - if (y <= x1 || x >= y1) return 0; - y2 = min(y, y1); - x2 = max(x1, x); - return y2 - x2; - }; + std::function range_check = + [](int x, int y, int x1, int y1, int& x2, int& y2) { + if (y <= x1 || x >= y1) return 0; + y2 = min(y, y1); + x2 = max(x1, x); + return y2 - x2; + }; auto graph = gpu_graph_list[gpu_id]; if (graph.node_size == 0) { return result; @@ -1106,6 +1106,6 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, return result; */ } -} -}; +} // namespace framework +}; // namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index c976bb67cb21e..43f0101009d08 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -271,5 +271,5 @@ void GraphGpuWrapper::export_partition_files(int idx, std::string file_path) { ->cpu_graph_table->export_partition_files(idx, file_path); } #endif -} -}; +} // namespace framework +}; // namespace paddle diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index a34e752fc7ea7..d3c4dea589030 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" namespace paddle { @@ -73,5 +74,5 @@ class GraphGpuWrapper { void* graph_table; }; #endif -} -}; +} // namespace framework +}; // namespace paddle diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h b/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h index a7c043f1edf37..7cec4fcfb8311 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h @@ -14,6 +14,7 @@ #pragma once #include + #include #include #include @@ -23,6 +24,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" @@ -106,7 +108,7 @@ class AllInGpuGraphSampler : public GraphSampler { // std::shared_ptr random; int gpu_num; }; -} -}; +} // namespace framework +}; // namespace paddle #include "paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h" #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h index ad4b00b11aa39..e68612d57e259 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h @@ -156,6 +156,6 @@ void AllInGpuGraphSampler::init(GpuPsGraphTable *g, this->gpu_num = g->gpu_num; graph_table = g->cpu_graph_table.get(); } -} -}; +} // namespace framework +}; // namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 234aa15ebf74d..112a59c8fec87 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #ifdef PADDLE_WITH_HETERPS #include + #include #include #include @@ -36,6 +37,7 @@ limitations under the License. */ #include "thrust/pair.h" #elif defined(__xpu__) #include + #include "xpu/kernel/cluster_header.h" #include "xpu/kernel/math.h" #include "xpu/kernel/simd.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 57741c2c19b1c..c2e6cdc5c6993 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -14,6 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS #include + #include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" @@ -366,10 +367,10 @@ template class HashTable; template class HashTable; template class HashTable; -template void HashTable::get< - cudaStream_t>(const unsigned long* d_keys, - paddle::framework::FeatureValue* d_vals, size_t len, - cudaStream_t stream); +template void +HashTable::get( + const unsigned long* d_keys, paddle::framework::FeatureValue* d_vals, + size_t len, cudaStream_t stream); template void HashTable::get( @@ -395,10 +396,10 @@ template void HashTable::get( // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t // stream); -template void HashTable::insert< - cudaStream_t>(const unsigned long* d_keys, - const paddle::framework::FeatureValue* d_vals, size_t len, - cudaStream_t stream); +template void +HashTable::insert( + const unsigned long* d_keys, const paddle::framework::FeatureValue* d_vals, + size_t len, cudaStream_t stream); template void HashTable:: insert(const unsigned long* d_keys, size_t len, char* pool, @@ -438,21 +439,22 @@ template void HashTable::update< paddle::framework::FeaturePushValue>, cudaStream_t>(const unsigned long* d_keys, const paddle::framework::FeaturePushValue* d_grads, - size_t len, Optimizer - sgd, - cudaStream_t stream); - -template void -HashTable::update< - Optimizer, - cudaStream_t>(const unsigned long* d_keys, const char* d_grads, size_t len, + size_t len, Optimizer sgd, cudaStream_t stream); +template void HashTable:: + update, + cudaStream_t>(const unsigned long* d_keys, const char* d_grads, + size_t len, + Optimizer + sgd, + cudaStream_t stream); + // template void HashTable::update< // Optimizer #include + #include "cub/cub.cuh" #include "cub/util_allocator.cuh" #if defined(PADDLE_WITH_CUDA) @@ -26,6 +27,7 @@ limitations under the License. */ #elif defined(PADDLE_WITH_XPU_KP) // #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" #include + #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 64b177abb8638..38a4e7b7bb1a9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #ifdef PADDLE_WITH_HETERPS #include + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index 94d7929b2947d..a5ee8e2ff8395 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -294,10 +294,10 @@ template void HeterCommKernel::fill_idx( template void HeterCommKernel::calc_shard_offset( int* idx, int* left, int* right, long long len, int total_devs, const cudaStream_t& stream); -template void HeterCommKernel::calc_shard_index< - unsigned long, int, cudaStream_t>(unsigned long* d_keys, long long len, - int* shard_index, int total_devs, - const cudaStream_t& stream); +template void +HeterCommKernel::calc_shard_index( + unsigned long* d_keys, long long len, int* shard_index, int total_devs, + const cudaStream_t& stream); template void HeterCommKernel::calc_shard_index( long* d_keys, long long len, int* shard_index, int total_devs, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc index 700b43f18fb96..fe8e8c86505ce 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" + #include #ifdef PADDLE_WITH_HETERPS diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 43b84ee5d26fb..cfe4662629415 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" #ifdef PADDLE_WITH_HETERPS diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 8449a4048b72f..83dc232bc6a3b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #if defined(PADDLE_WITH_CUDA) diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 2c312e9d4d60a..fe44c81fe445f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index 5717f44d400a5..087877818f5fb 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -24,6 +24,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU_KP #include // NOLINT + #include "paddle/fluid/platform/device/xpu/xpu_info.h" #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index 4684b4a0bc155..82090ef4817c9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #endif #include + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/test_comm.cu b/paddle/fluid/framework/fleet/heter_ps/test_comm.cu index 3a6ed50ad8e70..72fa0282066d2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_comm.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_comm.cu @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu index 62a0df9430002..621c7f5bab412 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu @@ -13,8 +13,10 @@ // limitations under the License. #include + #include #include + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu index ff3cd9d2d046d..49e9a051ec0c0 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu @@ -13,8 +13,10 @@ // limitations under the License. #include + #include #include + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu index 06c7026eb51ca..28098181b6c2a 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu index affa60d022ece..a1e8f06368b07 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu @@ -13,6 +13,8 @@ // limitations under the License. #include + +#include #include // NOLINT #include #include @@ -20,32 +22,30 @@ #include // NOLINT #include #include -#include "google/protobuf/text_format.h" -#include +#include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/env.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_sampler.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" -#include "paddle/fluid/framework/fleet/heter_ps/graph_sampler.h" -#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" -#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" -#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" -#include "paddle/fluid/platform/cuda_device_guard.h" - using namespace paddle::framework; namespace platform = paddle::platform; namespace operators = paddle::operators; diff --git a/paddle/fluid/framework/fleet/metrics.cc b/paddle/fluid/framework/fleet/metrics.cc index 56bc568460bbc..4225281640588 100644 --- a/paddle/fluid/framework/fleet/metrics.cc +++ b/paddle/fluid/framework/fleet/metrics.cc @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE) @@ -63,10 +64,12 @@ void BasicAucCalculator::add_data(const float* d_pred, const int64_t* d_label, } void BasicAucCalculator::add_unlock_data(double pred, int label) { - PADDLE_ENFORCE_GE(pred, 0.0, platform::errors::PreconditionNotMet( - "pred should be greater than 0")); - PADDLE_ENFORCE_LE(pred, 1.0, platform::errors::PreconditionNotMet( - "pred should be lower than 1")); + PADDLE_ENFORCE_GE( + pred, 0.0, + platform::errors::PreconditionNotMet("pred should be greater than 0")); + PADDLE_ENFORCE_LE( + pred, 1.0, + platform::errors::PreconditionNotMet("pred should be lower than 1")); PADDLE_ENFORCE_EQ( label * label, label, platform::errors::PreconditionNotMet( @@ -272,10 +275,12 @@ void BasicAucCalculator::add_uid_data(const float* d_pred, void BasicAucCalculator::add_uid_unlock_data(double pred, int label, uint64_t uid) { - PADDLE_ENFORCE_GE(pred, 0.0, platform::errors::PreconditionNotMet( - "pred should be greater than 0")); - PADDLE_ENFORCE_LE(pred, 1.0, platform::errors::PreconditionNotMet( - "pred should be lower than 1")); + PADDLE_ENFORCE_GE( + pred, 0.0, + platform::errors::PreconditionNotMet("pred should be greater than 0")); + PADDLE_ENFORCE_LE( + pred, 1.0, + platform::errors::PreconditionNotMet("pred should be lower than 1")); PADDLE_ENFORCE_EQ( label * label, label, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/fleet/metrics.h b/paddle/fluid/framework/fleet/metrics.h index 69b242664bb46..7c3ea1b5512f6 100644 --- a/paddle/fluid/framework/fleet/metrics.h +++ b/paddle/fluid/framework/fleet/metrics.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include @@ -35,6 +36,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 488a9ef8ce78f..fbe76696114d5 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 0efec57e59db6..7ddc5a1f6dd66 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -27,6 +27,7 @@ limitations under the License. */ #include #ifdef PADDLE_WITH_GLOO #include + #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif diff --git a/paddle/fluid/framework/fleet/test_fleet.cc b/paddle/fluid/framework/fleet/test_fleet.cc index 24f3e6bed6494..34aea9de3b1c5 100644 --- a/paddle/fluid/framework/fleet/test_fleet.cc +++ b/paddle/fluid/framework/fleet/test_fleet.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index b621eca35b893..e3b9fe3626ddf 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/generator.h" #include + #include #include diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h index 35efc1bee33d5..f62e8f74d26d5 100644 --- a/paddle/fluid/framework/generator.h +++ b/paddle/fluid/framework/generator.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include #include #include // temp for debug diff --git a/paddle/fluid/framework/gpu_utils.h b/paddle/fluid/framework/gpu_utils.h index 37c9852a1ab1f..9c59333000e91 100644 --- a/paddle/fluid/framework/gpu_utils.h +++ b/paddle/fluid/framework/gpu_utils.h @@ -17,6 +17,7 @@ #define EIGEN_USE_GPU #include + #include "paddle/fluid/platform/enforce.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -104,15 +105,17 @@ ConvertTensorIndex(int index, const Dim3& dims) { template IntType CeilOrFloor(IntType x, IntType deviser) { - PADDLE_ENFORCE_GT(deviser, 0, platform::errors::InvalidArgument( - "deviser should be greater than 0, " - "but received is:%d", - deviser)); + PADDLE_ENFORCE_GT( + deviser, 0, + platform::errors::InvalidArgument("deviser should be greater than 0, " + "but received is:%d", + deviser)); PADDLE_ENFORCE_GT( - x, 0, platform::errors::InvalidArgument("input should be greater than 0, " - "but received is:%d", - x)); + x, 0, + platform::errors::InvalidArgument("input should be greater than 0, " + "but received is:%d", + x)); const IntType round_to_zero = x / deviser; const IntType inte_result = round_to_zero * deviser; diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h index ebbfd446a03de..81f17be867f76 100644 --- a/paddle/fluid/framework/grad_op_desc_maker.h +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_call_stack.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" @@ -157,8 +158,9 @@ class GradOpDescMakerBase { const Attribute& GetAttr(const std::string& name) const { auto& map = fwd_op_.GetAttrMap(); auto it = map.find(name); - PADDLE_ENFORCE_NE(it, map.end(), platform::errors::NotFound( - "Cannot find attribute (%s).", name)); + PADDLE_ENFORCE_NE( + it, map.end(), + platform::errors::NotFound("Cannot find attribute (%s).", name)); return it->second; } diff --git a/paddle/fluid/framework/heter_pipeline_trainer.cc b/paddle/fluid/framework/heter_pipeline_trainer.cc index d0d3c2fea3b56..dc99885811c2b 100644 --- a/paddle/fluid/framework/heter_pipeline_trainer.cc +++ b/paddle/fluid/framework/heter_pipeline_trainer.cc @@ -32,7 +32,9 @@ using TaskQueue = std::pair>>>; void HeterPipelineTrainer::ResetDataset(Dataset* dataset) { +#ifndef PADDLE_WITH_FLPS if (pipeline_stage_ == 0) { +#endif SetDataset(dataset); const std::vector readers = dataset->GetReaders(); @@ -51,40 +53,39 @@ void HeterPipelineTrainer::ResetDataset(Dataset* dataset) { this_worker->SetDataFeed(readers[cnt]); this_worker->SetReaderPlace(place_); } +#ifndef PADDLE_WITH_FLPS } +#endif } void HeterPipelineTrainer::Initialize(const TrainerDesc& trainer_desc, Dataset* dataset) { + trainer_desc_ = trainer_desc; thread_num_ = trainer_desc.thread_num(); ParseDumpConfig(trainer_desc); SetDebug(trainer_desc.debug()); const std::vector readers = dataset->GetReaders(); - VLOG(3) << "readers num: " << readers.size(); // change thread num to readers num thread_num_ = readers.size(); - VLOG(3) << "worker thread num: " << thread_num_; + VLOG(3) << "worker(readers) thread num: " << thread_num_; const auto& heter_section_params = trainer_desc.heter_section_param(); num_pipeline_stages_ = heter_section_params.num_pipeline_stages(); pipeline_stage_ = heter_section_params.pipeline_stage(); num_microbatches_ = heter_section_params.num_microbatches(); VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_; - trainer_desc_ = trainer_desc; trainer_id_ = trainer_desc.trainer_id(); for (int i = 0; i < num_pipeline_stages_; ++i) { auto trainer_num = trainer_desc.trainers(i); trainers_.push_back(trainer_num); } int cpu_trainer_num = trainers_[0]; - // int cur_stage_trainer_num = trainers_[pipeline_stage_]; - // int global_thread_num = cpu_trainer_num * thread_num_; - // int previous_trainers = 0; - // for (int i = 0; i < pipeline_stage_; i++) previous_trainers += - // trainers_[i]; - // int stage_trainer_id = - // trainer_id_ - previous_trainers; // trainer id in current stage - + VLOG(4) << "trainer_id_: " << trainer_id_; + VLOG(4) << "cpu_trainer_num: " << cpu_trainer_num + << " xpu_trainer_num: " << trainers_[1]; +#ifdef PADDLE_WITH_FLPS + thread_num_ = 1; +#endif if (pipeline_stage_ == 0) { // for cpu trainer int cnt = -1; int real_thread_id = trainer_id_; @@ -103,25 +104,33 @@ void HeterPipelineTrainer::Initialize(const TrainerDesc& trainer_desc, this_worker->InitRandomDumpConfig(trainer_desc); this_worker->SetDeviceIndex(real_thread_id); real_thread_id += cpu_trainer_num; - // if (pipeline_stage_ == 0) { this_worker->SetDataFeed(readers[cnt]); - //} this_worker->SetMicrobatchNum(num_microbatches_); this_worker->SetPipelineStageNum(num_pipeline_stages_); this_worker->SetPipelineStage(pipeline_stage_); } - } else { // for heter_trainer - // heter trainer with thread_id == -1 is not for - // real training + } else { + // for heter_trainer + // heter trainer with thread_id == -1 is not for real training, just for run + // listen op workers_[-1] = DeviceWorkerFactory::CreateDeviceWorker( trainer_desc.device_worker_name()); auto this_worker = std::dynamic_pointer_cast( workers_[-1]); +#ifdef PADDLE_WITH_FLPS + this_worker->SetDebug(debug_); + this_worker->SetNeedDumpField(need_dump_field_); + this_worker->SetNeedDumpParam(need_dump_param_); + this_worker->SetDumpFieldVector(dump_fields_); + this_worker->SetDumpParamVector(dump_param_); + this_worker->InitRandomDumpConfig(trainer_desc); + this_worker->SetDataFeed(readers[0]); +#endif + this_worker->SetDeviceIndex(-1); this_worker->SetMicrobatchNum(num_microbatches_); this_worker->SetPipelineStageNum(num_pipeline_stages_); this_worker->SetPipelineStage(pipeline_stage_); - this_worker->SetDeviceIndex(-1); } } @@ -159,14 +168,19 @@ void HeterPipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program, for (auto& worker_pair : workers_) { auto worker_index = worker_pair.first; auto device_worker = worker_pair.second; + VLOG(0) << "workers index in InitTrainerEnv: " << worker_index; auto this_worker = std::dynamic_pointer_cast( device_worker); this_worker->SetPlace(place); this_worker->Initialize(trainer_desc_); +#ifdef PADDLE_WITH_FLPS + this_worker->SetReaderPlace(place); +#else if (pipeline_stage_ == 0) { this_worker->SetReaderPlace(place); } +#endif this_worker->SetRootScope(root_scope_); // generate mini_batch scope for every worker auto* minibatch_scope = &root_scope_->NewScope(); @@ -175,6 +189,7 @@ void HeterPipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program, // after set micro num & mini batch scope this_worker->CreateMicrobatchScopes(); (*micro_scopes_)[worker_index] = this_worker->GetMicrobatchScopes(); + VLOG(4) << "worker_index: " << worker_index; (*task_queue_)[worker_index] = this_worker->GetThreadQueue(); } } @@ -182,6 +197,7 @@ void HeterPipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program, void HeterPipelineTrainer::Run() { VLOG(3) << "Going to run HeterPipelineTrainer::Run()"; if (listen_ptr_ == nullptr) { + VLOG(3) << "listen_ptr_ is null"; for (auto& worker_pair : workers_) { auto& device_worker = worker_pair.second; auto worker_0 = @@ -196,10 +212,14 @@ void HeterPipelineTrainer::Run() { heter_server->WaitServerReady(); heter_server->SetMiniBatchScopes(mini_scopes_); heter_server->SetMicroBatchScopes(micro_scopes_); + VLOG(4) << "heter_server SetTaskQueue"; heter_server->SetTaskQueue(task_queue_); + // main training logic + VLOG(3) << "pipeline_stage_ is " << pipeline_stage_; if (pipeline_stage_ == 0) { // for cpu trainer for (auto& worker_pair : workers_) { + VLOG(4) << "cpu worker index : " << worker_pair.first; auto device_worker = worker_pair.second; if (!debug_) { threads_.push_back( @@ -212,6 +232,7 @@ void HeterPipelineTrainer::Run() { } else { // for heter worker // start thread_worker with thread_id = -1 for (auto& worker_pair : workers_) { + VLOG(4) << "xpu worker index : " << worker_pair.first; auto device_worker = worker_pair.second; if (!debug_) { threads_.push_back( @@ -252,6 +273,10 @@ void HeterPipelineTrainer::Run() { this_worker->SetPipelineStageNum(num_pipeline_stages_); this_worker->SetPipelineStage(pipeline_stage_); this_worker->SetPlace(place_); +#ifdef PADDLE_WITH_FLPS + this_worker->SetDataFeed(workers_[-1]->device_reader_); + this_worker->SetReaderPlace(place_); +#endif this_worker->Initialize(trainer_desc_); this_worker->SetRootScope(root_scope_); @@ -308,5 +333,5 @@ Scope* HeterPipelineTrainer::GetWorkerScope(int thread_id) { } } // end namespace framework -} // end namespace paddle +} // namespace paddle #endif diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc old mode 100644 new mode 100755 index b6759bb2e6fe6..acbfe21ecdae0 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -65,6 +65,52 @@ class TrainerDesc; uint64_t HeterSectionWorker::batch_id_(0); +#ifdef PADDLE_WITH_FLPS +void HeterSectionWorker::Initialize(const TrainerDesc& desc) { + trainer_desc_ = desc; + fetch_config_ = desc.fetch_config(); + dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); + program_.reset(new ProgramDesc( + desc.heter_section_param().section_config().program_desc())); + thread_queue_.reset( + new ::paddle::framework::BlockingQueue>()); + VLOG(4) << "addr of thread_queue_ is: " << thread_queue_.get(); + bool is_first_stage = (pipeline_stage_ == 0); + bool is_last_stage = (pipeline_stage_ + 1 == num_pipeline_stages_); + + if (is_first_stage) { + VLOG(0) << "entering first stage"; + for (auto& op_desc : program_->Block(0).AllOps()) { + forward_ops_.push_back(std::move(OpRegistry::CreateOp(*op_desc))); + } + for (auto& op_desc : program_->Block(1).AllOps()) { + auto op = std::move(OpRegistry::CreateOp(*op_desc)); + auto op_type = op->Type(); + if (listen_op_ == nullptr && op_type == "heter_listen_and_serv") { + listen_op_ = std::move(op); + } else { + backward_ops_.push_back(std::move(op)); + } + } + } else if (is_last_stage) { + VLOG(0) << "HeterSectionWorker::Initialize for the last stage"; + for (auto& op_desc : program_->Block(0).AllOps()) { + auto op = std::move(OpRegistry::CreateOp(*op_desc)); + auto op_type = op->Type(); + if (listen_op_ == nullptr && op_type == "heter_listen_and_serv") { + listen_op_ = std::move(op); + } else { + forward_ops_.push_back(std::move(op)); + } + } + VLOG(0) << "test111"; + for (auto& op_desc : program_->Block(1).AllOps()) { + auto op = std::move(OpRegistry::CreateOp(*op_desc)); + backward_ops_.push_back(std::move(op)); + } + } +} +#else void HeterSectionWorker::Initialize(const TrainerDesc& desc) { trainer_desc_ = desc; fetch_config_ = desc.fetch_config(); @@ -122,6 +168,7 @@ void HeterSectionWorker::Initialize(const TrainerDesc& desc) { } } } +#endif void HeterSectionWorker::RunBackward(int micro_id) { for (size_t i = 0; i < backward_ops_.size(); i++) { @@ -147,8 +194,11 @@ void HeterSectionWorker::RunBackward(int micro_id) { void HeterSectionWorker::MiniBatchBarrier() { // get micro id & deserialize data std::set micro_ids; + VLOG(4) << "entering MiniBatchBarrier"; + VLOG(4) << "micro_ids_.size(): " << micro_ids_.size(); while (micro_ids.size() < micro_ids_.size()) { auto task = (*thread_queue_).Pop(); + VLOG(4) << "got one task from task que in cpu worker"; auto message_name = task.first; auto micro_id = task.second; PADDLE_ENFORCE_EQ(message_name.find("backward") != std::string::npos, true, @@ -164,19 +214,44 @@ void HeterSectionWorker::MiniBatchBarrier() { RunBackward(micro_id); batch_num_++; BatchPostProcess(); + VLOG(0) << "one task in cpu worker overed!"; } micro_ids_.clear(); } -void HeterSectionWorker::RunListen() { listen_op_->Run(*root_scope_, place_); } +void HeterSectionWorker::RunListen() { + VLOG(4) << ">>> run listen_op"; + listen_op_->Run(*root_scope_, place_); + VLOG(4) << "<<< run listen_op over"; +} void HeterSectionWorker::RunForward(int micro_id) { +#ifdef PADDLE_WITH_FLPS + BindingDataFeedMemory(micro_id); + if (debug_) { + timeline_.Start(); + } + int cur_micro_batch = device_reader_->Next(); + if (cur_micro_batch <= 0) { + VLOG(0) << "no more data in device_reader_"; + epoch_finish_ = true; + return; + } + if (debug_) { + timeline_.Pause(); + read_time_ += timeline_.ElapsedSec(); + total_time_ += timeline_.ElapsedSec(); + total_ins_num_ += cur_micro_batch; + } + VLOG(3) << "read a batch in thread " << thread_id_ << " micro " << micro_id; +#else if (pipeline_stage_ == 0) { BindingDataFeedMemory(micro_id); if (debug_) { timeline_.Start(); } - int cur_micro_batch = device_reader_->Next(); + int cur_micro_batch = + device_reader_->Next(); // batch_size is just micro_batch_size if (cur_micro_batch <= 0) { epoch_finish_ = true; return; @@ -189,6 +264,7 @@ void HeterSectionWorker::RunForward(int micro_id) { } VLOG(3) << "read a batch in thread " << thread_id_ << " micro " << micro_id; } +#endif for (size_t i = 0; i < forward_ops_.size(); i++) { auto& op = forward_ops_[i]; VLOG(3) << "Forward: start to run op " << op->Type() << " for micro-batch " @@ -301,7 +377,7 @@ void HeterSectionWorker::Run() { while (!epoch_finish_) { // forward for (int i = 0; i < num_microbatches_; i++) { - VLOG(5) << "Run " << i << " microbatch"; + VLOG(4) << "Run " << i << " microbatch"; RunForward(i); if (epoch_finish_ == true) { break; @@ -312,15 +388,19 @@ void HeterSectionWorker::Run() { if (micro_ids_.size() > 0) { MiniBatchBarrier(); } + VLOG(0) << "one batch run over! micro_ids_size: " << micro_ids_.size(); } } else { // for heter worker + VLOG(4) << "entering heter Run..."; auto heter_server = paddle::distributed::HeterServer::GetInstance(); while (true) { if (heter_server->IsStop()) { + VLOG(0) << "heter_server is stopped!!"; epoch_finish_ = true; break; } auto task = (*thread_queue_).Pop(); + VLOG(4) << "got one task from task que in heter worker"; auto message_name = task.first; auto micro_id = task.second; if (is_last_stage) { @@ -331,6 +411,8 @@ void HeterSectionWorker::Run() { RunBackward(micro_id); batch_num_++; BatchPostProcess(); + VLOG(0) << "one batch run over! micro_id: " << micro_id + << " batch_num: " << batch_num_; } else { if (message_name.find("forward") != std::string::npos) { RunForward(micro_id); @@ -371,6 +453,7 @@ void HeterSectionWorker::BatchPostProcess() { } void HeterSectionWorker::TrainFiles() { + VLOG(4) << "entering HeterSectionWorker::TrainFiles"; if (thread_id_ >= 0) { total_ins_num_ = 0; batch_num_ = 0; @@ -378,9 +461,17 @@ void HeterSectionWorker::TrainFiles() { timeline_.Start(); VLOG(3) << "begin section_worker TrainFiles"; epoch_finish_ = false; +#ifdef PADDLE_WITH_FLPS + if (device_reader_ == nullptr) { + VLOG(4) << "device_reader_ is null!!"; + } + device_reader_->Start(); +#else if (pipeline_stage_ == 0) { device_reader_->Start(); } +#endif + VLOG(4) << "Run in TrainFiles:"; while (!epoch_finish_) { Run(); dev_ctx_->Wait(); @@ -428,9 +519,13 @@ void HeterSectionWorker::TrainFilesWithProfiler() { total_ins_num_ = 0; op_name_.clear(); op_total_time_.clear(); +#ifdef PADDLE_WITH_FLPS + device_reader_->Start(); +#else if (pipeline_stage_ == 0) { device_reader_->Start(); } +#endif while (!epoch_finish_) { Run(); dev_ctx_->Wait(); diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h index 9d0e3c50953bd..6b115d33d2faa 100644 --- a/paddle/fluid/framework/heter_service.h +++ b/paddle/fluid/framework/heter_service.h @@ -22,6 +22,7 @@ limitations under the License. */ #include // NOLINT #include // NOLINT #include + #include "paddle/fluid/framework/heter_service.pb.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/framework/hetercpu_worker.cc b/paddle/fluid/framework/hetercpu_worker.cc index 75cc18887da9a..85e44ec44c6e3 100644 --- a/paddle/fluid/framework/hetercpu_worker.cc +++ b/paddle/fluid/framework/hetercpu_worker.cc @@ -311,8 +311,8 @@ void HeterCpuWorker::CollectLabelInfo(std::shared_ptr task, continue; } LoDTensor* tensor = fea_var->GetMutable(); - CHECK(tensor != nullptr) << "tensor of var " - << sparse_key_names_[table_id][i] << " is null"; + CHECK(tensor != nullptr) + << "tensor of var " << sparse_key_names_[table_id][i] << " is null"; // skip slots which do not have embedding Variable* emb_var = scope->FindVar(sparse_value_names_[table_id][i]); @@ -465,9 +465,9 @@ void HeterCpuWorker::AdjustInsWeight(std::shared_ptr task) { float* ins_weights = ins_weight_tensor->data(); size_t len = ins_weight_tensor->numel(); // len = batch size // here we assume nid_show slot only has one feasign in each instance - CHECK(len == nid_show_.size()) << "ins_weight size should be equal to " - << "nid_show size, " << len << " vs " - << nid_show_.size(); + CHECK(len == nid_show_.size()) + << "ins_weight size should be equal to " + << "nid_show size, " << len << " vs " << nid_show_.size(); float nid_adjw_threshold = adjust_ins_weight_config_.nid_adjw_threshold(); float nid_adjw_ratio = adjust_ins_weight_config_.nid_adjw_ratio(); int64_t nid_adjw_num = 0; @@ -482,9 +482,8 @@ void HeterCpuWorker::AdjustInsWeight(std::shared_ptr task) { } float ins_weight = 1.0; if (nid_show >= 0 && nid_show < nid_adjw_threshold) { - ins_weight = log(M_E + - (nid_adjw_threshold - nid_show) / nid_adjw_threshold * - nid_adjw_ratio); + ins_weight = log(M_E + (nid_adjw_threshold - nid_show) / + nid_adjw_threshold * nid_adjw_ratio); // count nid adjw insnum and weight ++nid_adjw_num; nid_adjw_weight += ins_weight; @@ -579,15 +578,15 @@ void HeterCpuWorker::CopyDenseVars() { Variable* src_var = thread_scope_->FindVar(src_var_name); CHECK(src_var != nullptr) << src_var_name << " not found"; // NOLINT LoDTensor* src_tensor = src_var->GetMutable(); - CHECK(src_tensor != nullptr) << src_var_name - << " tensor is null"; // NOLINT + CHECK(src_tensor != nullptr) + << src_var_name << " tensor is null"; // NOLINT float* src_data = src_tensor->data(); Variable* dest_var = thread_scope_->FindVar(dest_var_name); CHECK(dest_var != nullptr) << dest_var_name << " not found"; // NOLINT LoDTensor* dest_tensor = dest_var->GetMutable(); - CHECK(dest_tensor != nullptr) << dest_var_name - << " tensor is null"; // NOLINT + CHECK(dest_tensor != nullptr) + << dest_var_name << " tensor is null"; // NOLINT float* dest_data = dest_tensor->data(); CHECK(src_tensor->numel() == dest_tensor->numel()) diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index a4af56419a766..81c1a684959fa 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "io/fs.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_feed_factory.h" diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc index 2eeefb19a1aa8..805f992cf3e8b 100644 --- a/paddle/fluid/framework/infershape_utils_test.cc +++ b/paddle/fluid/framework/infershape_utils_test.cc @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" + #include #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h index c46a77f0b3590..93bbec251fee4 100644 --- a/paddle/fluid/framework/inplace_op_inference.h +++ b/paddle/fluid/framework/inplace_op_inference.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/type_defs.h" diff --git a/paddle/fluid/framework/io/CMakeLists.txt b/paddle/fluid/framework/io/CMakeLists.txt index 85b45f1a5bbc1..0033e825172bb 100644 --- a/paddle/fluid/framework/io/CMakeLists.txt +++ b/paddle/fluid/framework/io/CMakeLists.txt @@ -1,7 +1,16 @@ -cc_library(shell SRCS shell.cc DEPS string_helper glog timer enforce) -cc_library(fs SRCS fs.cc DEPS string_helper glog boost enforce shell) +cc_library( + shell + SRCS shell.cc + DEPS string_helper glog timer enforce) +cc_library( + fs + SRCS fs.cc + DEPS string_helper glog boost enforce shell) -cc_test(test_fs SRCS test_fs.cc DEPS fs shell) -if (WITH_CRYPTO) - add_subdirectory(crypto) -endif (WITH_CRYPTO) +cc_test( + test_fs + SRCS test_fs.cc + DEPS fs shell) +if(WITH_CRYPTO) + add_subdirectory(crypto) +endif(WITH_CRYPTO) diff --git a/paddle/fluid/framework/io/crypto/CMakeLists.txt b/paddle/fluid/framework/io/crypto/CMakeLists.txt index ae16353ec92ef..e2de877c39e51 100644 --- a/paddle/fluid/framework/io/crypto/CMakeLists.txt +++ b/paddle/fluid/framework/io/crypto/CMakeLists.txt @@ -1,3 +1,12 @@ -cc_library(paddle_crypto SRCS cipher_utils.cc cipher.cc aes_cipher.cc DEPS cryptopp enforce) -cc_test(aes_cipher_test SRCS aes_cipher_test.cc DEPS paddle_crypto) -cc_test(cipher_utils_test SRCS cipher_utils_test.cc DEPS paddle_crypto) +cc_library( + paddle_crypto + SRCS cipher_utils.cc cipher.cc aes_cipher.cc + DEPS cryptopp enforce) +cc_test( + aes_cipher_test + SRCS aes_cipher_test.cc + DEPS paddle_crypto) +cc_test( + cipher_utils_test + SRCS cipher_utils_test.cc + DEPS paddle_crypto) diff --git a/paddle/fluid/framework/io/crypto/aes_cipher_test.cc b/paddle/fluid/framework/io/crypto/aes_cipher_test.cc index 7f923f597b6de..67c758b012ad5 100644 --- a/paddle/fluid/framework/io/crypto/aes_cipher_test.cc +++ b/paddle/fluid/framework/io/crypto/aes_cipher_test.cc @@ -13,11 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/io/crypto/aes_cipher.h" + #include #include #include + #include #include + #include "paddle/fluid/framework/io/crypto/cipher_utils.h" namespace paddle { diff --git a/paddle/fluid/framework/io/crypto/cipher.cc b/paddle/fluid/framework/io/crypto/cipher.cc index eca175c020cb6..2001e8a416a1a 100644 --- a/paddle/fluid/framework/io/crypto/cipher.cc +++ b/paddle/fluid/framework/io/crypto/cipher.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/io/crypto/cipher.h" + #include "paddle/fluid/framework/io/crypto/aes_cipher.h" #include "paddle/fluid/framework/io/crypto/cipher_utils.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/io/crypto/cipher_utils.cc b/paddle/fluid/framework/io/crypto/cipher_utils.cc index ee9f06b2f3eb1..b622138f7814a 100644 --- a/paddle/fluid/framework/io/crypto/cipher_utils.cc +++ b/paddle/fluid/framework/io/crypto/cipher_utils.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/io/crypto/cipher_utils.h" #include + #include #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/io/crypto/cipher_utils_test.cc b/paddle/fluid/framework/io/crypto/cipher_utils_test.cc index 928e2ced9b195..356c919cbcbe8 100644 --- a/paddle/fluid/framework/io/crypto/cipher_utils_test.cc +++ b/paddle/fluid/framework/io/crypto/cipher_utils_test.cc @@ -12,12 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/io/crypto/cipher_utils.h" + #include + #include #include -#include "paddle/fluid/framework/io/crypto/cipher_utils.h" - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/io/fs.cc b/paddle/fluid/framework/io/fs.cc index b8aca886e7d60..fd602895aaed5 100644 --- a/paddle/fluid/framework/io/fs.cc +++ b/paddle/fluid/framework/io/fs.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/io/fs.h" #include + #include #include "glog/logging.h" diff --git a/paddle/fluid/framework/io/fs.h b/paddle/fluid/framework/io/fs.h index 1ebe80e943aae..088d4d97424a1 100644 --- a/paddle/fluid/framework/io/fs.h +++ b/paddle/fluid/framework/io/fs.h @@ -16,6 +16,7 @@ #include #include + #include #include #include diff --git a/paddle/fluid/framework/io/test_fs.cc b/paddle/fluid/framework/io/test_fs.cc index 49dee603200c9..adb6141fd56a1 100644 --- a/paddle/fluid/framework/io/test_fs.cc +++ b/paddle/fluid/framework/io/test_fs.cc @@ -13,7 +13,9 @@ // limitations under the License. #include + #include + #include "paddle/fluid/framework/io/fs.h" #if defined _WIN32 || defined __APPLE__ diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 8166c43e65db1..374b5490d5da1 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -1,6 +1,11 @@ -set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp) -set(pass_file_final ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) -file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n") +set(pass_file + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp) +set(pass_file_final + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) +file( + WRITE ${pass_file} + "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n" +) file(APPEND ${pass_file} "\#pragma once\n") file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") @@ -9,54 +14,103 @@ copy_if_different(${pass_file} ${pass_file_final}) add_subdirectory(fuse_optimizer_ops_pass) add_subdirectory(memory_optimize_pass) add_subdirectory(multi_devices_graph_pass) -if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM)) - add_subdirectory(fusion_group) +if(NOT APPLE + AND NOT WIN32 + AND (WITH_GPU OR WITH_ROCM)) + add_subdirectory(fusion_group) endif() # Usage: pass_library(target inference) will append to paddle_inference_pass.h unset(INFER_IR_PASSES CACHE) # clear the global variable function(pass_library TARGET DEST) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS DIR) - set(targetPrefix "") - - cmake_parse_arguments(pass_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if(pass_library_DIR) - cc_library(${TARGET} SRCS ${pass_library_DIR}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base op_version_registry ${pass_library_DEPS}) - else() - cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base op_version_registry ${pass_library_DEPS}) - endif() + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS DIR) + set(targetPrefix "") + + cmake_parse_arguments(pass_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + if(pass_library_DIR) + cc_library( + ${TARGET} + SRCS ${pass_library_DIR}/${TARGET}.cc + DEPS graph_pattern_detector pass fuse_pass_base op_version_registry + ${pass_library_DEPS}) + else() + cc_library( + ${TARGET} + SRCS ${TARGET}.cc + DEPS graph_pattern_detector pass fuse_pass_base op_version_registry + ${pass_library_DEPS}) + endif() - # add more DEST here, such as train, dist and collect USE_PASS into a file automatically. - if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference") - if(NOT CMAKE_BUILD_TYPE STREQUAL "Release") - message(STATUS "add pass ${TARGET} ${DEST}") - endif() - file(APPEND ${pass_file} "USE_PASS(${TARGET});\n") - set(INFER_IR_PASSES ${INFER_IR_PASSES} ${TARGET} CACHE INTERNAL "") + # add more DEST here, such as train, dist and collect USE_PASS into a file automatically. + if(${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference") + if(NOT CMAKE_BUILD_TYPE STREQUAL "Release") + message(STATUS "add pass ${TARGET} ${DEST}") endif() + file(APPEND ${pass_file} "USE_PASS(${TARGET});\n") + set(INFER_IR_PASSES + ${INFER_IR_PASSES} ${TARGET} + CACHE INTERNAL "") + endif() endfunction() -cc_library(node SRCS node.cc DEPS proto_desc) -cc_library(graph SRCS graph.cc DEPS node pretty_log) -cc_library(graph_helper SRCS graph_helper.cc DEPS graph) -cc_library(pass SRCS pass.cc DEPS graph node graph_helper) -cc_library(graph_traits SRCS graph_traits.cc DEPS graph) -cc_library(cost_model SRCS cost_model.cc DEPS executor graph profiler proto_desc device_tracer) +cc_library( + node + SRCS node.cc + DEPS proto_desc) +cc_library( + graph + SRCS graph.cc + DEPS node pretty_log) +cc_library( + graph_helper + SRCS graph_helper.cc + DEPS graph) +cc_library( + pass + SRCS pass.cc + DEPS graph node graph_helper) +cc_library( + graph_traits + SRCS graph_traits.cc + DEPS graph) +cc_library( + cost_model + SRCS cost_model.cc + DEPS executor graph profiler proto_desc device_tracer) -SET(GRAPH_PATTERN_DETECTOR_DEPS graph graph_helper graph_traits) -if (WITH_TESTING) - SET(GRAPH_PATTERN_DETECTOR_DEPS ${GRAPH_PATTERN_DETECTOR_DEPS} gtest) +set(GRAPH_PATTERN_DETECTOR_DEPS graph graph_helper graph_traits) +if(WITH_TESTING) + set(GRAPH_PATTERN_DETECTOR_DEPS ${GRAPH_PATTERN_DETECTOR_DEPS} gtest) endif(WITH_TESTING) -cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS ${GRAPH_PATTERN_DETECTOR_DEPS}) +cc_library( + graph_pattern_detector + SRCS graph_pattern_detector.cc + DEPS ${GRAPH_PATTERN_DETECTOR_DEPS}) -cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector op_def_api pass) -cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS graph_pattern_detector executor) -cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS op_compat_sensible_pass) -cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) +cc_library( + op_compat_sensible_pass + SRCS op_compat_sensible_pass.cc + DEPS graph_pattern_detector op_def_api pass) +cc_library( + subgraph_detector + SRCS subgraph_detector.cc + DEPS graph_pattern_detector executor) +cc_library( + fuse_pass_base + SRCS fuse_pass_base.cc + DEPS op_compat_sensible_pass) +cc_library( + placement_pass_base + SRCS placement_pass_base.cc + DEPS pass) -cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper) +cc_library( + coalesce_grad_tensor_pass + SRCS coalesce_grad_tensor_pass.cc + DEPS graph graph_helper) pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) @@ -106,138 +160,348 @@ pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) if(WITH_TENSORRT) - pass_library(trt_map_matmul_to_mul_pass inference) - pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference) - pass_library(preln_skip_layernorm_fuse_pass inference) - pass_library(set_transformer_input_convert_pass inference) - pass_library(remove_padding_recover_padding_pass inference) - pass_library(delete_remove_padding_recover_padding_pass inference) + pass_library(trt_map_matmul_to_mul_pass inference) + pass_library(trt_embedding_eltwise_layernorm_fuse_pass inference) + pass_library(trt_multihead_matmul_fuse_pass inference) + pass_library(trt_skip_layernorm_fuse_pass inference) + pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference) + pass_library(preln_skip_layernorm_fuse_pass inference) + pass_library(set_transformer_input_convert_pass inference) + pass_library(remove_padding_recover_padding_pass inference) + pass_library(delete_remove_padding_recover_padding_pass inference) endif() if(WITH_GPU OR WITH_ROCM) - pass_library(cudnn_placement_pass base DEPS placement_pass_base) - pass_library(embedding_eltwise_layernorm_fuse_pass inference) + pass_library(cudnn_placement_pass base DEPS placement_pass_base) + pass_library(embedding_eltwise_layernorm_fuse_pass inference) endif() if(WITH_MKLDNN) - pass_library(mkldnn_placement_pass base DEPS placement_pass_base DIR mkldnn) - pass_library(mkldnn_inplace_pass inference DEPS mkldnn_placement_pass op_registry elementwise_add_op gelu_op activation_op softmax_op softmax DIR mkldnn) - pass_library(depthwise_conv_mkldnn_pass base DIR mkldnn) - pass_library(conv_affine_channel_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(conv_bias_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(int8_scale_calculation_mkldnn_pass inference DIR mkldnn) - pass_library(fc_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(scale_matmul_fuse_pass inference DIR mkldnn) - pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn) - pass_library(cpu_bfloat16_pass inference DIR mkldnn) - pass_library(fc_mkldnn_pass inference DIR mkldnn) - pass_library(interpolate_mkldnn_pass inference DIR mkldnn) - pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(shuffle_channel_mkldnn_detect_pass inference DIR mkldnn) - pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(cpu_quantize_placement_pass base DIR mkldnn) - pass_library(cpu_quantize_pass inference DIR mkldnn) - pass_library(cpu_quantize_squash_pass inference DIR mkldnn) - pass_library(reshape_transpose_matmul_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(reshape_transpose_matmul_v2_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(matmul_transpose_reshape_fuse_pass inference DIR mkldnn) - pass_library(matmul_v2_transpose_reshape_fuse_pass inference DIR mkldnn) - pass_library(batch_norm_act_fuse_pass inference DIR mkldnn) - pass_library(multi_gru_fuse_pass inference DIR mkldnn) - pass_library(multi_gru_seq_fuse_pass inference DIR mkldnn) - pass_library(quant_dequant_mkldnn_pass inference DIR mkldnn) - pass_library(compute_propagate_scales_mkldnn_pass inference DIR mkldnn) + pass_library(mkldnn_placement_pass base DEPS placement_pass_base DIR mkldnn) + pass_library( + mkldnn_inplace_pass + inference + DEPS + mkldnn_placement_pass + op_registry + elementwise_add_op + gelu_op + activation_op + softmax_op + softmax + DIR + mkldnn) + pass_library(depthwise_conv_mkldnn_pass base DIR mkldnn) + pass_library(conv_affine_channel_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(conv_bias_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(int8_scale_calculation_mkldnn_pass inference DIR mkldnn) + pass_library(fc_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(scale_matmul_fuse_pass inference DIR mkldnn) + pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn) + pass_library(cpu_bfloat16_pass inference DIR mkldnn) + pass_library(fc_mkldnn_pass inference DIR mkldnn) + pass_library(interpolate_mkldnn_pass inference DIR mkldnn) + pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(shuffle_channel_mkldnn_detect_pass inference DIR mkldnn) + pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(cpu_quantize_placement_pass base DIR mkldnn) + pass_library(cpu_quantize_pass inference DIR mkldnn) + pass_library(cpu_quantize_squash_pass inference DIR mkldnn) + pass_library(reshape_transpose_matmul_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(reshape_transpose_matmul_v2_mkldnn_fuse_pass inference DIR + mkldnn) + pass_library(matmul_transpose_reshape_fuse_pass inference DIR mkldnn) + pass_library(matmul_v2_transpose_reshape_fuse_pass inference DIR mkldnn) + pass_library(batch_norm_act_fuse_pass inference DIR mkldnn) + pass_library(multi_gru_fuse_pass inference DIR mkldnn) + pass_library(multi_gru_seq_fuse_pass inference DIR mkldnn) + pass_library(quant_dequant_mkldnn_pass inference DIR mkldnn) + pass_library(compute_propagate_scales_mkldnn_pass inference DIR mkldnn) endif() if(WITH_IPU) - pass_library(forward_graph_extract_pass base DIR ipu) - pass_library(optimizer_extract_pass base DIR ipu) - pass_library(optimizer_state_align_pass base DIR ipu) - pass_library(ipu_graph_builder_pass base DIR ipu) - pass_library(ipu_runtime_replacer_pass base DIR ipu) - pass_library(inference_process_pass base DIR ipu) - pass_library(inference_postprocess_pass base DIR ipu) - pass_library(popart_canonicalization_pass base DIR ipu) - pass_library(ipu_inplace_pass base DIR ipu) - pass_library(infer_shape_pass base DIR ipu) - pass_library(delete_scale_op_pass base DIR ipu) - pass_library(avg_shard_pass base DIR ipu) + pass_library(forward_graph_extract_pass base DIR ipu) + pass_library(optimizer_extract_pass base DIR ipu) + pass_library(optimizer_state_align_pass base DIR ipu) + pass_library(ipu_graph_builder_pass base DIR ipu) + pass_library(ipu_runtime_replacer_pass base DIR ipu) + pass_library(inference_process_pass base DIR ipu) + pass_library(inference_postprocess_pass base DIR ipu) + pass_library(popart_canonicalization_pass base DIR ipu) + pass_library(ipu_inplace_pass base DIR ipu) + pass_library(infer_shape_pass base DIR ipu) + pass_library(delete_scale_op_pass base DIR ipu) + pass_library(avg_shard_pass base DIR ipu) endif() -cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector ) -cc_library(fuse_bn_add_act_pass SRCS fuse_bn_add_act_pass.cc DEPS pass graph_pattern_detector ) -cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) -cc_library(fuse_gemm_epilogue_pass SRCS fuse_gemm_epilogue_pass.cc DEPS pass graph_pattern_detector ) -cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector ) - -set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") - -cc_library(pass_builder SRCS pass_builder.cc DEPS pass) -cc_library(pass_test_util SRCS pass_test_util.cc DEPS graph pass) - -cc_test(node_test SRCS node_test.cc DEPS node) -cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) -cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) -cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) -cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) -cc_test(cost_model_test SRCS cost_model_test.cc DEPS cost_model op_registry) -cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) -cc_test(test_op_compat_sensible_pass SRCS op_compat_sensible_pass_tester.cc DEPS op_compat_sensible_pass) -cc_test(test_fc_fuse_pass_cc SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) -cc_test(test_fc_lstm_fuse_pass_cc SRCS fc_lstm_fuse_pass_tester.cc DEPS fc_lstm_fuse_pass framework_proto) -cc_test(test_fc_gru_fuse_pass_cc SRCS fc_gru_fuse_pass_tester.cc DEPS fc_gru_fuse_pass framework_proto) -cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto) -cc_test(test_seqpool_cvm_concat_fuse_pass SRCS seqpool_cvm_concat_fuse_pass_tester.cc DEPS seqpool_cvm_concat_fuse_pass framework_proto) -cc_test(test_repeated_fc_relu_fuse_pass_cc SRCS repeated_fc_relu_fuse_pass_tester.cc DEPS repeated_fc_relu_fuse_pass framework_proto) -cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) -cc_test(test_simplify_with_basic_ops_pass SRCS simplify_with_basic_ops_pass_tester.cc DEPS simplify_with_basic_ops_pass) -cc_test(test_fc_elementwise_layernorm_fuse_pass_cc SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass) -cc_test(test_skip_layernorm_fuse_pass SRCS skip_layernorm_fuse_pass_tester.cc DEPS skip_layernorm_fuse_pass) -cc_test(test_multihead_matmul_fuse_pass SRCS multihead_matmul_fuse_pass_tester.cc DEPS multihead_matmul_fuse_pass) -cc_test(test_conv_bn_fuse_pass_cc SRCS conv_bn_fuse_pass_tester.cc DEPS conv_bn_fuse_pass) -cc_test(test_adaptive_pool2d_convert_global_pass SRCS adaptive_pool2d_convert_global_pass_tester.cc DEPS adaptive_pool2d_convert_global_pass) -cc_test(test_unsqueeze2_eltwise_fuse_pass_cc SRCS unsqueeze2_eltwise_fuse_pass_tester.cc DEPS unsqueeze2_eltwise_fuse_pass) -cc_test(test_generate_pass_cc SRCS generate_pass_tester.cc DEPS generate_pass pass_desc_proto) +cc_library( + fuse_bn_act_pass + SRCS fuse_bn_act_pass.cc + DEPS pass graph_pattern_detector) +cc_library( + fuse_bn_add_act_pass + SRCS fuse_bn_add_act_pass.cc + DEPS pass graph_pattern_detector) +cc_library( + fuse_elewise_add_act_pass + SRCS fuse_elewise_add_act_pass.cc + DEPS pass graph_pattern_detector) +cc_library( + fuse_gemm_epilogue_pass + SRCS fuse_gemm_epilogue_pass.cc + DEPS pass graph_pattern_detector) +cc_library( + fuse_relu_depthwise_conv_pass + SRCS fuse_relu_depthwise_conv_pass.cc + DEPS pass graph_pattern_detector) + +set(GLOB_PASS_LIB + ${PASS_LIBRARY} + CACHE INTERNAL "Global PASS library") + +cc_library( + pass_builder + SRCS pass_builder.cc + DEPS pass) +cc_library( + pass_test_util + SRCS pass_test_util.cc + DEPS graph pass) + +cc_test( + node_test + SRCS node_test.cc + DEPS node) +cc_test( + pass_test + SRCS pass_test.cc + DEPS graph pass graph_helper) +cc_test( + graph_test + SRCS graph_test.cc + DEPS graph graph_helper op_registry) +cc_test( + graph_helper_test + SRCS graph_helper_test.cc + DEPS graph graph_helper op_registry) +cc_test( + graph_to_program_pass_test + SRCS graph_to_program_pass_test.cc + DEPS graph_to_program_pass) +cc_test( + cost_model_test + SRCS cost_model_test.cc + DEPS cost_model op_registry) +cc_test( + test_graph_pattern_detector + SRCS graph_pattern_detector_tester.cc + DEPS graph_pattern_detector) +cc_test( + test_op_compat_sensible_pass + SRCS op_compat_sensible_pass_tester.cc + DEPS op_compat_sensible_pass) +cc_test( + test_fc_fuse_pass_cc + SRCS fc_fuse_pass_tester.cc + DEPS fc_fuse_pass framework_proto) +cc_test( + test_fc_lstm_fuse_pass_cc + SRCS fc_lstm_fuse_pass_tester.cc + DEPS fc_lstm_fuse_pass framework_proto) +cc_test( + test_fc_gru_fuse_pass_cc + SRCS fc_gru_fuse_pass_tester.cc + DEPS fc_gru_fuse_pass framework_proto) +cc_test( + test_seqpool_concat_fuse_pass + SRCS seqpool_concat_fuse_pass_tester.cc + DEPS seqpool_concat_fuse_pass framework_proto) +cc_test( + test_seqpool_cvm_concat_fuse_pass + SRCS seqpool_cvm_concat_fuse_pass_tester.cc + DEPS seqpool_cvm_concat_fuse_pass framework_proto) +cc_test( + test_repeated_fc_relu_fuse_pass_cc + SRCS repeated_fc_relu_fuse_pass_tester.cc + DEPS repeated_fc_relu_fuse_pass framework_proto) +cc_test( + test_is_test_pass + SRCS is_test_pass_tester.cc + DEPS is_test_pass) +cc_test( + test_simplify_with_basic_ops_pass + SRCS simplify_with_basic_ops_pass_tester.cc + DEPS simplify_with_basic_ops_pass) +cc_test( + test_fc_elementwise_layernorm_fuse_pass_cc + SRCS fc_elementwise_layernorm_fuse_pass_tester.cc + DEPS fc_elementwise_layernorm_fuse_pass) +cc_test( + test_skip_layernorm_fuse_pass + SRCS skip_layernorm_fuse_pass_tester.cc + DEPS skip_layernorm_fuse_pass) +cc_test( + test_multihead_matmul_fuse_pass + SRCS multihead_matmul_fuse_pass_tester.cc + DEPS multihead_matmul_fuse_pass) +cc_test( + test_conv_bn_fuse_pass_cc + SRCS conv_bn_fuse_pass_tester.cc + DEPS conv_bn_fuse_pass) +cc_test( + test_adaptive_pool2d_convert_global_pass + SRCS adaptive_pool2d_convert_global_pass_tester.cc + DEPS adaptive_pool2d_convert_global_pass) +cc_test( + test_unsqueeze2_eltwise_fuse_pass_cc + SRCS unsqueeze2_eltwise_fuse_pass_tester.cc + DEPS unsqueeze2_eltwise_fuse_pass) +cc_test( + test_generate_pass_cc + SRCS generate_pass_tester.cc + DEPS generate_pass pass_desc_proto) if(WITH_GPU OR WITH_ROCM) - cc_test(test_embedding_eltwise_layernorm_fuse_pass SRCS embedding_eltwise_layernorm_fuse_pass_tester.cc DEPS embedding_eltwise_layernorm_fuse_pass) - cc_test(test_cudnn_placement_pass SRCS cudnn_placement_pass_tester.cc DEPS cudnn_placement_pass) + cc_test( + test_embedding_eltwise_layernorm_fuse_pass + SRCS embedding_eltwise_layernorm_fuse_pass_tester.cc + DEPS embedding_eltwise_layernorm_fuse_pass) + cc_test( + test_cudnn_placement_pass + SRCS cudnn_placement_pass_tester.cc + DEPS cudnn_placement_pass) endif() if(NOT WIN32) - cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass) + cc_test( + test_sync_batch_norm_pass + SRCS sync_batch_norm_pass_tester.cc + DEPS sync_batch_norm_pass) endif() -if (WITH_MKLDNN) - cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) - cc_test(test_conv_bias_mkldnn_fuse_pass_cc SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor) - cc_test(test_conv_activation_mkldnn_fuse_pass SRCS mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc DEPS conv_activation_mkldnn_fuse_pass) - cc_test(test_conv_concat_relu_mkldnn_fuse_pass SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc DEPS conv_concat_relu_mkldnn_fuse_pass) - cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util) - cc_test(test_int8_scale_calculation_mkldnn_pass SRCS mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc DEPS int8_scale_calculation_mkldnn_pass pass_test_util) - cc_test(test_fc_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS fc_elementwise_add_mkldnn_fuse_pass pass_test_util) - cc_test(test_fc_act_mkldnn_fuse_pass SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc DEPS fc_act_mkldnn_fuse_pass pass_test_util) - cc_test(test_batch_norm_act_fuse_pass SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass pass_test_util) - set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass conv_op conv_transpose_op math_function im2col vol2col batch_norm_op gelu_op activation_op elementwise_add_op concat_and_split naive_executor device_context eigen_function) -if (WITH_GPU OR WITH_ROCM) +if(WITH_MKLDNN) + cc_test( + test_depthwise_conv_mkldnn_pass + SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc + DEPS depthwise_conv_mkldnn_pass) + cc_test( + test_conv_bias_mkldnn_fuse_pass_cc + SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc + DEPS conv_bias_mkldnn_fuse_pass naive_executor) + cc_test( + test_conv_activation_mkldnn_fuse_pass + SRCS mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc + DEPS conv_activation_mkldnn_fuse_pass) + cc_test( + test_conv_concat_relu_mkldnn_fuse_pass + SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc + DEPS conv_concat_relu_mkldnn_fuse_pass) + cc_test( + test_conv_elementwise_add_mkldnn_fuse_pass + SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc + DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util) + cc_test( + test_int8_scale_calculation_mkldnn_pass + SRCS mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc + DEPS int8_scale_calculation_mkldnn_pass pass_test_util) + cc_test( + test_fc_elementwise_add_mkldnn_fuse_pass + SRCS mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc + DEPS fc_elementwise_add_mkldnn_fuse_pass pass_test_util) + cc_test( + test_fc_act_mkldnn_fuse_pass + SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc + DEPS fc_act_mkldnn_fuse_pass pass_test_util) + cc_test( + test_batch_norm_act_fuse_pass + SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc + DEPS batch_norm_act_fuse_pass pass_test_util) + set(TEST_CONV_BN_PASS_DEPS + conv_bn_fuse_pass + graph_to_program_pass + conv_op + conv_transpose_op + math_function + im2col + vol2col + batch_norm_op + gelu_op + activation_op + elementwise_add_op + concat_and_split + naive_executor + device_context + eigen_function) + if(WITH_GPU OR WITH_ROCM) set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv) + endif() + cc_test( + test_conv_batch_norm_mkldnn_fuse_pass + SRCS mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc + DEPS ${TEST_CONV_BN_PASS_DEPS}) + cc_test( + test_scale_matmul_fuse_pass + SRCS mkldnn/scale_matmul_fuse_pass_tester.cc + DEPS scale_matmul_fuse_pass) + cc_test( + test_mkldnn_placement_pass + SRCS mkldnn/mkldnn_placement_pass_tester.cc + DEPS mkldnn_placement_pass) + cc_test( + test_mkldnn_inplace_pass + SRCS mkldnn/mkldnn_inplace_pass_tester.cc + DEPS mkldnn_inplace_pass) + cc_test( + test_compute_propagate_scales_mkldnn_pass + SRCS mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc + DEPS compute_propagate_scales_mkldnn_pass naive_executor) + cc_test( + test_cpu_quantize_placement_pass + SRCS mkldnn/cpu_quantize_placement_pass_tester.cc + DEPS cpu_quantize_placement_pass) + cc_test( + test_cpu_quantize_pass + SRCS mkldnn/cpu_quantize_pass_tester.cc + DEPS cpu_quantize_pass naive_executor) + cc_test( + test_cpu_quantize_squash_pass + SRCS mkldnn/cpu_quantize_squash_pass_tester.cc + DEPS cpu_quantize_squash_pass naive_executor) + cc_test( + test_reshape_transpose_matmul_mkldnn_fuse_pass + SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc + DEPS reshape_transpose_matmul_mkldnn_fuse_pass + reshape_transpose_matmul_v2_mkldnn_fuse_pass) + cc_test( + test_matmul_transpose_reshape_fuse_pass + SRCS mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc + DEPS matmul_transpose_reshape_fuse_pass + matmul_v2_transpose_reshape_fuse_pass) + cc_test( + test_shuffle_channel_mkldnn_detect_pass + SRCS mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc + DEPS shuffle_channel_mkldnn_detect_pass) + cc_test( + test_cpu_bfloat16_placement_pass + SRCS mkldnn/cpu_bfloat16_placement_pass_tester.cc + DEPS cpu_bfloat16_placement_pass) + cc_test( + test_cpu_bfloat16_pass + SRCS mkldnn/cpu_bfloat16_pass_tester.cc + DEPS cpu_bfloat16_pass) + cc_test( + test_multi_gru_fuse_pass + SRCS mkldnn/multi_gru_fuse_pass_tester.cc + DEPS multi_gru_fuse_pass) + cc_test( + test_multi_gru_seq_fuse_pass + SRCS mkldnn/multi_gru_seq_fuse_pass_tester.cc + DEPS multi_gru_seq_fuse_pass) + set(TEST_FC_RNN_PASS_DEPS fc_gru_fuse_pass fc_lstm_fuse_pass + mkldnn_placement_pass) + cc_test( + test_fc_rnn_mkldnn_fuse_pass + SRCS mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc + DEPS ${TEST_FC_RNN_PASS_DEPS}) endif() - cc_test(test_conv_batch_norm_mkldnn_fuse_pass SRCS mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc DEPS ${TEST_CONV_BN_PASS_DEPS}) - cc_test(test_scale_matmul_fuse_pass SRCS mkldnn/scale_matmul_fuse_pass_tester.cc DEPS scale_matmul_fuse_pass) - cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass) - cc_test(test_mkldnn_inplace_pass SRCS mkldnn/mkldnn_inplace_pass_tester.cc DEPS mkldnn_inplace_pass) - cc_test(test_compute_propagate_scales_mkldnn_pass SRCS mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc DEPS compute_propagate_scales_mkldnn_pass naive_executor) - cc_test(test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass) - cc_test(test_cpu_quantize_pass SRCS mkldnn/cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor) - cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) - cc_test(test_reshape_transpose_matmul_mkldnn_fuse_pass SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc DEPS reshape_transpose_matmul_mkldnn_fuse_pass reshape_transpose_matmul_v2_mkldnn_fuse_pass) - cc_test(test_matmul_transpose_reshape_fuse_pass SRCS mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc DEPS matmul_transpose_reshape_fuse_pass matmul_v2_transpose_reshape_fuse_pass) - cc_test(test_shuffle_channel_mkldnn_detect_pass SRCS mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc DEPS shuffle_channel_mkldnn_detect_pass) - cc_test(test_cpu_bfloat16_placement_pass SRCS mkldnn/cpu_bfloat16_placement_pass_tester.cc DEPS cpu_bfloat16_placement_pass) - cc_test(test_cpu_bfloat16_pass SRCS mkldnn/cpu_bfloat16_pass_tester.cc DEPS cpu_bfloat16_pass) - cc_test(test_multi_gru_fuse_pass SRCS mkldnn/multi_gru_fuse_pass_tester.cc DEPS multi_gru_fuse_pass) - cc_test(test_multi_gru_seq_fuse_pass SRCS mkldnn/multi_gru_seq_fuse_pass_tester.cc DEPS multi_gru_seq_fuse_pass) - set(TEST_FC_RNN_PASS_DEPS fc_gru_fuse_pass fc_lstm_fuse_pass mkldnn_placement_pass) - cc_test(test_fc_rnn_mkldnn_fuse_pass SRCS mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc DEPS ${TEST_FC_RNN_PASS_DEPS}) -endif () diff --git a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc index 8870b68fbc5c5..e0ce58121a15e 100644 --- a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc +++ b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.h" - #include + +#include "paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/add_support_int8_pass.cc b/paddle/fluid/framework/ir/add_support_int8_pass.cc index 3a3f5c3741f4d..d38853bb96489 100644 --- a/paddle/fluid/framework/ir/add_support_int8_pass.cc +++ b/paddle/fluid/framework/ir/add_support_int8_pass.cc @@ -68,9 +68,8 @@ void AddSupportInt8Pass::ApplyImpl(ir::Graph* graph) const { i++) { if (quanted_op_desc->Output(quanted_op_desc->OutputNames()[i]) .size() > 0 && - input_name == - quanted_op_desc->Output( - quanted_op_desc->OutputNames()[i])[0]) { + input_name == quanted_op_desc->Output( + quanted_op_desc->OutputNames()[i])[0]) { outscale_flag = true; quanted_op_desc->SetAttr( quanted_op_desc->OutputNames()[i], diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc index 08e7c6f5b8689..910cb5801db45 100644 --- a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc +++ b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" + #include #include + #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc index ae843aad7d313..710f8ef1b3759 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/conv_bn_fuse_pass.h" - #include + +#include "paddle/fluid/framework/ir/conv_bn_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/cost_model.cc b/paddle/fluid/framework/ir/cost_model.cc index 6086409ffd971..05c7834c9ca9b 100644 --- a/paddle/fluid/framework/ir/cost_model.cc +++ b/paddle/fluid/framework/ir/cost_model.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/cost_model.h" #include + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/framework/ir/cost_model_test.cc b/paddle/fluid/framework/ir/cost_model_test.cc index 57f3904d845c8..f5eaa2f0338cb 100644 --- a/paddle/fluid/framework/ir/cost_model_test.cc +++ b/paddle/fluid/framework/ir/cost_model_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/cost_model.h" + #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc index 2d270f444adbc..2711ddf92d792 100644 --- a/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/cudnn_placement_pass.h" - #include + +#include "paddle/fluid/framework/ir/cudnn_placement_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc index 9473cc069285c..5043beef82401 100644 --- a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc @@ -11,10 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include - #include "paddle/fluid/framework/ir/delete_dropout_op_pass.h" +#include + namespace phi { class DenseTensor; } // namespace phi diff --git a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc index 79a06572d1427..e4b6e43e5c3dc 100644 --- a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/delete_fill_constant_op_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc index 2fc133edb7a96..a02efc0a7cef2 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc @@ -102,9 +102,10 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { break; } } - PADDLE_ENFORCE_GT(arg_name.size(), 0, platform::errors::InvalidArgument( - "can not find the input %s.", - quant_dequant_op_out_name)); + PADDLE_ENFORCE_GT( + arg_name.size(), 0, + platform::errors::InvalidArgument("can not find the input %s.", + quant_dequant_op_out_name)); // any_op2_desc->SetAttr("enable_int8", true); any_op2_desc->SetAttr("bit_length", bit_length); diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc index 727e42629f9fa..8deaf10d200a5 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h" - #include + +#include "paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc index 482e38355c59c..a34e0a5d1deae 100644 --- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h" #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc index 46a9b2eae35db..be22ee9b2fe36 100644 --- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h" - #include +#include "paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 1e25b21483b82..1802616c0df5b 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fc_fuse_pass.h" + #include #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index 39b544e716079..e40759cd3fbe2 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/fc_fuse_pass.h" - #include + +#include "paddle/fluid/framework/ir/fc_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h index df3fbc293b78e..9ad3c28f09a2e 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h @@ -13,9 +13,9 @@ // limitations under the License. #pragma once -#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h" - #include + +#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index b99e607f92b5d..5b4bb98ff537c 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" + #include #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h index a313e49f0b2b6..3e47f0795738e 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h @@ -14,9 +14,9 @@ #pragma once -#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" - #include + +#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h index ab66fb4a46a8a..632bb237fa219 100644 --- a/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h +++ b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index f12273e94dddd..6a2a086704829 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fuse_bn_act_pass.h" + #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc index 005f006ab0478..ff4850838c51f 100644 --- a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fuse_bn_add_act_pass.h" + #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc index 62f65baf33618..3feea822bc1ef 100644 --- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h" + #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc index f48224cbdc24f..1c6b856d987ce 100644 --- a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc +++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc @@ -14,7 +14,9 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h" + #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" @@ -22,6 +24,12 @@ namespace paddle { namespace framework { namespace ir { +static void GetTransposeAttrsFromOp(const OpDesc &op, bool *trans_x, + bool *trans_y) { + *trans_x = BOOST_GET_CONST(bool, op.GetAttr("trans_x")); + *trans_y = BOOST_GET_CONST(bool, op.GetAttr("trans_y")); +} + void FuseGemmEpiloguePass::ApplyImpl(ir::Graph *graph) const { EpiloguePassActivationCache cache; @@ -75,6 +83,9 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph, if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc)) return; + bool trans_x, trans_y; + GetTransposeAttrsFromOp(*matmul_op_desc, &trans_x, &trans_y); + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); std::string activation = "none"; fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); @@ -85,6 +96,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph, fused_gemm_epilogue_op_desc.SetAttr("activation", activation); fused_gemm_epilogue_op_desc.SetAttr("op_role", matmul_op_desc->GetAttr("op_role")); + fused_gemm_epilogue_op_desc.SetAttr("trans_x", trans_x); + fused_gemm_epilogue_op_desc.SetAttr("trans_y", trans_y); auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node); @@ -154,6 +167,9 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd( auto activation = act_op->Op()->Type(); + bool trans_x, trans_y; + GetTransposeAttrsFromOp(*matmul_op_desc, &trans_x, &trans_y); + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()}); @@ -163,6 +179,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd( fused_gemm_epilogue_op_desc.SetAttr("activation", activation); fused_gemm_epilogue_op_desc.SetAttr("op_role", matmul_op_desc->GetAttr("op_role")); + fused_gemm_epilogue_op_desc.SetAttr("trans_x", trans_x); + fused_gemm_epilogue_op_desc.SetAttr("trans_y", trans_y); auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); @@ -274,6 +292,9 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph, matmul_grad_op_desc)) return; + bool trans_x, trans_y; + GetTransposeAttrsFromOp(*matmul_grad_op_desc, &trans_x, &trans_y); + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); std::string activation_grad = "none"; fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); @@ -292,6 +313,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph, activation_grad); fused_gemm_epilogue_grad_op_desc.SetAttr( "op_role", matmul_grad_op_desc->GetAttr("op_role")); + fused_gemm_epilogue_grad_op_desc.SetAttr("trans_x", trans_x); + fused_gemm_epilogue_grad_op_desc.SetAttr("trans_y", trans_y); auto gemm_epilogue_grad_node = g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); @@ -394,6 +417,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd( auto activation_grad = act_grad_op->Op()->Type(); + bool trans_x, trans_y; + GetTransposeAttrsFromOp(*matmul_grad_op_desc, &trans_x, &trans_y); OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); fused_gemm_epilogue_grad_op_desc.SetInput("DOut", @@ -410,6 +435,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd( activation_grad); fused_gemm_epilogue_grad_op_desc.SetAttr( "op_role", matmul_grad_op_desc->GetAttr("op_role")); + fused_gemm_epilogue_grad_op_desc.SetAttr("trans_x", trans_x); + fused_gemm_epilogue_grad_op_desc.SetAttr("trans_y", trans_y); auto gemm_epilogue_grad_node = g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); @@ -456,10 +483,6 @@ bool FuseGemmEpiloguePass::IsGemmFromLinear_( if (tmp_vec.size() > 0) return false; } } - if (BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_x")) || - BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_y"))) - return false; - return true; } diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt index 22876e962a033..7146e9919190d 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt @@ -1,4 +1,16 @@ -cc_library(fuse_optimizer_op_pass SRCS fuse_optimizer_op_pass.cc DEPS graph graph_helper) -cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc DEPS fuse_optimizer_op_pass) -cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc DEPS fuse_optimizer_op_pass) -cc_library(fuse_momentum_op_pass SRCS fuse_momentum_op_pass.cc DEPS fuse_optimizer_op_pass) +cc_library( + fuse_optimizer_op_pass + SRCS fuse_optimizer_op_pass.cc + DEPS graph graph_helper) +cc_library( + fuse_adam_op_pass + SRCS fuse_adam_op_pass.cc + DEPS fuse_optimizer_op_pass) +cc_library( + fuse_sgd_op_pass + SRCS fuse_sgd_op_pass.cc + DEPS fuse_optimizer_op_pass) +cc_library( + fuse_momentum_op_pass + SRCS fuse_momentum_op_pass.cc + DEPS fuse_optimizer_op_pass) diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc index 0094b674c2a17..9629b9209c4d8 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include + #include #include "glog/logging.h" diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc index f87d31cbc409c..e290bdf99ce65 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc @@ -67,8 +67,9 @@ class FuseMomentumOpPass : public FuseOptimizerOpPass { platform::errors::InvalidArgument( "All momentum Op's attr(use_nesterov) must be same, but there " "are two different value: %d, %d.", - use_nesterov, BOOST_GET_CONST(bool, momentum_op->Op()->GetAttr( - "use_nesterov")))); + use_nesterov, + BOOST_GET_CONST(bool, + momentum_op->Op()->GetAttr("use_nesterov")))); PADDLE_ENFORCE_EQ( op_role, BOOST_GET_CONST(int, momentum_op->Op()->GetAttr( diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index 40e1de8a523aa..e3e5221531ee0 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h" + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/operator.h" #include "paddle/phi/core/kernel_factory.h" diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc index 56ca98b566070..bcfa69ac2e7ef 100644 --- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc @@ -13,10 +13,12 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h" + #include #include #include #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt index 78b15398cc792..7df678fbdd7e3 100644 --- a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt +++ b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt @@ -1,14 +1,22 @@ -cc_library(code_generator - SRCS operation.cc code_generator.cc code_generator_helper.cc - DEPS graph subgraph_detector) +cc_library( + code_generator + SRCS operation.cc code_generator.cc code_generator_helper.cc + DEPS graph subgraph_detector) if(WITH_GPU OR WITH_ROCM) - cc_test(test_code_generator SRCS code_generator_tester.cc DEPS code_generator device_code lod_tensor graph_viz_pass) + cc_test( + test_code_generator + SRCS code_generator_tester.cc + DEPS code_generator device_code lod_tensor graph_viz_pass) endif() -cc_library(fusion_group_pass - SRCS fusion_group_pass.cc elementwise_group_detector.cc - DEPS subgraph_detector fuse_pass_base code_generator device_code) -cc_test(test_fusion_group_pass SRCS fusion_group_pass_tester.cc DEPS fusion_group_pass graph_viz_pass) +cc_library( + fusion_group_pass + SRCS fusion_group_pass.cc elementwise_group_detector.cc + DEPS subgraph_detector fuse_pass_base code_generator device_code) +cc_test( + test_fusion_group_pass + SRCS fusion_group_pass_tester.cc + DEPS fusion_group_pass graph_viz_pass) if(WITH_TESTING AND TEST test_code_generator) - set_tests_properties(test_code_generator PROPERTIES TIMEOUT 120) + set_tests_properties(test_code_generator PROPERTIES TIMEOUT 120) endif() diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc index 5b125030a7a77..a8a09d690239c 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/fusion_group/code_generator.h" + #include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h" #include "paddle/fluid/framework/ir/fusion_group/cuda_resources.h" diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc index 18bd6d623b7ea..650ed965067ad 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/ir/fusion_group/operation.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index 7b6bbf0251001..a24a9af158ec0 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include diff --git a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc index 6fa3044affc21..5be4091ca8b3c 100644 --- a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc +++ b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h" + #include #include "paddle/fluid/framework/ir/fusion_group/operation.h" diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc index 85d34405c5e57..44df3a837f6d3 100644 --- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc +++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h" + #include "paddle/fluid/framework/ir/fusion_group/code_generator.h" #include "paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc index db22c03a7d9c0..402fad0e84cfa 100644 --- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h" - #include + +#include "paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc index 2b7a3e1899c76..7d1b7bafa1365 100644 --- a/paddle/fluid/framework/ir/fusion_group/operation.cc +++ b/paddle/fluid/framework/ir/fusion_group/operation.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/fusion_group/operation.h" + #include "paddle/fluid/framework/operator.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fusion_group/subgraph.h b/paddle/fluid/framework/ir/fusion_group/subgraph.h index 5a29e875aea61..1c334e70f1c30 100644 --- a/paddle/fluid/framework/ir/fusion_group/subgraph.h +++ b/paddle/fluid/framework/ir/fusion_group/subgraph.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/ir/fusion_group/operation.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_traits.h" diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc index 02c9d8e1c0c24..00d69c9d5d2b1 100644 --- a/paddle/fluid/framework/ir/generate_pass.cc +++ b/paddle/fluid/framework/ir/generate_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/generate_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { @@ -234,178 +235,183 @@ bool IsDuplicatePattern(const GraphPatternDetector::subgraph_t& subgraph, GraphPatternDetector::handle_t GetGenerateDelete( const PDPattern& pattern, const proto::PassDesc& pass_desc) { - GraphPatternDetector::handle_t handler = [&]( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - if (IsDuplicatePattern(subgraph, graph)) { - return; - } - // `var_node_maps` record the mapping of variable to the pattern subgraph. - std::map var_node_maps; - for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) { - Node* node = subgraph.at(pattern.RetrieveNode(var_map.pattern_var())); - const auto& iter = var_node_maps.find(var_map.replace_var()); - if (var_node_maps.end() == iter) { - // first node is input - var_node_maps.insert({var_map.replace_var(), node}); - } else { - // output node - for (Node* s_node : node->outputs) { - iter->second->outputs.push_back(s_node); - std::replace(s_node->inputs.begin(), s_node->inputs.end(), node, - iter->second); - s_node->Op()->RenameInput(node->Name(), iter->second->Name()); + GraphPatternDetector::handle_t handler = + [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + if (IsDuplicatePattern(subgraph, graph)) { + return; } - } - } - // Remove nodes that are intermediate. - std::unordered_set remove_nodes; - for (const std::unique_ptr& pdnode : pattern.nodes()) { - remove_nodes.emplace(subgraph.at(pdnode.get())); - } - for (auto iter : var_node_maps) { - remove_nodes.erase(iter.second); - } - GraphSafeRemoveNodes(graph, remove_nodes); - }; + // `var_node_maps` record the mapping of variable to the pattern + // subgraph. + std::map var_node_maps; + for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) { + Node* node = subgraph.at(pattern.RetrieveNode(var_map.pattern_var())); + const auto& iter = var_node_maps.find(var_map.replace_var()); + if (var_node_maps.end() == iter) { + // first node is input + var_node_maps.insert({var_map.replace_var(), node}); + } else { + // output node + for (Node* s_node : node->outputs) { + iter->second->outputs.push_back(s_node); + std::replace(s_node->inputs.begin(), s_node->inputs.end(), node, + iter->second); + s_node->Op()->RenameInput(node->Name(), iter->second->Name()); + } + } + } + // Remove nodes that are intermediate. + std::unordered_set remove_nodes; + for (const std::unique_ptr& pdnode : pattern.nodes()) { + remove_nodes.emplace(subgraph.at(pdnode.get())); + } + for (auto iter : var_node_maps) { + remove_nodes.erase(iter.second); + } + GraphSafeRemoveNodes(graph, remove_nodes); + }; return handler; } GraphPatternDetector::handle_t GetGenerateRewrite( const PDPattern& pattern, const proto::PassDesc& pass_desc) { - GraphPatternDetector::handle_t handler = [&]( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - if (IsDuplicatePattern(subgraph, graph)) { - return; - } - for (const auto& condition : pass_desc.var_attr_conditions()) { - if (condition.has_condition_attr()) { - Node* node = - subgraph.at(pattern.RetrieveNode(condition.attr().var_name())); - Attribute node_attr = GetVarAttrValue(node->Var(), condition.attr()); - Attribute condition_attr; - if (condition.condition_attr().role() == - proto::PassDesc_RoleType_kVariable) { - Node* condition_node = - subgraph.at(pattern.RetrieveNode(condition.attr().var_name())); - condition_attr = GetVarAttrValue(condition_node->Var(), - condition.condition_attr()); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("Unimplemented for operation.")); - } - bool check_failed = false; - if (condition.type() == proto::PassDesc_ConditionType_kEQ) { - check_failed = !(node_attr == condition_attr); - } - if (check_failed) { - VLOG(3) << "Check var [" << node->Name() << "] with attr [" - << condition.attr().name() << "] failed, skip this pattern."; + GraphPatternDetector::handle_t handler = + [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + if (IsDuplicatePattern(subgraph, graph)) { return; } - } - } - // `var_node_maps` record the mapping of variable to the pattern subgraph. - std::map var_node_maps; - for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) { - Node* node = subgraph.at(pattern.RetrieveNode(var_map.pattern_var())); - var_node_maps.insert({var_map.replace_var(), node}); - } - // Traverse all operators to create subgraph. - for (int index = 0; index < pass_desc.replace_size(); ++index) { - const proto::OpDesc& op = pass_desc.replace(index); - OpDesc op_desc; - std::vector in_nodes, out_nodes; - op_desc.SetType(op.type()); - // Create Nodes for inputs of current operator. - for (const proto::OpDesc::Var& var : op.inputs()) { - std::vector arguments; - for (const std::string& argument : var.arguments()) { - // The input may be mapped on the operator of pattern subgraph. - Node* node = nullptr; - auto iter = var_node_maps.find(argument); - if (var_node_maps.end() == iter) { - VarDesc var_desc(patterns::UniqueKey(argument)); - node = graph->CreateVarNode(&var_desc); - var_node_maps.insert({argument, node}); - } else { - node = iter->second; - } - in_nodes.push_back(node); - arguments.push_back(node->Name()); - } - op_desc.SetInput(var.parameter(), arguments); - } - // Create Nodes for outputs of current operator. - for (const proto::OpDesc::Var& var : op.outputs()) { - std::vector arguments; - for (const std::string& argument : var.arguments()) { - // The output may be mapped on the operator of pattern subgraph. - Node* node = nullptr; - auto iter = var_node_maps.find(argument); - if (var_node_maps.end() == iter) { - VarDesc var_desc(patterns::UniqueKey(argument)); - node = graph->CreateVarNode(&var_desc); - var_node_maps.insert({argument, node}); - } else { - if (in_nodes.end() == - std::find(in_nodes.begin(), in_nodes.end(), iter->second)) { - node = iter->second; + for (const auto& condition : pass_desc.var_attr_conditions()) { + if (condition.has_condition_attr()) { + Node* node = + subgraph.at(pattern.RetrieveNode(condition.attr().var_name())); + Attribute node_attr = + GetVarAttrValue(node->Var(), condition.attr()); + Attribute condition_attr; + if (condition.condition_attr().role() == + proto::PassDesc_RoleType_kVariable) { + Node* condition_node = subgraph.at( + pattern.RetrieveNode(condition.attr().var_name())); + condition_attr = GetVarAttrValue(condition_node->Var(), + condition.condition_attr()); } else { - node = graph->CreateVarNode(iter->second->Var()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unimplemented for operation.")); + } + bool check_failed = false; + if (condition.type() == proto::PassDesc_ConditionType_kEQ) { + check_failed = !(node_attr == condition_attr); + } + if (check_failed) { + VLOG(3) << "Check var [" << node->Name() << "] with attr [" + << condition.attr().name() + << "] failed, skip this pattern."; + return; } } - out_nodes.push_back(node); - arguments.push_back(node->Name()); } - op_desc.SetOutput(var.parameter(), arguments); - } - // Set attribute for current operator. - for (const proto::OpDesc::Attr& attr : op.attrs()) { - op_desc.SetAttr(attr.name(), GetAttrValue(attr)); - } - for (const auto& attr_map : pass_desc.op_attr_maps()) { - if (attr_map.replace_attr().op_index() == index) { - Attribute attr; - if (attr_map.pattern_attr().role() == - proto::PassDesc_RoleType_kVariable) { - Node* condition_node = subgraph.at( - pattern.RetrieveNode(attr_map.pattern_attr().var_name())); - attr = - GetVarAttrValue(condition_node->Var(), attr_map.pattern_attr()); - } else { - Node* condition_node = subgraph.at(pattern.RetrieveNode( - std::to_string(attr_map.pattern_attr().op_index()))); - attr = - GetOpAttrValue(condition_node->Op(), attr_map.pattern_attr()); + // `var_node_maps` record the mapping of variable to the pattern + // subgraph. + std::map var_node_maps; + for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) { + Node* node = subgraph.at(pattern.RetrieveNode(var_map.pattern_var())); + var_node_maps.insert({var_map.replace_var(), node}); + } + // Traverse all operators to create subgraph. + for (int index = 0; index < pass_desc.replace_size(); ++index) { + const proto::OpDesc& op = pass_desc.replace(index); + OpDesc op_desc; + std::vector in_nodes, out_nodes; + op_desc.SetType(op.type()); + // Create Nodes for inputs of current operator. + for (const proto::OpDesc::Var& var : op.inputs()) { + std::vector arguments; + for (const std::string& argument : var.arguments()) { + // The input may be mapped on the operator of pattern subgraph. + Node* node = nullptr; + auto iter = var_node_maps.find(argument); + if (var_node_maps.end() == iter) { + VarDesc var_desc(patterns::UniqueKey(argument)); + node = graph->CreateVarNode(&var_desc); + var_node_maps.insert({argument, node}); + } else { + node = iter->second; + } + in_nodes.push_back(node); + arguments.push_back(node->Name()); + } + op_desc.SetInput(var.parameter(), arguments); + } + // Create Nodes for outputs of current operator. + for (const proto::OpDesc::Var& var : op.outputs()) { + std::vector arguments; + for (const std::string& argument : var.arguments()) { + // The output may be mapped on the operator of pattern subgraph. + Node* node = nullptr; + auto iter = var_node_maps.find(argument); + if (var_node_maps.end() == iter) { + VarDesc var_desc(patterns::UniqueKey(argument)); + node = graph->CreateVarNode(&var_desc); + var_node_maps.insert({argument, node}); + } else { + if (in_nodes.end() == + std::find(in_nodes.begin(), in_nodes.end(), iter->second)) { + node = iter->second; + } else { + node = graph->CreateVarNode(iter->second->Var()); + } + } + out_nodes.push_back(node); + arguments.push_back(node->Name()); + } + op_desc.SetOutput(var.parameter(), arguments); + } + // Set attribute for current operator. + for (const proto::OpDesc::Attr& attr : op.attrs()) { + op_desc.SetAttr(attr.name(), GetAttrValue(attr)); } - if (attr_map.has_operation()) { - Attribute operation = GetAttrValue(attr_map.operation().value()); - attr = boost::apply_visitor( - operation_visitor(attr_map.operation().type()), attr, - operation); + for (const auto& attr_map : pass_desc.op_attr_maps()) { + if (attr_map.replace_attr().op_index() == index) { + Attribute attr; + if (attr_map.pattern_attr().role() == + proto::PassDesc_RoleType_kVariable) { + Node* condition_node = subgraph.at( + pattern.RetrieveNode(attr_map.pattern_attr().var_name())); + attr = GetVarAttrValue(condition_node->Var(), + attr_map.pattern_attr()); + } else { + Node* condition_node = subgraph.at(pattern.RetrieveNode( + std::to_string(attr_map.pattern_attr().op_index()))); + attr = GetOpAttrValue(condition_node->Op(), + attr_map.pattern_attr()); + } + if (attr_map.has_operation()) { + Attribute operation = + GetAttrValue(attr_map.operation().value()); + attr = boost::apply_visitor( + operation_visitor(attr_map.operation().type()), attr, + operation); + } + op_desc.SetAttr(attr_map.replace_attr().name(), attr); + } + } + // Create a Node for current operator. + Node* op_node = graph->CreateOpNode(&op_desc); + for (Node* node : in_nodes) { + IR_NODE_LINK_TO(node, op_node); + } + for (Node* node : out_nodes) { + IR_NODE_LINK_TO(op_node, node); } - op_desc.SetAttr(attr_map.replace_attr().name(), attr); } - } - // Create a Node for current operator. - Node* op_node = graph->CreateOpNode(&op_desc); - for (Node* node : in_nodes) { - IR_NODE_LINK_TO(node, op_node); - } - for (Node* node : out_nodes) { - IR_NODE_LINK_TO(op_node, node); - } - } - // Remove nodes that are intermediate. - std::unordered_set remove_nodes; - for (const std::unique_ptr& pdnode : pattern.nodes()) { - remove_nodes.emplace(subgraph.at(pdnode.get())); - } - for (auto iter : var_node_maps) { - remove_nodes.erase(iter.second); - } - GraphSafeRemoveNodes(graph, remove_nodes); - }; + // Remove nodes that are intermediate. + std::unordered_set remove_nodes; + for (const std::unique_ptr& pdnode : pattern.nodes()) { + remove_nodes.emplace(subgraph.at(pdnode.get())); + } + for (auto iter : var_node_maps) { + remove_nodes.erase(iter.second); + } + GraphSafeRemoveNodes(graph, remove_nodes); + }; return handler; } diff --git a/paddle/fluid/framework/ir/generate_pass_tester.cc b/paddle/fluid/framework/ir/generate_pass_tester.cc index 6876dde50c157..7e98b11215a75 100644 --- a/paddle/fluid/framework/ir/generate_pass_tester.cc +++ b/paddle/fluid/framework/ir/generate_pass_tester.cc @@ -12,16 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/generate_pass.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/generate_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" REGISTER_GENERATE_PASS(generate_fc_fuse) { paddle::framework::ir::PassPairs pass_pairs; for (bool with_relu : {true, false}) { // pattern - SUBGRAPH_(pattern) = - [ subgraph = &pattern, with_relu ](VAR_(x), VAR_(y), VAR_(z)) { + SUBGRAPH_(pattern) = [subgraph = &pattern, with_relu](VAR_(x), VAR_(y), + VAR_(z)) { VLOG(3) << "exec lambda func."; auto mul = OP_(mul)({{"X", x}, {"Y", y}}).Out("Out"); auto ewadd = OP_(elementwise_add)({{"X", mul}, {"Y", z}}).Out("Out"); @@ -32,8 +32,8 @@ REGISTER_GENERATE_PASS(generate_fc_fuse) { } }; // replace - SUBGRAPH_(replace) = - [ subgraph = &replace, with_relu ](VAR_(x), VAR_(y), VAR_(z)) { + SUBGRAPH_(replace) = [subgraph = &replace, with_relu](VAR_(x), VAR_(y), + VAR_(z)) { auto& fc = OP_(fc)({{"Input", x}, {"W", y}, {"Bias", z}}); return fc.Out("Out"); }; diff --git a/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc index ac580b99b5c95..8e58231e98681 100644 --- a/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc @@ -16,9 +16,9 @@ #include #include + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_proto_maker.h" - #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index f5f6f3ecb855c..acf8f6ec6435b 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/ir/graph.h" + #include -#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/operator.h" PADDLE_DEFINE_EXPORTED_bool(convert_all_blocks, true, diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 10645f08dc3ba..40a6fbbade80e 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index ed7aa451d134c..d4c7a607db371 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/graph_helper.h" + #include #include + #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" @@ -421,8 +423,9 @@ std::vector TopologySortGraphByDescOrder(const Graph &graph) { DescOrderComparator> adj_list = BuildOperationAdjList(graph); PADDLE_ENFORCE_EQ(HasCircleInternal(adj_list, nullptr), - false, platform::errors::InvalidArgument( - "Generated graph shouldn't contain cycle.")); + false, + platform::errors::InvalidArgument( + "Generated graph shouldn't contain cycle.")); std::unordered_set visited; std::vector ret; for (auto adj : adj_list) { diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc index 0a2dcfed000c9..5972cd40817ac 100644 --- a/paddle/fluid/framework/ir/graph_helper_test.cc +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/graph.h" -#include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph_helper.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index f7c1a68c826f0..ca5a82708c554 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/graph_pattern_detector.h" + #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/operator.h" @@ -70,8 +71,9 @@ void PDPattern::AddEdge(PDNode *a, PDNode *b) { a, platform::errors::NotFound("PDNode %s is not found.", a->name())); PADDLE_ENFORCE_NOT_NULL( b, platform::errors::NotFound("PDNode %s is not found.", b->name())); - PADDLE_ENFORCE_NE(a, b, platform::errors::PermissionDenied( - "Cannot connect the same node in the graph.")); + PADDLE_ENFORCE_NE(a, b, + platform::errors::PermissionDenied( + "Cannot connect the same node in the graph.")); edges_.emplace_back(a, b); } @@ -2631,8 +2633,10 @@ PDNode *patterns::Bfloat16Placement::operator()( PDNode *patterns::OrphanedBfloat16::operator()() { auto *prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); prev_op->assert_more([&](Node *node) { - return node->Op()->GetAttrIfExists("mkldnn_data_type") == - "float32"; + bool data_type_is_missing = !node->Op()->HasAttr("mkldnn_data_type"); + bool data_type_is_fp32 = node->Op()->GetAttrIfExists( + "mkldnn_data_type") == "float32"; + return data_type_is_missing || data_type_is_fp32; }); auto *prev_out = pattern->NewNode(prev_out_repr())->AsOutput(); @@ -2645,8 +2649,10 @@ PDNode *patterns::OrphanedBfloat16::operator()() { auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); next_op->assert_more([&](Node *node) { - return node->Op()->GetAttrIfExists("mkldnn_data_type") == - "float32"; + bool data_type_is_missing = !node->Op()->HasAttr("mkldnn_data_type"); + bool data_type_is_fp32 = node->Op()->GetAttrIfExists( + "mkldnn_data_type") == "float32"; + return data_type_is_missing || data_type_is_fp32; }); prev_op->LinksTo({prev_out}); @@ -3058,11 +3064,10 @@ PDNode *patterns::ReshapeTransposeMatmulPattern::operator()( transpose_out->assert_is_only_output_of_op("transpose2"); auto transpose_xshape = - with_transpose_xshape - ? pattern->NewNode(transpose_xshape_repr()) - ->AsIntermediate() - ->assert_is_op_output("transpose2", "XShape") - : nullptr; + with_transpose_xshape ? pattern->NewNode(transpose_xshape_repr()) + ->AsIntermediate() + ->assert_is_op_output("transpose2", "XShape") + : nullptr; auto matmul_out = pattern->NewNode(matmul_out_repr()) ->AsOutput() diff --git a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc index 5ac5a5d983992..b02b2e13edc97 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc @@ -152,12 +152,12 @@ TEST(GraphPatternDetecter, MultiSubgraph) { x.mutable_pattern()->AddEdge(any_var, any_op1); int count = 0; - GraphPatternDetector::handle_t handle = [&]( - const GraphPatternDetector::subgraph_t& s, Graph* g) { - LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> " - << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name(); - count++; - }; + GraphPatternDetector::handle_t handle = + [&](const GraphPatternDetector::subgraph_t& s, Graph* g) { + LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> " + << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name(); + count++; + }; x(&graph, handle); diff --git a/paddle/fluid/framework/ir/graph_printer.h b/paddle/fluid/framework/ir/graph_printer.h index 76b07f0d65309..1b0e059f122b5 100644 --- a/paddle/fluid/framework/ir/graph_printer.h +++ b/paddle/fluid/framework/ir/graph_printer.h @@ -15,11 +15,13 @@ #pragma once #include + #include #include #include #include #include + #include "paddle/fluid/framework/details/multi_devices_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index 1ff67ae0fe0d9..db18a735ce2dd 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" + #include "gtest/gtest.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc index 3ad591c6dff04..f57cdd9d9746c 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.cc +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include + #include #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc index b06314563025a..36bc3e6dd781b 100644 --- a/paddle/fluid/framework/ir/graph_traits.cc +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/graph_traits.h" + #include #include -#include "paddle/fluid/framework/ir/graph_traits.h" - namespace paddle { namespace framework { namespace ir { @@ -76,21 +76,22 @@ NodesDFSIterator::NodesDFSIterator(const std::vector &source) { } NodesDFSIterator::NodesDFSIterator(NodesDFSIterator &&other) noexcept - : stack_(std::move(other.stack_)), - visited_(std::move(other.visited_)) {} + : stack_(std::move(other.stack_)), visited_(std::move(other.visited_)) {} NodesDFSIterator::NodesDFSIterator(const NodesDFSIterator &other) : stack_(other.stack_), visited_(other.visited_) {} Node &NodesDFSIterator::operator*() { - PADDLE_ENFORCE_EQ(stack_.empty(), false, platform::errors::OutOfRange( - "The iterator exceeds range.")); + PADDLE_ENFORCE_EQ( + stack_.empty(), false, + platform::errors::OutOfRange("The iterator exceeds range.")); return *stack_.top(); } NodesDFSIterator &NodesDFSIterator::operator++() { - PADDLE_ENFORCE_EQ(stack_.empty(), false, platform::errors::OutOfRange( - "The iterator exceeds range.")); + PADDLE_ENFORCE_EQ( + stack_.empty(), false, + platform::errors::OutOfRange("The iterator exceeds range.")); visited_.insert(stack_.top()); auto *cur = stack_.top(); stack_.pop(); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 7311eb4b91df8..da48d1d19b60a 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/graph_viz_pass.h" + #include + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_printer.h" #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc index 6b91ea4e360df..3d60148c170f9 100644 --- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/identity_scale_op_clean_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -46,42 +47,42 @@ void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const { scale_op->LinksFrom({scale_in}).LinksTo({scale_out}); int found_subgraph_count = 0; - GraphPatternDetector::handle_t handler = [&]( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* scale_op_var = subgraph.at(scale_op); - Node* scale_in_var = subgraph.at(scale_in); - Node* scale_out_var = subgraph.at(scale_out); - const std::string scale_in_name = scale_in_var->Name(); - const std::string scale_out_name = scale_out_var->Name(); - // Remove links in graph - GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var}); - // Modify pre_op_desc - // Link pre_op directly to scale_out - for (auto& node : graph->Nodes()) { - if (node->IsOp()) { - auto* op_desc = node->Op(); - auto out_vars_map = op_desc->Outputs(); - for (auto out_var_map : out_vars_map) { - auto names = out_var_map.second; - bool reset = false; - for (size_t i = 0; i < names.size(); i++) { - if (names[i] == scale_in_name) { - reset = true; - names[i] = scale_out_name; - break; + GraphPatternDetector::handle_t handler = + [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + Node* scale_op_var = subgraph.at(scale_op); + Node* scale_in_var = subgraph.at(scale_in); + Node* scale_out_var = subgraph.at(scale_out); + const std::string scale_in_name = scale_in_var->Name(); + const std::string scale_out_name = scale_out_var->Name(); + // Remove links in graph + GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var}); + // Modify pre_op_desc + // Link pre_op directly to scale_out + for (auto& node : graph->Nodes()) { + if (node->IsOp()) { + auto* op_desc = node->Op(); + auto out_vars_map = op_desc->Outputs(); + for (auto out_var_map : out_vars_map) { + auto names = out_var_map.second; + bool reset = false; + for (size_t i = 0; i < names.size(); i++) { + if (names[i] == scale_in_name) { + reset = true; + names[i] = scale_out_name; + break; + } + } + if (reset) { + op_desc->SetOutput(out_var_map.first, names); + op_desc->Flush(); + IR_NODE_LINK_TO(node, scale_out_var); + break; + } } } - if (reset) { - op_desc->SetOutput(out_var_map.first, names); - op_desc->Flush(); - IR_NODE_LINK_TO(node, scale_out_var); - break; - } } - } - } - found_subgraph_count++; - }; + found_subgraph_count++; + }; detector(graph, handler); AddStatis(found_subgraph_count); diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc index f1ee3c26b8f48..5c7373e1a77d8 100644 --- a/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc +++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc @@ -14,10 +14,9 @@ #include "paddle/fluid/framework/ir/ipu/avg_shard_pass.h" -#include "paddle/fluid/platform/device/ipu/ipu_backend.h" - #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc index ebe40c3ee204e..cbe57eae4c496 100644 --- a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/ipu/infer_shape_pass.h" + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc index a6b82089dc4df..df4ea7fac4b35 100644 --- a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc @@ -14,11 +14,10 @@ #include "paddle/fluid/framework/ir/ipu/inference_process_pass.h" -#include "paddle/fluid/platform/device/ipu/ipu_backend.h" -#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" - #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc index 4da913e7176ca..12d646e153b4f 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h" + #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/platform/device/ipu/ipu_backend.h" #include "paddle/fluid/platform/device/ipu/ipu_names.h" diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index bf0667aeafe60..d2444295544b9 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -11,9 +11,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/is_test_pass.h" - #include + +#include "paddle/fluid/framework/ir/is_test_pass.h" #ifdef _WIN32 #undef FALSE #undef TRUE diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc index 4b0dc4809f550..1b7b06213fe3c 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h" + #include #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h index 93b6396bf7f31..a72a59374f902 100644 --- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h @@ -29,31 +29,31 @@ class Node; class Graph; /* -* Remove the sum op of all gradients of the backward op. -* And remove the dependecies of the optimizer related to the -* same backward op. -* -* Before this pass: -* -* forward_op1 forward_op2 -* | | -* grad_op1 grad_op2 -* \ / -* \ / -* sum_op -* | -* sgd_op -* -* After this pass: -* forward_op1 forward_op2 -* | | -* grad_op1 grad_op2 -* | | -* sgd_op1 sgd_op2 -* -* sgd_op1 and sgd_op2 will update the same weight which holds the same -* memory, so we could benefits from the acceleration -*/ + * Remove the sum op of all gradients of the backward op. + * And remove the dependecies of the optimizer related to the + * same backward op. + * + * Before this pass: + * + * forward_op1 forward_op2 + * | | + * grad_op1 grad_op2 + * \ / + * \ / + * sum_op + * | + * sgd_op + * + * After this pass: + * forward_op1 forward_op2 + * | | + * grad_op1 grad_op2 + * | | + * sgd_op1 sgd_op2 + * + * sgd_op1 and sgd_op2 will update the same weight which holds the same + * memory, so we could benefits from the acceleration + */ class LockFreeOptimizePass : public Pass { public: virtual ~LockFreeOptimizePass() {} diff --git a/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc b/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc index 2335e5eee01db..a4bab58506e82 100644 --- a/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc +++ b/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc @@ -16,9 +16,9 @@ #include #include + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_proto_maker.h" - #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt index 25b07ddf41414..32d02902e8643 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt @@ -1,24 +1,80 @@ -cc_library(op_graph_view SRCS op_graph_view.cc DEPS op_handle_base) -cc_library(conditional_block_op_eager_deletion_pass SRCS conditional_block_op_eager_deletion_pass.cc DEPS conditional_block_op_helper graph_helper pass computation_op_handle) -cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle) -cc_library(recurrent_op_eager_deletion_pass SRCS recurrent_op_eager_deletion_pass.cc DEPS recurrent_op_helper graph_helper pass computation_op_handle) -cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle var_handle) -cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) +cc_library( + op_graph_view + SRCS op_graph_view.cc + DEPS op_handle_base) +cc_library( + conditional_block_op_eager_deletion_pass + SRCS conditional_block_op_eager_deletion_pass.cc + DEPS conditional_block_op_helper graph_helper pass computation_op_handle) +cc_library( + while_op_eager_deletion_pass + SRCS while_op_eager_deletion_pass.cc + DEPS while_op_helper graph_helper pass computation_op_handle) +cc_library( + recurrent_op_eager_deletion_pass + SRCS recurrent_op_eager_deletion_pass.cc + DEPS recurrent_op_helper graph_helper pass computation_op_handle) +cc_library( + reference_count_pass_helper + SRCS reference_count_pass_helper.cc + DEPS garbage_collector computation_op_handle var_handle) +cc_library( + reference_count_pass + SRCS reference_count_pass.cc + DEPS computation_op_handle graph graph_helper pass op_graph_view + reference_count_pass_helper) -SET(EAGER_DELETETION_PASS_DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass conditional_block_op_eager_deletion_pass while_op_eager_deletion_pass recurrent_op_eager_deletion_pass reference_count_pass_helper) -if (WITH_CINN) - cc_library(share_varinfo_into_cinn_pass SRCS share_varinfo_into_cinn_pass.cc DEPS pass enforce graph_helper computation_op_handle eager_deletion_op_handle cinn_compiler) - cc_test(share_varinfo_into_cinn_pass_test SRCS share_varinfo_into_cinn_pass_test.cc DEPS share_varinfo_into_cinn_pass parallel_executor cinn_compiler elementwise_add_op mul_op cinn_launch_op) +set(EAGER_DELETETION_PASS_DEPS + computation_op_handle + eager_deletion_op_handle + graph + graph_helper + pass + conditional_block_op_eager_deletion_pass + while_op_eager_deletion_pass + recurrent_op_eager_deletion_pass + reference_count_pass_helper) +if(WITH_CINN) + cc_library( + share_varinfo_into_cinn_pass + SRCS share_varinfo_into_cinn_pass.cc + DEPS pass enforce graph_helper computation_op_handle + eager_deletion_op_handle cinn_compiler) + cc_test( + share_varinfo_into_cinn_pass_test + SRCS share_varinfo_into_cinn_pass_test.cc + DEPS share_varinfo_into_cinn_pass parallel_executor cinn_compiler + elementwise_add_op mul_op cinn_launch_op) list(APPEND EAGER_DELETETION_PASS_DEPS share_varinfo_into_cinn_pass) endif() -cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS ${EAGER_DELETETION_PASS_DEPS}) +cc_library( + eager_deletion_pass + SRCS eager_deletion_pass.cc + DEPS ${EAGER_DELETETION_PASS_DEPS}) -cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handle reference_count_pass_helper share_tensor_buffer_op_handle graph pass multi_devices_helper) +cc_library( + memory_reuse_pass + SRCS memory_reuse_pass.cc + DEPS computation_op_handle reference_count_pass_helper + share_tensor_buffer_op_handle graph pass multi_devices_helper) -cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass executor_gc_helper) -cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass) +cc_library( + buffer_shared_inplace_op_pass + SRCS buffer_shared_inplace_op_pass.cc + DEPS memory_reuse_pass executor_gc_helper) +cc_library( + buffer_shared_cross_op_memory_reuse_pass + SRCS buffer_shared_cross_op_memory_reuse_pass.cc + DEPS memory_reuse_pass) -cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass) +cc_library( + inplace_addto_op_pass + SRCS inplace_addto_op_pass.cc + DEPS memory_reuse_pass) -cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op eigen_function) +cc_test( + test_reference_count_pass_last_lived_ops + SRCS test_reference_count_pass_last_lived_ops.cc + DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op + eigen_function) diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc index b12b84d4a491b..090673b87ed8f 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc @@ -321,13 +321,15 @@ size_t BufferSharedCrossOpMemoryReusePass::ResolveDependencyBetween( } void BufferSharedCrossOpMemoryReusePass::BuildOpDependencyMap() const { - PADDLE_ENFORCE_EQ(ops_.empty(), true, platform::errors::InvalidArgument( - "Ops must be initialized here.")); + PADDLE_ENFORCE_EQ( + ops_.empty(), true, + platform::errors::InvalidArgument("Ops must be initialized here.")); PADDLE_ENFORCE_EQ( op_to_idx_.empty(), true, platform::errors::InvalidArgument("Op to idx must be initialized here.")); - PADDLE_ENFORCE_EQ(deps_.empty(), true, platform::errors::InvalidArgument( - "Deps must be initialized here.")); + PADDLE_ENFORCE_EQ( + deps_.empty(), true, + platform::errors::InvalidArgument("Deps must be initialized here.")); // Toposort ops OpGraphView graph_view(ir::FilterByNodeWrapper(*graph_)); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc index 1ca6e989f275c..682a72c5729ac 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc @@ -166,8 +166,9 @@ static std::string GetFirstVarName(const OpDesc &op, const std::string &slot, static std::vector>> GetInplaceVars(const BlockDesc &block, bool use_cuda, const std::vector &skip_vars) { - PADDLE_ENFORCE_EQ(block.ID(), 0, platform::errors::Unimplemented( - "Inplace can only perform in block 0.")); + PADDLE_ENFORCE_EQ( + block.ID(), 0, + platform::errors::Unimplemented("Inplace can only perform in block 0.")); // only take block 0 gc_vars const auto op_gc_vars = GetEagerDeletionCleanVars(*block.Program(), skip_vars)[0]; diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h index e89734bacec36..8d593254f90fa 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h @@ -19,6 +19,7 @@ #include #include #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h index d6f286afc5590..b5506dd1dcbdd 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h @@ -136,13 +136,15 @@ void OpGraphView::BreadthFirstVisit(Callback &&callback) const { } } - PADDLE_ENFORCE_EQ(num_calls, op_num, platform::errors::InvalidArgument( - "There are unvisited ops.")); + PADDLE_ENFORCE_EQ( + num_calls, op_num, + platform::errors::InvalidArgument("There are unvisited ops.")); PADDLE_ENFORCE_EQ( visited_ops.size(), op_num, platform::errors::InvalidArgument("There are unvisited ops.")); - PADDLE_ENFORCE_EQ(op_deps.empty(), true, platform::errors::InvalidArgument( - "There are unvisited ops.")); + PADDLE_ENFORCE_EQ( + op_deps.empty(), true, + platform::errors::InvalidArgument("There are unvisited ops.")); } } // namespace ir diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc index 6077069ea747a..b1fdb5e2160e0 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc @@ -26,9 +26,9 @@ namespace paddle { namespace framework { namespace ir { +using paddle::operators::OpAndGradOpPair; using paddle::operators::OpVariant; using paddle::operators::OpVariantSet; -using paddle::operators::OpAndGradOpPair; void RecurrentOpEagerDeletionPass::ApplyImpl(Graph *graph) const { // Find all recurrent_op and recurrent_grad_op in graph diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc index 313b2cc33459e..3f88aaad57e26 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/ir/graph_helper.h" diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc index 88bf9e3876399..848b6e494ad67 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc index 4aa59d9196b1b..80f201d2d5afc 100644 --- a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h" + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -25,9 +26,10 @@ void MixedPrecisionConfigurePass::InsertCastOps( VLOG(3) << "Insert the cast op before and after the kernel that does not " "supports fp16 precision"; - auto update_cast_desc = [&]( - framework::OpDesc& desc, const std::string& x_name, - const std::string& out_name, const int in_dtype, const int out_dtype) { + auto update_cast_desc = [&](framework::OpDesc& desc, + const std::string& x_name, + const std::string& out_name, const int in_dtype, + const int out_dtype) { desc.SetType("cast"); desc.SetInput("X", {x_name}); desc.SetOutput("Out", {out_name}); diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc index 9f6cd8992dcb9..62145cb6a0fb1 100644 --- a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc index e13d44ac23222..b1b546f085cf8 100644 --- a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc @@ -34,7 +34,7 @@ void SetBatchNormAttrs(OpDesc* bn_op, bool is_test = true, bn_op->SetAttr("fuse_with_relu", false); bn_op->SetAttr("epsilon", 0.001f); } -} +} // namespace // ------------------------------ Test cases ----------------------------------- @@ -48,11 +48,12 @@ TEST(FuseBatchNormActOneDNNPass, ThrowIsTestTrainableStats) { auto prog = test::BuildProgramDesc( {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"}, {"scale", "bias"}); - auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"}, - {"Scale", "scale"}, - {"Bias", "bias"}, - {"Mean", "m"}, - {"Variance", "v"}}, + auto* bn_op = test::CreateOp(&prog, "batch_norm", + {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, {{"Y", "bn_y"}, {"MeanOut", "m_out"}, {"VarianceOut", "var_out"}, @@ -73,11 +74,12 @@ TEST(FuseBatchNormActOneDNNPass, ThrowIsTestTrainableStats) { TEST(FuseBatchNormActOneDNNPass, FuseIsTest) { auto prog = test::BuildProgramDesc({"x", "m", "v", "bn_y", "act_y"}, {"scale", "bias"}); - auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"}, - {"Scale", "scale"}, - {"Bias", "bias"}, - {"Mean", "m"}, - {"Variance", "v"}}, + auto* bn_op = test::CreateOp(&prog, "batch_norm", + {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, {{"Y", "bn_y"}}); SetBatchNormAttrs(bn_op, true, false); test::CreateOp(&prog, "relu", {{"X", "bn_y"}}, {{"Out", "act_y"}}, false); @@ -106,11 +108,12 @@ TEST(FuseBatchNormActOneDNNPass, ThrowTrainableStats) { auto prog = test::BuildProgramDesc( {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"}, {"scale", "bias"}); - auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"}, - {"Scale", "scale"}, - {"Bias", "bias"}, - {"Mean", "m"}, - {"Variance", "v"}}, + auto* bn_op = test::CreateOp(&prog, "batch_norm", + {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, {{"Y", "bn_y"}, {"MeanOut", "m_out"}, {"VarianceOut", "var_out"}, @@ -132,11 +135,12 @@ TEST(FuseBatchNormActOneDNNPass, AllAttrsFalse) { auto prog = test::BuildProgramDesc( {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"}, {"scale", "bias"}); - auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"}, - {"Scale", "scale"}, - {"Bias", "bias"}, - {"Mean", "m"}, - {"Variance", "v"}}, + auto* bn_op = test::CreateOp(&prog, "batch_norm", + {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, {{"Y", "bn_y"}, {"MeanOut", "m_out"}, {"VarianceOut", "var_out"}, @@ -158,11 +162,12 @@ TEST(FuseBatchNormActOneDNNPass, ThrowUseMkldnn) { auto prog = test::BuildProgramDesc( {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"}, {"scale", "bias"}); - auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"}, - {"Scale", "scale"}, - {"Bias", "bias"}, - {"Mean", "m"}, - {"Variance", "v"}}, + auto* bn_op = test::CreateOp(&prog, "batch_norm", + {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, {{"Y", "bn_y"}, {"MeanOut", "m_out"}, {"VarianceOut", "var_out"}, diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc index d7d0b988b551e..e19426d01d195 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h" + #include + #include #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h" #include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h index b0076c1b38cd4..26fb6e4978ff5 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc index 1fefab805b1d3..e3db85471766f 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h" - #include + #include + +#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h index a74d7443ee1fe..18e09173491da 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h @@ -23,8 +23,8 @@ namespace paddle { namespace framework { namespace ir { /* -* Fuse the Conv and Elementwise_add to a ConvBiasOp. -*/ + * Fuse the Conv and Elementwise_add to a ConvBiasOp. + */ class Graph; class ConvBiasFusePass : public FusePassBase { @@ -38,8 +38,8 @@ class ConvBiasFusePass : public FusePassBase { const std::string name_scope_{"conv_bias_mkldnn_fuse"}; }; /* -* Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp. -*/ + * Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp. + */ class Conv2DTransposeBiasFusePass : public ConvBiasFusePass { public: Conv2DTransposeBiasFusePass(); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc index e9850483ebe91..0e052debaeeb2 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" #include -#include "paddle/fluid/framework/naive_executor.h" -#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc index 6b648608ca1d2..7d165b1a38a46 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h" - #include + +#include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index eebc87f5d9988..58eec79344dd5 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -188,7 +188,8 @@ class DeQuantizer final : public Quanter { bool IsNotPermittedName(const std::string& output_name) const override { std::unordered_map> block_list{ {"layer_norm", - {"Mean", "Variance"}}}; // not used in inference in MKLDNN + {"Mean", "Variance"}}, // not used in inference in MKLDNN + {"fc", {"ResidualData"}}}; // artifical output, already dequantized std::vector blocked_outputs{"XShape"}; // blocklist for any op auto op_name = op->Name(); @@ -225,7 +226,7 @@ class DeQuantizer final : public Quanter { return Quanter::create_quant_op(output_name, input_name); } }; -} +} // namespace using string::PrettyLogDetail; void CPUBFloat16Pass::ApplyImpl(ir::Graph* graph) const { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index a61c043b58065..452212664ec93 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" + #include #include #include -#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" #include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 912c16288c2b9..fb36365ac54ef 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" // NOLINT #include + #include +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" // NOLINT #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc index 350fad2c672d4..f6e5279ed23af 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h" - #include + +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc index 06940b38ea8e0..979c601ac04c9 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" - #include +#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc index b7f7a8071d214..2a8a248a99faf 100644 --- a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc index 7fc8806452b88..afcd493f92f56 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc index 59d81cb86474d..4b158ccc5a8b0 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc @@ -32,7 +32,9 @@ TEST(FuseFCActOneDNNPass, ThrowUseMkldnn) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}, false); test::CreateOp(&prog, "gelu", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, false); @@ -51,7 +53,9 @@ TEST(FuseFCActOneDNNPass, FuseWithGeluTanh) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}); auto* act_op = test::CreateOp(&prog, "gelu", {{"Input", "fc_y"}}, @@ -83,7 +87,9 @@ TEST(FuseFCActOneDNNPass, FuseWithGeluErf) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}); auto* act_op = test::CreateOp(&prog, "gelu", {{"Input", "fc_y"}}, @@ -115,7 +121,9 @@ TEST(FuseFCActOneDNNPass, FuseWithGeluAuto) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}); test::CreateOp(&prog, "gelu", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, false); @@ -145,7 +153,9 @@ TEST(FuseFCActOneDNNPass, FuseWithTanh) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}); test::CreateOp(&prog, "tanh", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, false); @@ -175,7 +185,9 @@ TEST(FuseFCActOneDNNPass, FuseWithSigmoid) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}); test::CreateOp(&prog, "sigmoid", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, @@ -206,7 +218,9 @@ TEST(FuseFCActOneDNNPass, FuseWithMish) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}); test::CreateOp(&prog, "mish", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, false); @@ -236,7 +250,9 @@ TEST(FuseFCActOneDNNPass, FuseWithHardSwish) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}); test::CreateOp(&prog, "hard_swish", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc index 2e62597f2ee29..60856512779ff 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h" + #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc index 678a8fb4a6955..a5481f5c6f30e 100644 --- a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc @@ -129,17 +129,13 @@ void Int8ScaleCalculationMkldnnPass::ApplyImpl(ir::Graph* graph) const { bool has_activation = !conv_op->Op()->GetAttrIfExists("fuse_activation").empty(); float activation_scale = - force_fp32_output - ? 1.0f - : has_activation - ? conv_op->Op()->GetAttrIfExists("Scale_out") - : 1.0f; + force_fp32_output ? 1.0f + : has_activation ? conv_op->Op()->GetAttrIfExists("Scale_out") + : 1.0f; auto scale_out_data = - force_fp32_output - ? 1.0f - : has_activation - ? 1.0f - : conv_op->Op()->GetAttrIfExists("Scale_out"); + force_fp32_output ? 1.0f + : has_activation ? 1.0f + : conv_op->Op()->GetAttrIfExists("Scale_out"); float sum_scale = fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc index 804d04e35f690..9d3940c96644b 100644 --- a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h" #include +#include "paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h" + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc index 4eb532b47cb4b..1ed36e06fb19f 100644 --- a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.h" + #include #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc index 34a35877a7f25..f6c99a477bcd8 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc @@ -13,8 +13,11 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h" + #include + #include + #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc index ed99989cf382f..ddb9e717392e1 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc index dcf4664d963da..6e106fa9dae5f 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h" + #include + #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 4236dc55d5186..06e0db4c93ea0 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include -#include - #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc index c4770a322db50..1ca9e76f79d6f 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h" #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h" #include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc index d2763bd6a6dc0..ae8dbceb7a64c 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc @@ -13,12 +13,14 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" + #include #include #include #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h index 44b6d110db82c..880630055e916 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 7df957b2c0eca..7f4e5d32536a0 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" - #include -#include #include +#include +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h index 505bb2739e1d4..99a55b26e99db 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc index 4012e04f7d2af..671ad4c1c4b2f 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" - #include + #include +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc index 76a0c883c8923..73089df571765 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h" + #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h index 70f88104b4b52..cf53ecec9262e 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass_tester.cc index 7b6681ff96784..60890336b3052 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass_tester.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h" #include +#include "paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h" + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc index 7821501cc4b23..06125e51fb65e 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc @@ -13,10 +13,12 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h" + #include #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h index 546a3d6570b41..af58ae2bda49c 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass_tester.cc index 3738e3ebd68eb..2924401bc2e6a 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass_tester.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h" #include + #include +#include "paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h" + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 63e402cb52983..15100b23407b0 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h" + #include + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -124,10 +126,11 @@ void QuantDequantMkldnnPass::CollectInputScalesFromFake( auto* op_desc = op_node->Op(); const int bit_length = BOOST_GET_CONST(int, op_desc->GetAttr("bit_length")); - PADDLE_ENFORCE_EQ(bit_length, 8, platform::errors::InvalidArgument( - "Unsupported number quantization " - "bits: %d, only 8 is supported now.", - bit_length)); + PADDLE_ENFORCE_EQ(bit_length, 8, + platform::errors::InvalidArgument( + "Unsupported number quantization " + "bits: %d, only 8 is supported now.", + bit_length)); auto x_var_name = op_desc->Input("X")[0]; auto scale_name = op_desc->Input("InScale")[0]; diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h index a9442f707402d..5003e1878bfeb 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc index 96f575745a3a2..05b1d419f6f4a 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h" + #include #include #include + #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc index e688635646001..023dd6af7ee01 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.h" - -#include #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.cc index 203966dc682f5..ed57be12c78e3 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.h" + #include #include #include + #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc index 60f844ffc80ce..09bad959eb09f 100644 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h" #include +#include "paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h" + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc index bf603dc4bbcb9..a7e0f3a583441 100644 --- a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h" + #include -#include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc index fe42e8f96f851..86775e20aa73c 100644 --- a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include #include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h" diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc index 82d642264c2c4..cad92e3153b12 100644 --- a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc index 003a39f37d4a6..662dfb0f9d4f9 100644 --- a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h" - #include + #include + +#include "paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { @@ -52,43 +53,27 @@ void MainTest(const std::string& activation_type) { } } -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithTanh) { - MainTest("tanh") -} +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithTanh){MainTest("tanh")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithRelu) { - MainTest("relu") -} +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithRelu){MainTest("relu")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithLeakyRelu) { - MainTest("leaky_relu") -} +TEST(FuseSoftplusActivationOneDNNPass, + FuseSoftplusWithLeakyRelu){MainTest("leaky_relu")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSwish) { - MainTest("swish") -} +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSwish){MainTest("swish")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithHardswish) { - MainTest("hardswish") -} +TEST(FuseSoftplusActivationOneDNNPass, + FuseSoftplusWithHardswish){MainTest("hardswish")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSqrt) { - MainTest("sqrt") -} +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSqrt){MainTest("sqrt")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithAbs) { MainTest("abs") } +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithAbs){MainTest("abs")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithClip) { - MainTest("clip") -} +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithClip){MainTest("clip")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithGelu) { - MainTest("gelu") -} +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithGelu){MainTest("gelu")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithRelu6) { - MainTest("relu6") -} +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithRelu6){MainTest("relu6")} TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSigmoid) { MainTest("sigmoid") diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index 06af5eaec13bc..b849076935afe 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/multi_batch_merge_pass.h" #include + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt index fea12baf0651f..e97331bc87a45 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt @@ -1,7 +1,17 @@ -cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle op_graph_view multi_devices_helper) +cc_library( + modify_op_lock_and_record_event_pass + SRCS modify_op_lock_and_record_event_pass.cc + DEPS computation_op_handle scale_loss_grad_op_handle op_graph_view + multi_devices_helper) -cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) -cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) +cc_library( + multi_devices_graph_print_pass + SRCS multi_devices_graph_print_pass.cc + DEPS multi_devices_helper) +cc_library( + multi_devices_graph_check_pass + SRCS multi_devices_graph_check_pass.cc + DEPS multi_devices_helper) set(ALL_REDUCE_OP_HANDLES all_reduce_op_handle) set(ALL_REDUCE_OP_HANDLES grad_merge_all_reduce_op_handle) @@ -9,13 +19,46 @@ if(WITH_GPU AND WITH_DGC) list(APPEND ALL_REDUCE_OP_HANDLES sparse_all_reduce_op_handle) endif() -cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle - scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle ${ALL_REDUCE_OP_HANDLES} reduce_op_handle broadcast_op_handle fused_broadcast_op_handle) -cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) -cc_library(set_reader_device_info_utils SRCS set_reader_device_info_utils.cc DEPS graph graph_helper pass multi_devices_graph_pass) +cc_library( + multi_devices_graph_pass + SRCS multi_devices_graph_pass.cc + DEPS multi_devices_helper + computation_op_handle + scale_loss_grad_op_handle + rpc_op_handle + fetch_barrier_op_handle + ${ALL_REDUCE_OP_HANDLES} + reduce_op_handle + broadcast_op_handle + fused_broadcast_op_handle) +cc_library( + sequential_execution_pass + SRCS sequential_execution_pass.cc + DEPS graph graph_helper pass) +cc_library( + set_reader_device_info_utils + SRCS set_reader_device_info_utils.cc + DEPS graph graph_helper pass multi_devices_graph_pass) -cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle grad_merge_all_reduce_op_handle) -cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS all_reduce_op_handle graph graph_helper pass) -cc_library(backward_optimizer_op_deps_pass SRCS backward_optimizer_op_deps_pass.cc DEPS graph graph_helper pass) -cc_library(add_reader_dependency_pass SRCS add_reader_dependency_pass.cc DEPS graph graph_helper pass) -cc_library(fix_op_run_order_pass SRCS fix_op_run_order_pass.cc DEPS graph graph_helper multi_devices_helper pass op_handle_base eager_deletion_op_handle) +cc_library( + fuse_all_reduce_op_pass + SRCS fuse_all_reduce_op_pass.cc + DEPS graph graph_helper fused_all_reduce_op_handle + grad_merge_all_reduce_op_handle) +cc_library( + all_reduce_deps_pass + SRCS all_reduce_deps_pass.cc + DEPS all_reduce_op_handle graph graph_helper pass) +cc_library( + backward_optimizer_op_deps_pass + SRCS backward_optimizer_op_deps_pass.cc + DEPS graph graph_helper pass) +cc_library( + add_reader_dependency_pass + SRCS add_reader_dependency_pass.cc + DEPS graph graph_helper pass) +cc_library( + fix_op_run_order_pass + SRCS fix_op_run_order_pass.cc + DEPS graph graph_helper multi_devices_helper pass op_handle_base + eager_deletion_op_handle) diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc index abb1d062c96ef..b907869b4a38e 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/ir/pass.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc index 772b4c1c915cc..55b6389768cb4 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/op_handle_base.h" diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc index 484d09fd4441d..5189f410e3c70 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h" diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index 1b6245928d377..7180c3820c71e 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" + #include #include #include @@ -20,6 +21,7 @@ #include #include #include + #include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h" @@ -495,9 +497,9 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, "use_dgc=%d, use_grad_merge=%d", is_encoded, is_grad_merge)); - auto append_allreduce_op = [&]( - const std::vector &scopes, - const std::vector &places) -> details::OpHandleBase * { + auto append_allreduce_op = [&](const std::vector &scopes, + const std::vector &places) + -> details::OpHandleBase * { if (is_encoded) { #if defined(PADDLE_WITH_DGC) && \ (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) @@ -758,13 +760,14 @@ int BalanceVarSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { "and Parameter@Grad.", node->Name(), OpProtoAndCheckerMaker::OpRoleVarAttrName())); int dev_id = GetVarDeviceID(param_grad[1]); - PADDLE_ENFORCE_NE(dev_id, -1, platform::errors::NotFound( - "Can not find Device ID, for NodeName:%s, " - "NodeType:%s, Param:%s, Param@Grad:%s" - "For this fault, you can consult the " - "Paddle technical personnel for answer ", - node->Name(), node->Op()->Type(), - param_grad[0], param_grad[1])); + PADDLE_ENFORCE_NE( + dev_id, -1, + platform::errors::NotFound("Can not find Device ID, for NodeName:%s, " + "NodeType:%s, Param:%s, Param@Grad:%s" + "For this fault, you can consult the " + "Paddle technical personnel for answer ", + node->Name(), node->Op()->Type(), + param_grad[0], param_grad[1])); return dev_id; } @@ -956,10 +959,11 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, bool insert_op = false; if (OpHaveRole(*node, OpRole::kRPC)) { int op_dev_id = CreateRPCOp(result, node); - PADDLE_ENFORCE_NE(op_dev_id, -1, platform::errors::InvalidArgument( - "Can not schedule the RPC operator to " - "the right place. NodeName:%s.", - node->Name())); + PADDLE_ENFORCE_NE(op_dev_id, -1, + platform::errors::InvalidArgument( + "Can not schedule the RPC operator to " + "the right place. NodeName:%s.", + node->Name())); if (node->Op()->Type() == "recv") { auto recv_vars_attr = BOOST_GET_CONST(std::vector, diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h index c76f30016763a..7508074207768 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h @@ -46,7 +46,7 @@ class NCCLContextMap; class BKCLContextMap; class BKCLCommunicator; #endif -} +} // namespace platform namespace framework { class Scope; diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc index 09ef94c0826d7..c7b6e477fd5aa 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h" + #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 4a5947778056a..03d433f4db165 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -51,11 +51,12 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&]( - Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out, - Node* mul1_out, Node* mul2_out, Node* eltadd0_b, Node* eltadd1_b, - Node* eltadd2_b, Node* eltadd_qk_b, Node* reshape2, - Node* reshape2_qkv_out, Node* scale, Node* scale_out) { + auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, + Node* mul0_out, Node* mul1_out, Node* mul2_out, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, + Node* eltadd_qk_b, Node* reshape2, + Node* reshape2_qkv_out, Node* scale, + Node* scale_out) { auto scale_attr = BOOST_GET_CONST(float, scale->Op()->GetAttr("scale")); // auto scale_bias = BOOST_GET_CONST(float, scale->Op()->GetAttr("bias")); // bool after_scale = @@ -756,13 +757,14 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&]( - Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out, - Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w, - Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b, - Node* reshape2, Node* reshape2_qkv_out, Node* scale, Node* scale_out, - Node* softmax_qk, Node* eltadd0, Node* eltadd1, Node* eltadd2, - Node* matmul_qk, Node* reshape2_qkv) { + auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, + Node* mul0_out, Node* mul1_out, Node* mul2_out, + Node* mul0_w, Node* mul1_w, Node* mul2_w, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, + Node* eltadd_qk_b, Node* reshape2, + Node* reshape2_qkv_out, Node* scale, Node* scale_out, + Node* softmax_qk, Node* eltadd0, Node* eltadd1, + Node* eltadd2, Node* matmul_qk, Node* reshape2_qkv) { auto scale_attr = BOOST_GET_CONST(float, scale->Op()->GetAttr("scale")); // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H) @@ -1207,11 +1209,12 @@ int MultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&]( - Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out, - Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w, - Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b, - Node* reshape2, Node* reshape2_qkv_out, Node* matmul_qk) { + auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, + Node* mul0_out, Node* mul1_out, Node* mul2_out, + Node* mul0_w, Node* mul1_w, Node* mul2_w, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, + Node* eltadd_qk_b, Node* reshape2, + Node* reshape2_qkv_out, Node* matmul_qk) { auto scale_attr = BOOST_GET_CONST(float, matmul_qk->Op()->GetAttr("alpha")); // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H) diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc index b121436ee870b..858ebf68b40fa 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc @@ -9,8 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h" // NOLINT #include + +#include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h" // NOLINT #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/node_test.cc b/paddle/fluid/framework/ir/node_test.cc index 9c47df402bdf2..2d84162e13aa6 100644 --- a/paddle/fluid/framework/ir/node_test.cc +++ b/paddle/fluid/framework/ir/node_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/node.h" + #include "gtest/gtest.h" #include "paddle/fluid/framework/var_desc.h" diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc index 73a8691f9e269..e309e068563e5 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" + #include #include #include + #include "paddle/fluid/framework/op_def_api.h" #include "paddle/fluid/framework/op_info.h" diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.h b/paddle/fluid/framework/ir/op_compat_sensible_pass.h index e24294a03a28a..393a2fb9392d5 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass.h +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc index 756d3c2c77096..4b106d75f1c75 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 2c10a68188eb4..85eecbd014e96 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/pass.h" #include + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" @@ -90,9 +91,10 @@ static void MergePrograms(ProgramDesc *dst, const details::ProgramDescs &srcs, bool reverse = !append; auto create_var_visitor = [dst](const ProgramDesc &src) { - PADDLE_ENFORCE_EQ(src.Size(), 1, platform::errors::Unimplemented( - "MergePrograms can only support to " - "merge program with only one block.")); + PADDLE_ENFORCE_EQ( + src.Size(), 1, + platform::errors::Unimplemented("MergePrograms can only support to " + "merge program with only one block.")); const auto &src_block = src.Block(0); auto *dst_block = dst->MutableBlock(0); for (const auto *src_new_var : src_block.AllVars()) { diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc index 616ba7f1a9761..8c368a796ed10 100644 --- a/paddle/fluid/framework/ir/pass_test.cc +++ b/paddle/fluid/framework/ir/pass_test.cc @@ -84,8 +84,9 @@ TEST(PassTest, TestPassAttrCheck) { } catch (paddle::platform::EnforceNotMet& e) { exception = std::string(e.what()); } - std::string msg = "Invalid type for attritube test_pass_attr, expected: " + - try_type + ", actual: int"; + std::string msg = + "Invalid type for attritube test_pass_attr, expected: " + try_type + + ", actual: int"; ASSERT_TRUE(exception.find(msg) != exception.npos); } @@ -168,8 +169,9 @@ TEST(PassTest, TestPassAttrCheckConvertAllBlocks) { } catch (paddle::platform::EnforceNotMet& e) { exception = std::string(e.what()); } - std::string msg = "Invalid type for attritube test_pass_attr, expected: " + - try_type + ", actual: int"; + std::string msg = + "Invalid type for attritube test_pass_attr, expected: " + try_type + + ", actual: int"; ASSERT_TRUE(exception.find(msg) != exception.npos); } diff --git a/paddle/fluid/framework/ir/pass_test_util.cc b/paddle/fluid/framework/ir/pass_test_util.cc index 4d8965918f889..40dcb3cf1dbd8 100644 --- a/paddle/fluid/framework/ir/pass_test_util.cc +++ b/paddle/fluid/framework/ir/pass_test_util.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/pass_test_util.h" + #include #include #include @@ -23,7 +25,6 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/ir/pass_test_util.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/framework/ir/pass_tester_helper.h b/paddle/fluid/framework/ir/pass_tester_helper.h index acefde9df6854..ad58e4e4a0cf4 100644 --- a/paddle/fluid/framework/ir/pass_tester_helper.h +++ b/paddle/fluid/framework/ir/pass_tester_helper.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc index 35ba920060779..fd1b54f8c4d37 100644 --- a/paddle/fluid/framework/ir/placement_pass_base.cc +++ b/paddle/fluid/framework/ir/placement_pass_base.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/placement_pass_base.h" + #include + #include "paddle/fluid/framework/operator.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc index d6761d2e82ef3..929ffa2cadbef 100644 --- a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc @@ -430,13 +430,15 @@ void PrelnEmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const { FusePassBase::Init(name_scope_, graph); bool enable_int8 = Get("enable_int8"); - bool use_oss = Get("use_oss"); + bool use_varseqlen = Get("use_varseqlen"); bool with_interleaved = Get("with_interleaved"); bool with_dynamic_shape = Get("with_dynamic_shape"); - if (!(enable_int8 && use_oss && with_interleaved && with_dynamic_shape)) { + if (!(enable_int8 && use_varseqlen && with_interleaved && + with_dynamic_shape)) { VLOG(4) << "preln_embedding_eltwise_layernorm_fuse_pass need: use_trt, " "enable_int8, " - "use_oss, with_interleaved, with_dynamic_shape. Stop this pass, " + "use_varseqlen, with_interleaved, with_dynamic_shape. Stop this " + "pass, " "please reconfig."; return; } diff --git a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc index 978360d8f0a95..80e6c2b796798 100644 --- a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc @@ -43,8 +43,8 @@ struct PrelnSkipLayerNorm : public PatternBase { PATTERN_DECL_NODE(layer_norm); // declare variable node's name PATTERN_DECL_NODE( - elementwise_out); // (elementwise_input_x,elementwise_input_y) -> - // elementwise_out + elementwise_out); // (elementwise_input_x,elementwise_input_y) + // -> elementwise_out PATTERN_DECL_NODE(layer_norm_bias); PATTERN_DECL_NODE(layer_norm_scale); PATTERN_DECL_NODE(layer_norm_out); @@ -109,12 +109,13 @@ void PrelnSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { graph, platform::errors::PreconditionNotMet("graph should not be null.")); FusePassBase::Init("preln_skip_layernorm_fuse", graph); bool enable_int8 = Get("enable_int8"); - bool use_oss = Get("use_oss"); + bool use_varseqlen = Get("use_varseqlen"); bool with_interleaved = Get("with_interleaved"); bool with_dynamic_shape = Get("with_dynamic_shape"); - if (!(enable_int8 && use_oss && with_interleaved && with_dynamic_shape)) { + if (!(enable_int8 && use_varseqlen && with_interleaved && + with_dynamic_shape)) { VLOG(4) << "preln_skip_layernorm_fuse_pass need: use_trt, enable_int8, " - "use_oss, " + "use_varseqlen, " "with_interleaved, with_dynamic_shape. Stop this pass, please " "reconfig. "; return; diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc index 67dfe074dc075..ee9474f6fada0 100644 --- a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc +++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc @@ -22,6 +22,19 @@ namespace paddle { namespace framework { namespace ir { namespace patterns { +void EmbEltwiseLayernorm::operator()() { + // Create nodes for fused_embedding_eltwise_layernorm. + auto* emb_elt_layernorm_op = + pattern->NewNode(emb_elt_layernorm_op_repr()) + ->assert_is_op("fused_embedding_eltwise_layernorm"); + auto* emb_elt_layernorm_out = + pattern->NewNode(emb_elt_layernorm_out_repr()) + ->assert_is_op_output("fused_embedding_eltwise_layernorm", "Out"); + + // Add links for fused_embedding_eltwise_layernorm op. + emb_elt_layernorm_op->LinksTo({emb_elt_layernorm_out}); +} + void SkipLayernorm::operator()() { // Create nodes for skip_layernorm. auto* skip_layernorm_x = pattern->NewNode(skip_layernorm_x_repr()) @@ -59,16 +72,12 @@ void Fc::operator()() { auto* fc_input = pattern->NewNode(fc_input_repr())->assert_is_op_input("fc", "Input"); auto* fc_op = pattern->NewNode(fc_op_repr())->assert_is_op("fc"); - auto* fc_out = - pattern->NewNode(fc_out_repr())->assert_is_op_output("fc", "Out"); - - // Add links for fc op. - fc_op->LinksFrom({fc_input}).LinksTo({fc_out}); + fc_op->LinksFrom({fc_input}); } void Activation::operator()() { // Create nodes for activation. - std::unordered_set activation_ops{"relu", "sigmoid", "tanh"}; + std::unordered_set activation_ops{"relu", "sigmoid", "gelu"}; auto* activation_input = pattern->NewNode(activation_input_repr()) ->assert_is_ops_input(activation_ops); auto* activation_op = @@ -82,6 +91,18 @@ void Activation::operator()() { } // namespace patterns void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { + bool use_varseqlen = Get("use_varseqlen"); + std::string pos_id = Get("tensorrt_transformer_posid"); + std::string mask_id = Get("tensorrt_transformer_maskid"); + + if (use_varseqlen && pos_id != "" && mask_id != "" && + graph->Has(framework::ir::kEmbEltwiseLayernormPass) && + graph->Has(framework::ir::kMultiheadMatmulPass)) { + VLOG(3) << "start varseqlen remove_padding_recover_padding_pass"; + } else { + return; + } + PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::PreconditionNotMet("graph should not be null.")); FusePassBase::Init(name_scope_, graph); @@ -91,14 +112,14 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { // Create an remove_padding op node auto insert_remove_padding_op = [&](Node* input_node, Node* op_node) { // create op, var in graph - OpDesc remove_padding; + OpDesc remove_padding(op_node->Op()->Block()); std::string remove_padding_out_name = input_node->Name() + ".remove_padding"; - - VarDesc remove_padding_out(remove_padding_out_name); - remove_padding_out.SetDataType(input_node->Var()->GetDataType()); - remove_padding_out.SetShape(input_node->Var()->GetShape()); - remove_padding_out.SetPersistable(false); + auto* remove_padding_out = + op_node->Op()->Block()->Var(remove_padding_out_name); + remove_padding_out->SetDataType(input_node->Var()->GetDataType()); + remove_padding_out->SetShape(input_node->Var()->GetShape()); + remove_padding_out->SetPersistable(false); // remove_padding_op remove_padding.SetType("remove_padding"); @@ -110,7 +131,7 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { remove_padding.SetOutput("Out", {remove_padding_out_name}); auto remove_padding_op_node = graph->CreateOpNode(&remove_padding); - auto remove_padding_out_node = graph->CreateVarNode(&remove_padding_out); + auto remove_padding_out_node = graph->CreateVarNode(remove_padding_out); // replace link for (size_t i = 0; i < input_node->outputs.size(); ++i) { @@ -145,13 +166,14 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { // create an remove_padding op node auto insert_recover_padding_op = [&](Node* op_node, Node* out_node) { // create op, var in graph - OpDesc recover_padding; + OpDesc recover_padding(op_node->Op()->Block()); std::string recover_padding_input_name = out_node->Name() + ".recover_padding"; - VarDesc recover_padding_input(recover_padding_input_name); - recover_padding_input.SetDataType(out_node->Var()->GetDataType()); - recover_padding_input.SetShape(out_node->Var()->GetShape()); - recover_padding_input.SetPersistable(false); + auto* recover_padding_input = + op_node->Op()->Block()->Var(recover_padding_input_name); + recover_padding_input->SetDataType(out_node->Var()->GetDataType()); + recover_padding_input->SetShape(out_node->Var()->GetShape()); + recover_padding_input->SetPersistable(false); // recover_padding_op recover_padding.SetType("recover_padding"); @@ -164,7 +186,7 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { auto recover_padding_op_node = graph->CreateOpNode(&recover_padding); auto recover_padding_input_node = - graph->CreateVarNode(&recover_padding_input); + graph->CreateVarNode(recover_padding_input); // replace link for (size_t i = 0; i < op_node->outputs.size(); ++i) { @@ -195,39 +217,36 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { op_node->Op()->RenameOutput(out_node->Name(), recover_padding_input_name); }; - GraphPatternDetector gpd1; - patterns::SkipLayernorm skip_layernorm(gpd1.mutable_pattern(), - "remove_padding_recover_padding_pass"); - skip_layernorm(); + bool check_flag = true; - auto handler1 = [&](const GraphPatternDetector::subgraph_t& subgraph, + GraphPatternDetector gpd0; + patterns::EmbEltwiseLayernorm fused_embedding_eltwise_layernorm( + gpd0.mutable_pattern(), "remove_padding_recover_padding_pass"); + fused_embedding_eltwise_layernorm(); + + auto handler0 = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(3) << "remove_padding_recover_padding_pass for transformer: " - "skip_layernorm"; + "fused_embedding_eltwise_layernorm"; - GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_x, skip_layernorm_x, - skip_layernorm); - GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_y, skip_layernorm_y, - skip_layernorm); - GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_op, skip_layernorm_op, - skip_layernorm); - GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_out, skip_layernorm_out, - skip_layernorm); + GET_IR_NODE_FROM_SUBGRAPH(emb_elt_layernorm_op, emb_elt_layernorm_op, + fused_embedding_eltwise_layernorm); + GET_IR_NODE_FROM_SUBGRAPH(emb_elt_layernorm_out, emb_elt_layernorm_out, + fused_embedding_eltwise_layernorm); - insert_remove_padding_op(skip_layernorm_x, skip_layernorm_op); - insert_remove_padding_op(skip_layernorm_y, skip_layernorm_op); - insert_recover_padding_op(skip_layernorm_op, skip_layernorm_out); + insert_recover_padding_op(emb_elt_layernorm_op, emb_elt_layernorm_out); found_subgraph_count++; }; - gpd1(graph, handler1); + gpd0(graph, handler0); - GraphPatternDetector gpd2; + GraphPatternDetector gpd1; patterns::MultiheadMatmul multihead_matmul( - gpd2.mutable_pattern(), "remove_padding_recover_padding_pass"); + gpd1.mutable_pattern(), "remove_padding_recover_padding_pass"); multihead_matmul(); - auto handler2 = [&](const GraphPatternDetector::subgraph_t& subgraph, + std::vector multihead_matmul_input_shape; + auto handler1 = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(3) << "remove_padding_recover_padding_pass for transformer: " "multihead_matmul"; @@ -239,11 +258,57 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_out, multihead_matmul_out, multihead_matmul); + multihead_matmul_input_shape = multihead_matmul_input->Var()->GetShape(); + insert_remove_padding_op(multihead_matmul_input, multihead_matmul_op); insert_recover_padding_op(multihead_matmul_op, multihead_matmul_out); found_subgraph_count++; }; + gpd1(graph, handler1); + + GraphPatternDetector gpd2; + patterns::SkipLayernorm skip_layernorm(gpd2.mutable_pattern(), + "remove_padding_recover_padding_pass"); + skip_layernorm(); + + auto handler2 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(3) << "remove_padding_recover_padding_pass for transformer: " + "skip_layernorm"; + + GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_x, skip_layernorm_x, + skip_layernorm); + GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_y, skip_layernorm_y, + skip_layernorm); + GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_op, skip_layernorm_op, + skip_layernorm); + GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_out, skip_layernorm_out, + skip_layernorm); + + std::vector skip_layernorm_x_shape = + skip_layernorm_x->Var()->GetShape(); + if (skip_layernorm_x_shape.size() != multihead_matmul_input_shape.size()) { + check_flag = false; + VLOG(3) << "Transformer model remove_padding shape check failed, return " + "remove_padding pass."; + return; + } + for (size_t i = 0; i < skip_layernorm_x_shape.size(); ++i) { + if (skip_layernorm_x_shape[i] != multihead_matmul_input_shape[i]) { + check_flag = false; + } + } + if (!check_flag) { + VLOG(3) << "Transformer model remove_padding shape check failed, return " + "remove_padding pass."; + return; + } + insert_remove_padding_op(skip_layernorm_x, skip_layernorm_op); + insert_remove_padding_op(skip_layernorm_y, skip_layernorm_op); + insert_recover_padding_op(skip_layernorm_op, skip_layernorm_out); + found_subgraph_count++; + }; gpd2(graph, handler2); GraphPatternDetector gpd3; @@ -257,11 +322,39 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(fc_input, fc_input, fc); GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc_op, fc); - GET_IR_NODE_FROM_SUBGRAPH(fc_out, fc_out, fc); - insert_remove_padding_op(fc_input, fc_op); - insert_recover_padding_op(fc_op, fc_out); + std::vector fc_input_shape = fc_input->Var()->GetShape(); + if ((fc_input_shape.size() != multihead_matmul_input_shape.size()) || + (fc_input_shape.size() != 3)) { + check_flag = false; + VLOG(3) << "Transformer model remove_padding shape check failed, return " + "remove_padding pass."; + return; + } + if (fc_input_shape[0] != multihead_matmul_input_shape[0]) { + check_flag = false; + } + if (fc_input_shape[1] != multihead_matmul_input_shape[1]) { + check_flag = false; + } + if ((fc_input_shape[2] != multihead_matmul_input_shape[2]) && + (fc_input_shape[2] != 4 * multihead_matmul_input_shape[2])) { + check_flag = false; + } + if (BOOST_GET_CONST(int, fc_op->Op()->GetAttr("in_num_col_dims")) != 2) { + check_flag = false; + } + if (!check_flag) { + VLOG(3) << "Transformer model remove_padding shape check failed, return " + "remove_padding pass."; + return; + } + fc_op->Op()->RemoveAttr("in_num_col_dims"); + fc_op->Op()->SetAttr("in_num_col_dims", 1); + + insert_remove_padding_op(fc_input, fc_op); + insert_recover_padding_op(fc_op, fc_op->outputs[0]); found_subgraph_count++; }; gpd3(graph, handler3); @@ -280,6 +373,31 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(activation_op, activation_op, activation); GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, activation); + std::vector activation_input_shape = + activation_input->Var()->GetShape(); + if ((activation_input_shape.size() != + multihead_matmul_input_shape.size()) || + (activation_input_shape.size() != 3)) { + check_flag = false; + VLOG(3) << "Transformer model remove_padding shape check failed, return " + "remove_padding pass."; + return; + } + if (activation_input_shape[0] != multihead_matmul_input_shape[0]) { + check_flag = false; + } + if (activation_input_shape[1] != multihead_matmul_input_shape[1]) { + check_flag = false; + } + if ((activation_input_shape[2] != multihead_matmul_input_shape[2]) && + (activation_input_shape[2] != 4 * multihead_matmul_input_shape[2])) { + check_flag = false; + } + if (!check_flag) { + VLOG(3) << "Transformer model remove_padding shape check failed, return " + "remove_padding pass."; + return; + } insert_remove_padding_op(activation_input, activation_op); insert_recover_padding_op(activation_op, activation_out); diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h index d7ccfc75c2000..7b8075644cb51 100644 --- a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h +++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h @@ -32,6 +32,14 @@ namespace paddle { namespace framework { namespace ir { namespace patterns { +struct EmbEltwiseLayernorm : public PatternBase { + EmbEltwiseLayernorm(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "emb_elt_layernorm") {} + + void operator()(); + PATTERN_DECL_NODE(emb_elt_layernorm_op); + PATTERN_DECL_NODE(emb_elt_layernorm_out); +}; struct SkipLayernorm : public PatternBase { SkipLayernorm(PDPattern *pattern, const std::string &name_scope) diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index a03a6f5b2c72c..a2dd846ba52d5 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h" + #include #include "paddle/fluid/framework/op_version_registry.h" @@ -145,9 +146,9 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern, return x->outputs[fc_idx]->outputs[0]; }; - auto var_next_is_fc_act_repeated_n_times = [=]( - Node* x, int repeated_times, const std::string& act_type = "relu", - bool check_in_has_only_one_out = true) -> bool { + auto var_next_is_fc_act_repeated_n_times = + [=](Node* x, int repeated_times, const std::string& act_type = "relu", + bool check_in_has_only_one_out = true) -> bool { for (int i = 0; i < repeated_times; ++i) { if (!var_next_is_fc_act(x, act_type, i == 0 && check_in_has_only_one_out)) { @@ -191,9 +192,9 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern, return nullptr; }; - auto var_before_is_fc_act_repeated_n_times = [=]( - Node* x, int repeated_times, - const std::string& act_type = "relu") -> bool { + auto var_before_is_fc_act_repeated_n_times = [=](Node* x, int repeated_times, + const std::string& act_type = + "relu") -> bool { for (int i = 0; i < repeated_times; ++i) { if (!var_before_is_fc_act(x, act_type, i == repeated_times - 1)) { return false; diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc index f0ff77acf9ff8..3112b776ae5e6 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h" - #include + #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc index 778e658354f26..451e41e767dc4 100644 --- a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc +++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/runtime_context_cache_pass.h" + #include "paddle/fluid/framework/operator.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc index 9fa951920f45a..2c0b142c98fbd 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h" + #include #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc index 2b084bd5734b9..052b0a4bdc1b8 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc @@ -44,8 +44,8 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern, is_concat_op_with_inputs(x->outputs[0], num_inputs); }; - auto is_seqpool_op_with_pootype_of_nth_input_of_concat = [=]( - Node* x, const std::string& type, int idx) -> bool { + auto is_seqpool_op_with_pootype_of_nth_input_of_concat = + [=](Node* x, const std::string& type, int idx) -> bool { bool this_is_seqpool_op = x && x->IsOp() && x->Op()->Type() == "sequence_pool" && x->Op()->HasAttr("pooltype") && diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc index d366803851842..e56ba9ad1e751 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h" #include + +#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc index 7200e0ac1d469..916adbbe33720 100644 --- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc @@ -44,11 +44,11 @@ static void GetConcatNodes(ir::Graph* graph, std::vector* concat_nodes) { GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); auto concat_op_node = BuildCVMConcatPattern(pattern); - GraphPatternDetector::handle_t handler = [&]( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* concat_op = subgraph.at(concat_op_node); - concat_nodes->push_back(concat_op); - }; + GraphPatternDetector::handle_t handler = + [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + Node* concat_op = subgraph.at(concat_op_node); + concat_nodes->push_back(concat_op); + }; gpd(graph, handler); } } // anonymous namespace @@ -148,19 +148,19 @@ void SeqPoolCVMConcatFusePass::ApplyImpl(ir::Graph* graph) const { Node* cvm_input_of_cvm; Node* concat_out_var = concat_node->outputs[0]; - GraphPatternDetector::handle_t handler = [&]( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* seqpool_in_var = subgraph.at(seqpool_in_var_node); - Node* seqpool_op = subgraph.at(seqpool_op_node); - Node* seqpool_out_var = subgraph.at(seqpool_out_var_node); - Node* seqpool_idx_out_var = subgraph.at(seqpool_idx_out_var_node); - Node* cvm_op = subgraph.at(cvm_op_node); - Node* cvm_out_var = subgraph.at(cvm_out_var_node); - cvm_input_of_cvm = subgraph.at(cvm_cvm_in_var_node); - marked_nodes.insert({seqpool_op, seqpool_out_var, seqpool_idx_out_var, - cvm_op, cvm_out_var, concat_node}); - ins_to_concat[cvm_out_var->Name()] = seqpool_in_var; - }; + GraphPatternDetector::handle_t handler = + [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + Node* seqpool_in_var = subgraph.at(seqpool_in_var_node); + Node* seqpool_op = subgraph.at(seqpool_op_node); + Node* seqpool_out_var = subgraph.at(seqpool_out_var_node); + Node* seqpool_idx_out_var = subgraph.at(seqpool_idx_out_var_node); + Node* cvm_op = subgraph.at(cvm_op_node); + Node* cvm_out_var = subgraph.at(cvm_out_var_node); + cvm_input_of_cvm = subgraph.at(cvm_cvm_in_var_node); + marked_nodes.insert({seqpool_op, seqpool_out_var, seqpool_idx_out_var, + cvm_op, cvm_out_var, concat_node}); + ins_to_concat[cvm_out_var->Name()] = seqpool_in_var; + }; gpd(graph, handler); if (!ins_to_concat.empty()) { diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc index bba640cf148d1..8d8ebc955d39e 100644 --- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h" #include + +#include "paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc index 37e77bc134d3c..f177f60708773 100644 --- a/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc +++ b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc @@ -21,129 +21,134 @@ namespace paddle { namespace framework { namespace ir { -SetTransformerInputConvertPass::SetTransformerInputConvertPass() { - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .End(); -} namespace patterns { -void SetTransformerInputConvert::operator()() { +void SetTransformerInputConvert::operator()(const std::string &pos_id) { std::unordered_set lookup_table_ops{"lookup_table", "lookup_table_v2"}; - // Create nodes for lookup_table1 op. - auto *lookup_table1_x = pattern->NewNode(lookup_table1_x_repr()) - ->assert_is_ops_input(lookup_table_ops, "Ids"); - auto *lookup_table1_w = pattern->NewNode(lookup_table1_w_repr()) - ->assert_is_ops_input(lookup_table_ops, "W"); - auto *lookup_table1_op = - pattern->NewNode(lookup_table1_repr())->assert_is_ops(lookup_table_ops); - auto *lookup_table1_out = pattern->NewNode(lookup_table1_out_repr()) - ->assert_is_ops_output(lookup_table_ops) - ->AsIntermediate() - ->assert_is_op_input("elementwise_add", "X"); - - // Create nodes for lookup_table2 op. - auto *lookup_table2_x = pattern->NewNode(lookup_table2_x_repr()) - ->assert_is_ops_input(lookup_table_ops, "Ids"); - auto *lookup_table2_w = pattern->NewNode(lookup_table2_w_repr()) - ->assert_is_ops_input(lookup_table_ops, "W"); - auto *lookup_table2_op = - pattern->NewNode(lookup_table2_repr())->assert_is_ops(lookup_table_ops); - auto *lookup_table2_out = pattern->NewNode(lookup_table2_out_repr()) - ->assert_is_ops_output(lookup_table_ops) - ->AsIntermediate() - ->assert_is_op_input("elementwise_add", "Y"); - - // Create nodes for elementwise_add op. - auto *elementwise_op = - pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add"); - auto *elementwise_out = pattern->NewNode(elementwise_out_repr()) - ->AsOutput() - ->assert_is_only_output_of_op("elementwise_add"); + // Create nodes for lookup_table. + auto *lookup_table_id = + pattern->NewNode(lookup_table_id_repr()) + ->assert_is_ops_input(lookup_table_ops, "Ids") + ->assert_more([&](Node *node) { return node->Name() == pos_id; }); + auto *lookup_table_op = + pattern->NewNode(lookup_table_repr())->assert_is_ops(lookup_table_ops); // links nodes. - lookup_table1_op->LinksFrom({lookup_table1_x, lookup_table1_w}) - .LinksTo({lookup_table1_out}); - lookup_table2_op->LinksFrom({lookup_table2_x, lookup_table2_w}) - .LinksTo({lookup_table2_out}); - elementwise_op->LinksFrom({lookup_table1_out, lookup_table2_out}) - .LinksTo({elementwise_out}); + lookup_table_op->LinksFrom({lookup_table_id}); } +void MultiheadMatmulOP::operator()() { + // Create nodes for multihead_matmul op. + auto *multihead_matmul = pattern->NewNode(multihead_matmul_repr()) + ->assert_is_op("multihead_matmul"); + auto *multihead_matmul_out = + pattern->NewNode(multihead_matmul_out_repr()) + ->assert_is_op_output("multihead_matmul", "Out"); + + // links nodes. + multihead_matmul_out->LinksFrom({multihead_matmul}); +} } // namespace patterns void SetTransformerInputConvertPass::ApplyImpl(ir::Graph *graph) const { + bool with_dynamic_shape = Get("with_dynamic_shape"); + std::string pos_id = Get("tensorrt_transformer_posid"); + + if (!(graph->Has(framework::ir::kMultiheadMatmulPass) && with_dynamic_shape && + (pos_id != ""))) { + VLOG(3) << "Transformer model need MultiheadMatmul, and " + "with_dynamic_shape. Stop this pass, " + "please reconfig."; + return; + } PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::PreconditionNotMet("graph should not be null.")); FusePassBase::Init(name_scope_, graph); int found_subgraph_count = 0; - - GraphPatternDetector gpd; + Node *transformer_input_convert_out0_node; + Node *transformer_input_convert_out1_node; + GraphPatternDetector gpd0; patterns::SetTransformerInputConvert fused_pattern( - gpd.mutable_pattern(), "transformer_input_convert_pass"); - fused_pattern(); - - auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, - Graph *graph) { - if (!IsCompat(subgraph, graph)) { - LOG(WARNING) << "transformer_input_convert_pass in op compat failed."; - return; - } - - VLOG(3) << "transformer_input_convert_pass for pos_id, max_seqlen"; - - GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_x, lookup_table2_x, fused_pattern); + gpd0.mutable_pattern(), "transformer_input_convert_pass"); + fused_pattern(pos_id); + auto handler0 = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *graph) { + VLOG(3) + << "transformer_input_convert_pass for pos_id, max_seqlen, mask_tensor"; + GET_IR_NODE_FROM_SUBGRAPH(lookup_table, lookup_table, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table_id, lookup_table_id, fused_pattern); // create op, var in graph - OpDesc new_desc; + OpDesc new_desc(lookup_table->Op()->Block()); + new_desc.SetType("transformer_input_convert"); // inputs - new_desc.SetInput("X", {lookup_table2_x->Name()}); + new_desc.SetInput("Input", {lookup_table_id->Name()}); // outputs - std::vector output_0 = {"pos_id_tensor"}; - std::vector output_1 = {"max_seqlen_tensor"}; - new_desc.SetOutput("PosId", output_0); - new_desc.SetOutput("MaxSeqlen", output_1); - std::string transformer_input_convert_out0_name = "pos_id_tensor"; std::string transformer_input_convert_out1_name = "max_seqlen_tensor"; - VarDesc transformer_input_convert_out0(transformer_input_convert_out0_name); - VarDesc transformer_input_convert_out1(transformer_input_convert_out1_name); - transformer_input_convert_out0.SetDataType(proto::VarType::INT32); - transformer_input_convert_out1.SetDataType(proto::VarType::INT32); - transformer_input_convert_out0.SetShape({-1}); - transformer_input_convert_out1.SetShape({-1}); - transformer_input_convert_out0.SetPersistable(false); - transformer_input_convert_out1.SetPersistable(false); + std::string transformer_input_convert_out2_name = "mask_tensor"; + std::vector output_0 = {transformer_input_convert_out0_name}; + std::vector output_1 = {transformer_input_convert_out1_name}; + std::vector output_2 = {transformer_input_convert_out2_name}; + new_desc.SetOutput("PosId", output_0); + new_desc.SetOutput("MaxSeqlen", output_1); + new_desc.SetOutput("MaskTensor", output_2); + + auto *transformer_input_convert_out0 = + lookup_table->Op()->Block()->Var(transformer_input_convert_out0_name); + auto *transformer_input_convert_out1 = + lookup_table->Op()->Block()->Var(transformer_input_convert_out1_name); + auto *transformer_input_convert_out2 = + lookup_table->Op()->Block()->Var(transformer_input_convert_out2_name); + transformer_input_convert_out0->SetDataType(proto::VarType::INT32); + transformer_input_convert_out1->SetDataType(proto::VarType::INT32); + transformer_input_convert_out2->SetDataType(proto::VarType::INT32); + transformer_input_convert_out0->SetShape({-1}); + transformer_input_convert_out1->SetShape({-1}); + + transformer_input_convert_out2->SetShape({-1}); + + transformer_input_convert_out0->SetPersistable(false); + transformer_input_convert_out1->SetPersistable(false); + transformer_input_convert_out2->SetPersistable(false); auto new_op_node = graph->CreateOpNode(&new_desc); auto transformer_input_convert_out0_node = - graph->CreateVarNode(&transformer_input_convert_out0); + graph->CreateVarNode(transformer_input_convert_out0); auto transformer_input_convert_out1_node = - graph->CreateVarNode(&transformer_input_convert_out1); + graph->CreateVarNode(transformer_input_convert_out1); + auto transformer_input_convert_out2_node = + graph->CreateVarNode(transformer_input_convert_out2); // needn't create variable in scope - IR_NODE_LINK_TO(lookup_table2_x, new_op_node); + IR_NODE_LINK_TO(lookup_table_id, new_op_node); IR_NODE_LINK_TO(new_op_node, transformer_input_convert_out0_node); IR_NODE_LINK_TO(new_op_node, transformer_input_convert_out1_node); - - found_subgraph_count++; + IR_NODE_LINK_TO(new_op_node, transformer_input_convert_out2_node); + }; + gpd0(graph, handler0); + + GraphPatternDetector gpd1; + patterns::MultiheadMatmulOP multihead_matmul_pattern( + gpd1.mutable_pattern(), "transformer_input_convert_pass"); + multihead_matmul_pattern(); + auto handler1 = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *graph) { + VLOG(3) << "link pos_id, max_seqlen to multihead_matmul."; + GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul, multihead_matmul, + multihead_matmul_pattern); + + IR_NODE_LINK_TO(transformer_input_convert_out0_node, multihead_matmul); + IR_NODE_LINK_TO(transformer_input_convert_out1_node, multihead_matmul); }; + gpd1(graph, handler1); - gpd(graph, handler); + found_subgraph_count++; AddStatis(found_subgraph_count); } @@ -153,9 +158,3 @@ void SetTransformerInputConvertPass::ApplyImpl(ir::Graph *graph) const { REGISTER_PASS(set_transformer_input_convert_pass, paddle::framework::ir::SetTransformerInputConvertPass); -REGISTER_PASS_CAPABILITY(set_transformer_input_convert_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("lookup_table", 1) - .LE("lookup_table_v2", 1) - .LE("elementweise_add", 1)); diff --git a/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h index 5a5843e810f9a..01c9b1c854bd1 100644 --- a/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h +++ b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h @@ -33,41 +33,36 @@ namespace framework { namespace ir { namespace patterns { -// in_var emb in_var emb -// | | | | -// lookup_table lookup_table -// | | -// lkt_var lkt_var -// \ / -// elementwise_add -// | -// elt_out_var +// in_var emb +// | | +// lookup_table +// | +// lkt_var + // struct SetTransformerInputConvert : public PatternBase { SetTransformerInputConvert(PDPattern *pattern, const std::string &name_scope) - : PatternBase(pattern, name_scope, "transformer_input_convert") {} + : PatternBase(pattern, name_scope, "transformer_input_convert_pass") {} + void operator()(const std::string &pos_id); + // declare operator node's name + PATTERN_DECL_NODE(lookup_table); + // declare variable node's name + PATTERN_DECL_NODE(lookup_table_id); +}; +struct MultiheadMatmulOP : public PatternBase { + MultiheadMatmulOP(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "transformer_input_convert_pass") {} void operator()(); - // declare operator node's name - PATTERN_DECL_NODE(lookup_table1); - PATTERN_DECL_NODE(lookup_table2); - PATTERN_DECL_NODE(elementwise); - - // declare variable node's name - PATTERN_DECL_NODE(lookup_table1_x); - PATTERN_DECL_NODE(lookup_table1_w); - PATTERN_DECL_NODE(lookup_table1_out); - PATTERN_DECL_NODE(lookup_table2_x); - PATTERN_DECL_NODE(lookup_table2_w); - PATTERN_DECL_NODE(lookup_table2_out); - PATTERN_DECL_NODE(elementwise_out); + PATTERN_DECL_NODE(multihead_matmul); + PATTERN_DECL_NODE(multihead_matmul_out); }; } // namespace patterns class SetTransformerInputConvertPass : public FusePassBase { public: - SetTransformerInputConvertPass(); + SetTransformerInputConvertPass() {} virtual ~SetTransformerInputConvertPass() {} protected: diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc index bcd7bedcc43a6..9007105950b47 100644 --- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc +++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h" + #include -#include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc index 80f387c442760..908797163d21c 100644 --- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc +++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h" - #include + #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index bfa14d9296b26..6bebe8de9f2e3 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -43,8 +43,8 @@ struct SkipLayerNorm : public PatternBase { PATTERN_DECL_NODE(layer_norm); // declare variable node's name PATTERN_DECL_NODE( - elementwise_out); // (elementwise_input_x,elementwise_input_y) -> - // elementwise_out + elementwise_out); // (elementwise_input_x,elementwise_input_y) + // -> elementwise_out PATTERN_DECL_NODE(layer_norm_bias); PATTERN_DECL_NODE(layer_norm_scale); PATTERN_DECL_NODE(layer_norm_out); diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc index 29be2c3cb09a7..c95fd0abd5294 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h" - #include + #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc index 7c43b02218213..a8c7150d6e3e0 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc @@ -170,8 +170,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, auto* matmul_xy_op = pattern->NewNode( [=](Node* x) { - return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" || - x->Op()->Type() == "matmul") && + return x && x->IsOp() && + (x->Op()->Type() == "matmul_v2" || + x->Op()->Type() == "matmul") && is_fusion_first_mul_out(x->outputs[0]); }, name_scope + "/matmul_xy_op"); @@ -212,8 +213,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, auto* matmul_squared_x_y_op = pattern->NewNode( [=](Node* x) { - return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" || - x->Op()->Type() == "matmul") && + return x && x->IsOp() && + (x->Op()->Type() == "matmul_v2" || + x->Op()->Type() == "matmul") && is_fusion_mat_squared_x_y_op_out(x->outputs[0]); }, name_scope + "/matmul_squared_x_y_op"); diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc index 94fb68506413c..78dafaa1e2f12 100644 --- a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index bda6b90386475..6802310383d37 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h" + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/trt_embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trt_embedding_eltwise_layernorm_fuse_pass.cc new file mode 100644 index 0000000000000..8f1fdb0b521dd --- /dev/null +++ b/paddle/fluid/framework/ir/trt_embedding_eltwise_layernorm_fuse_pass.cc @@ -0,0 +1,477 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/trt_embedding_eltwise_layernorm_fuse_pass.h" + +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +static PDNode* create_emb_vars(PDPattern* pattern, const std::string& name, + const std::string& arg, + bool is_persist = false) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + PDNode* node = + pattern->NewNode(name)->assert_is_ops_input(embedding_ops, arg); + if (is_persist) return node->assert_is_persistable_var(); + return node; +} +static PDNode* create_emb_out_vars(PDPattern* pattern, const std::string& name, + const std::string& arg) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + PDNode* node = pattern->NewNode(name) + ->assert_is_only_output_of_ops(embedding_ops) + ->assert_is_op_input("elementwise_add", arg) + ->AsIntermediate(); + return node; +} +void TrtEmbedding2Eltwise1Pattern::operator()() { + auto* lookup_table1_x = + create_emb_vars(pattern, lookup_table1_x_repr(), "Ids"); + auto* lookup_table2_x = + create_emb_vars(pattern, lookup_table2_x_repr(), "Ids"); + auto* lookup_table1_w = + create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); + auto* lookup_table2_w = + create_emb_vars(pattern, lookup_table2_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + auto* feed1 = pattern->NewNode(feed1_repr())->assert_is_op("feed"); + auto* feed2 = pattern->NewNode(feed2_repr())->assert_is_op("feed"); + + auto* lookup_table1 = + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); + auto* lookup_table2 = + pattern->NewNode(lookup_table2_repr())->assert_is_ops(embedding_ops); + auto* lookup_table1_out = + create_emb_out_vars(pattern, lookup_table1_out_repr(), "X"); + auto* lookup_table2_out = + create_emb_out_vars(pattern, lookup_table2_out_repr(), "Y"); + auto* eltwise_add = + pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add"); + auto* eltwise_add_out = pattern->NewNode(eltwise_add_out_repr()) + ->assert_is_op_output("elementwise_add"); + feed1->LinksTo({lookup_table1_x}); + lookup_table1->LinksFrom({lookup_table1_x, lookup_table1_w}) + .LinksTo({lookup_table1_out}); + feed2->LinksTo({lookup_table2_x}); + lookup_table2->LinksFrom({lookup_table2_x, lookup_table2_w}) + .LinksTo({lookup_table2_out}); + eltwise_add->LinksFrom({lookup_table1_out, lookup_table2_out}) + .LinksTo({eltwise_add_out}); +} +void TrtEmbedding1Eltwise1Pattern::operator()() { + auto* lookup_table1_x = + create_emb_vars(pattern, lookup_table1_x_repr(), "Ids"); + auto* lookup_table1_w = + create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + auto* feed1 = pattern->NewNode(feed1_repr())->assert_is_op("feed"); + + auto* lookup_table1 = + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); + auto* lookup_table1_out = + create_emb_out_vars(pattern, lookup_table1_out_repr(), "Y"); + auto* eltwise_add = + pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add"); + auto* eltwise_add_in = pattern->NewNode(eltwise_add_in_repr()) + ->assert_is_op_input("elementwise_add", "X") + ->assert_is_op_output("elementwise_add"); + auto* eltwise_add_out = pattern->NewNode(eltwise_add_out_repr()) + ->assert_is_op_output("elementwise_add"); + lookup_table1->LinksFrom({lookup_table1_x, lookup_table1_w}) + .LinksTo({lookup_table1_out}); + feed1->LinksTo({lookup_table1_x}); + eltwise_add->LinksFrom({lookup_table1_out, eltwise_add_in}) + .LinksTo({eltwise_add_out}); +} +void TrtSkipLayerNorm::operator()() { + auto* eltwise_add = + pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add"); + auto* eltwise_add_out = pattern->NewNode(eltwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->assert_is_op_input("layer_norm", "X") + ->AsIntermediate(); + auto* layer_norm = + pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm"); + auto* layer_norm_out = pattern->NewNode(layer_norm_out_repr()) + ->assert_is_op_output("layer_norm", "Y") + ->AsOutput(); + auto* layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Mean"); + auto* layer_norm_variance_var = + pattern->NewNode(layer_norm_variance_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Variance"); + eltwise_add->LinksTo({eltwise_add_out}); + layer_norm + ->LinksFrom({eltwise_add_out, layer_norm_bias_var, layer_norm_scale_var}) + .LinksTo({layer_norm_out, layer_norm_mean_var, layer_norm_variance_var}); +} + +} // namespace patterns + +int TrtEmbeddingEltwiseLayerNormFusePass::BuildFusion( + Graph* graph, const std::string& name_scope + /*const Scope* scope*/) const { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + bool use_varseqlen = Get("use_varseqlen"); + std::string pos_id = Get("tensorrt_transformer_posid"); + std::string mask_id = Get("tensorrt_transformer_maskid"); + std::vector>> start_pattern_in_nodes; + std::vector start_pattern_out_node; + std::vector> start_pattern_remove_nodes; + + // Create pattern. + patterns::TrtEmbedding2Eltwise1Pattern start_pattern(pattern, + name_scope + "/start"); + start_pattern(); + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_x, lookup_table1_x, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_x, lookup_table2_x, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_w, lookup_table1_w, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_w, lookup_table2_w, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1, lookup_table1, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2, lookup_table2, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_out, lookup_table1_out, + start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_out, lookup_table2_out, + start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, start_pattern); + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "Pass(TrtEmbedding2Eltwise1Pattern) in op compat failed."; + return; + } + std::vector> ins; + ins.push_back(std::make_pair(lookup_table1_x, lookup_table1_w)); + ins.push_back(std::make_pair(lookup_table2_x, lookup_table2_w)); + start_pattern_in_nodes.push_back(ins); + start_pattern_out_node.push_back(eltwise_add_out); + + std::unordered_set rm_nodes; + rm_nodes.insert({lookup_table1, lookup_table2, lookup_table1_out, + lookup_table2_out, eltwise_add, eltwise_add_out}); + start_pattern_remove_nodes.push_back(rm_nodes); + }; + gpd(graph, handler); + + std::vector> inner_pattern_ins; + std::vector inner_pattern_tmp_in; + std::vector inner_pattern_out; + std::vector> inner_pattern_remove_nodes; + + GraphPatternDetector gpd2; + auto* pattern2 = gpd2.mutable_pattern(); + patterns::TrtEmbedding1Eltwise1Pattern second_pattern(pattern2, + name_scope + "/second"); + second_pattern(); + auto handler2 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_x, lookup_table1_x, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_w, lookup_table1_w, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1, lookup_table1, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_out, lookup_table1_out, + second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_in, eltwise_add_in, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, second_pattern); + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "Pass(TrtEmbedding1Eltwise1Pattern) in op compat failed."; + return; + } + auto in = std::make_pair(lookup_table1_x, lookup_table1_w); + inner_pattern_ins.push_back(in); + inner_pattern_tmp_in.push_back(eltwise_add_in); + inner_pattern_out.push_back(eltwise_add_out); + + std::unordered_set rm_nodes; + rm_nodes.insert( + {lookup_table1, lookup_table1_out, eltwise_add, eltwise_add_out}); + inner_pattern_remove_nodes.push_back(rm_nodes); + }; + gpd2(graph, handler2); + + std::vector end_pattern_elt_out; + std::vector end_pattern_scales; + std::vector end_pattern_biases; + std::vector end_pattern_out; + std::vector end_patter_layernorms; + std::vector> end_pattern_remove_nodes; + GraphPatternDetector gpd3; + auto* pattern3 = gpd3.mutable_pattern(); + patterns::TrtSkipLayerNorm skip_layernorm_pattern(pattern3, + name_scope + "/third"); + skip_layernorm_pattern(); + auto handler3 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm, layer_norm, skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, layer_norm_out, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_bias, layer_norm_bias, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_scale, layer_norm_scale, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, layer_norm_mean, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, + skip_layernorm_pattern); + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "Pass(TrtSkipLayerNorm) in op compat failed."; + return; + } + end_pattern_elt_out.push_back(eltwise_add_out); + std::unordered_set rm_nodes; + rm_nodes.insert({layer_norm, layer_norm_mean, layer_norm_variance}); + end_pattern_remove_nodes.push_back(rm_nodes); + end_pattern_biases.push_back(layer_norm_bias); + end_pattern_scales.push_back(layer_norm_scale); + end_pattern_out.push_back(layer_norm_out); + end_patter_layernorms.push_back(layer_norm); + }; + gpd3(graph, handler3); + + if (start_pattern_in_nodes.empty() || end_pattern_elt_out.empty()) { + return 0; + } + // only reserve the subgraphs that in connected domains. + int fusion_count = 0; + // fusion_id for (i, k, js) + std::vector>>> + fusion_ids; + for (size_t i = 0; i < start_pattern_in_nodes.size(); ++i) { + Node* tmp = start_pattern_out_node[i]; + Node* old_tmp = nullptr; + // get correct inner pattern node order. + std::vector js; + while (tmp != old_tmp) { + old_tmp = tmp; + for (size_t j = 0; j < inner_pattern_tmp_in.size(); ++j) { + if (inner_pattern_tmp_in[j] == tmp) { + tmp = inner_pattern_out[j]; + js.push_back(j); + break; + } + } + } + + for (size_t k = 0; k < end_pattern_elt_out.size(); ++k) { + if (tmp == end_pattern_elt_out[k]) { + fusion_ids.push_back(std::make_pair(i, std::make_pair(k, js))); + break; + } + } + } + + for (size_t num = 0; num < fusion_ids.size(); ++num) { + int i = fusion_ids[num].first; + int k = fusion_ids[num].second.first; + std::vector js = fusion_ids[num].second.second; + + std::vector ids; + std::vector embs; + for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) { + ids.push_back(start_pattern_in_nodes[i][iter].first->Name()); + embs.push_back(start_pattern_in_nodes[i][iter].second->Name()); + } + for (size_t iter = 0; iter < js.size(); ++iter) { + ids.push_back(inner_pattern_ins[js[iter]].first->Name()); + embs.push_back(inner_pattern_ins[js[iter]].second->Name()); + } + + OpDesc new_op_desc(end_patter_layernorms[0]->Op()->Block()); + new_op_desc.SetType("fused_embedding_eltwise_layernorm"); + new_op_desc.SetInput("Ids", ids); + new_op_desc.SetInput("Embs", embs); + new_op_desc.SetInput("WordId", {ids[0]}); + if (use_varseqlen && pos_id != "" && mask_id != "") { + new_op_desc.SetInput("PosId", {pos_id}); + new_op_desc.SetInput("MaskId", {mask_id}); + } else { + new_op_desc.SetInput("PosId", {ids[1]}); + } + if (ids.size() > 2) { + new_op_desc.SetInput("SentId", {ids[2]}); + } + + new_op_desc.SetInput("WordEmbedding", {embs[0]}); + new_op_desc.SetInput("PosEmbedding", {embs[1]}); + if (embs.size() > 2) { + new_op_desc.SetInput("SentEmbedding", {embs[2]}); + } + + new_op_desc.SetInput("Bias", {end_pattern_biases[k]->Name()}); + new_op_desc.SetInput("Scale", {end_pattern_scales[k]->Name()}); + new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()}); + new_op_desc.SetAttr("epsilon", + end_patter_layernorms[k]->Op()->GetAttr("epsilon")); + + if (end_patter_layernorms[k]->Op()->HasAttr("out_threshold")) { + new_op_desc.SetAttr("enable_int8", true); + new_op_desc.SetAttr( + "out_threshold", + end_patter_layernorms[k]->Op()->GetAttr("out_threshold")); + } + + auto* embedding_eltwise_layernorm = graph->CreateOpNode(&new_op_desc); + + for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) { + IR_NODE_LINK_TO(start_pattern_in_nodes[i][iter].first, + embedding_eltwise_layernorm); + IR_NODE_LINK_TO(start_pattern_in_nodes[i][iter].second, + embedding_eltwise_layernorm); + } + for (size_t iter = 0; iter < js.size(); ++iter) { + IR_NODE_LINK_TO(inner_pattern_ins[js[iter]].first, + embedding_eltwise_layernorm); + IR_NODE_LINK_TO(inner_pattern_ins[js[iter]].second, + embedding_eltwise_layernorm); + } + IR_NODE_LINK_TO(end_pattern_biases[k], embedding_eltwise_layernorm); + IR_NODE_LINK_TO(end_pattern_scales[k], embedding_eltwise_layernorm); + IR_NODE_LINK_TO(embedding_eltwise_layernorm, end_pattern_out[k]); + + // Remove unneeded nodes. + std::unordered_set marked_nodes; + marked_nodes.insert(start_pattern_remove_nodes[i].begin(), + start_pattern_remove_nodes[i].end()); + marked_nodes.insert(end_pattern_remove_nodes[k].begin(), + end_pattern_remove_nodes[k].end()); + for (size_t iter = 0; iter < js.size(); ++iter) { + marked_nodes.insert(inner_pattern_remove_nodes[js[iter]].begin(), + inner_pattern_remove_nodes[js[iter]].end()); + } + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + } + + return fusion_count; +} + +TrtEmbeddingEltwiseLayerNormFusePass::TrtEmbeddingEltwiseLayerNormFusePass() { + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .End(); + + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); +} + +void TrtEmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const { + bool with_dynamic_shape = Get("with_dynamic_shape"); + if (!with_dynamic_shape) { + VLOG(3) << "trt_embedding_eltwise_layernorm_fuse_pass need: use_varseqlen, " + "with_dynamic_shape. Stop this pass, " + "please reconfig."; + return; + } + FusePassBase::Init(name_scope_, graph); + int fusion_count = + TrtEmbeddingEltwiseLayerNormFusePass::BuildFusion(graph, name_scope_); + if (fusion_count > 0) { + bool use_varseqlen = Get("use_varseqlen"); + std::string pos_id = Get("tensorrt_transformer_posid"); + std::string mask_id = Get("tensorrt_transformer_maskid"); + + if ((use_varseqlen && pos_id != "" && mask_id != "") || + (!use_varseqlen && pos_id == "" && mask_id == "")) { + VLOG(3) << "start trt_embedding_eltwise_layernorm_fuse_pass"; + } else { + PADDLE_THROW( + platform::errors::Fatal("Use transformer'varseqlen need config: " + "use_varseqlen, set pos_id, set " + "mask_id. Or not use varseqlen, do not set " + "pos_id, set mask_id. Please " + "reconfig")); + } + graph->Set(kEmbEltwiseLayernormPass, new bool(true)); + } + AddStatis(fusion_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(trt_embedding_eltwise_layernorm_fuse_pass, + paddle::framework::ir::TrtEmbeddingEltwiseLayerNormFusePass); +REGISTER_PASS_CAPABILITY(trt_embedding_eltwise_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("lookup_table", 1) + .LE("lookup_table_v2", 1) + .LE("elementweise_add", 1)); diff --git a/paddle/fluid/framework/ir/trt_embedding_eltwise_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/trt_embedding_eltwise_layernorm_fuse_pass.h new file mode 100644 index 0000000000000..2d956a38aac3c --- /dev/null +++ b/paddle/fluid/framework/ir/trt_embedding_eltwise_layernorm_fuse_pass.h @@ -0,0 +1,167 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +// detect start pattern. +// +// in_var emb in_var emb +// | | | | +// lookup_table lookup_table +// | | +// lkt_var lkt_var +// \ / +// elementwise_add +// | +// elt_out_var +// +struct TrtEmbedding2Eltwise1Pattern : public PatternBase { + TrtEmbedding2Eltwise1Pattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, "embedding2_eltwise1") {} + + void operator()(); + PATTERN_DECL_NODE(feed1); + PATTERN_DECL_NODE(feed2); + PATTERN_DECL_NODE(lookup_table1_x); + PATTERN_DECL_NODE(lookup_table2_x); + PATTERN_DECL_NODE(lookup_table1_w); + PATTERN_DECL_NODE(lookup_table2_w); + PATTERN_DECL_NODE(lookup_table1); + PATTERN_DECL_NODE(lookup_table2); + PATTERN_DECL_NODE(lookup_table1_out); + PATTERN_DECL_NODE(lookup_table2_out); + PATTERN_DECL_NODE(eltwise_add); + PATTERN_DECL_NODE(eltwise_add_out); +}; + +// detect repeats inner pattern +// +// elt_out_var in_var emb +// \ | | +// \ lookup_table +// \ | +// \ lkt_var +// \ / +// elementwise_add +// | +// elt_out_var +// +struct TrtEmbedding1Eltwise1Pattern : public PatternBase { + TrtEmbedding1Eltwise1Pattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, "embedding1_eltwise1") {} + void operator()(); + PATTERN_DECL_NODE(feed1); + PATTERN_DECL_NODE(lookup_table1_x); + PATTERN_DECL_NODE(lookup_table1_w); + PATTERN_DECL_NODE(lookup_table1); + PATTERN_DECL_NODE(lookup_table1_out); + PATTERN_DECL_NODE(eltwise_add_in); + PATTERN_DECL_NODE(eltwise_add); + PATTERN_DECL_NODE(eltwise_add_out); +}; + +// detect end pattern +// +// elementwise_add +// | +// elt_out_var +// scale | bias +// \ | / +// layer_norm +// +struct TrtSkipLayerNorm : public PatternBase { + TrtSkipLayerNorm(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "skip_layernorm") {} + void operator()(); + PATTERN_DECL_NODE(eltwise_add); + PATTERN_DECL_NODE(eltwise_add_out); + PATTERN_DECL_NODE(layer_norm); + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_out); + // Delete the mean and var nodes in the graph. + PATTERN_DECL_NODE(layer_norm_mean); + PATTERN_DECL_NODE(layer_norm_variance); +}; +} // namespace patterns + +// The TrtEmbeddingEltwiseLayerNormFusePass detect the following pattern: +// +// inputs operator output +// -------------------------------------------------------------------- +// (word, weights_0) lookup_table -> word_emb +// (pos, weights_1) lookup_table -> pos_emb +// (sent, weights_2) lookup_table -> sent_emb +// (word_emb, pos_emb) elementweise_add -> elementwise_out_0 +// (elemtwise_out_0, sent_emb) elementweise_add -> elementwise_out_1 +// (elementwise_out_1, scale, bias) layer_norm -> layer_norm_out +// +// and then convert the corresponding subgraph to: +// +// (word, pos, sent, weights_0, weights_1, weights_2, +// scale, baias) embedding_eltwise_layernorm -> layer_norm_out +// +// +// in_var emb_var in_var emb_var in_var emb_var in_var emb_var +// | | | | | | | | +// lookup_table lookup_table lookup_table ... lookup_table +// | | | | +// lkt_var lkt_var lkt_var lkt_var +// \ / | ... | +// elementwise_add | | +// \ / | +// elementwise_add | +// | | +// elt_var / +// \ / +// elementwise_add +// | +// layer_norm + +class TrtEmbeddingEltwiseLayerNormFusePass : public FusePassBase { + public: + TrtEmbeddingEltwiseLayerNormFusePass(); + virtual ~TrtEmbeddingEltwiseLayerNormFusePass() {} + + protected: + void ApplyImpl(Graph* graph) const; + int BuildFusion(Graph* graph, const std::string& name_scope + /*const Scope* scope*/) const; + const std::string name_scope_{"trt_embedding_eltwise_layernorm_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc index d3211c0841416..a6e3780fd22c9 100644 --- a/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc @@ -16,9 +16,9 @@ #include #include + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_proto_maker.h" - #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc new file mode 100644 index 0000000000000..2e3e957fd15f1 --- /dev/null +++ b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc @@ -0,0 +1,1549 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.h" + +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +static void ReplaceOutputVar(Node* op, Node* old_var, Node* new_var) { + if (op->IsOp() && op->Op()) { + new_var->inputs.push_back(op); + for (size_t i = 0; i < op->outputs.size(); ++i) { + if (op->outputs[i] == old_var) { + op->outputs[i] = new_var; + op->Op()->RenameOutput(old_var->Name(), new_var->Name()); + } + } + } +} + +static int BuildFusion(Graph* graph, const std::string& name_scope) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. + TrtMultiHeadMatmulPattern multihead_pattern(pattern, name_scope); + + multihead_pattern(); + // Create New OpDesc + auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, + Node* mul0_out, Node* mul1_out, Node* mul2_out, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, + Node* eltadd_qk_b, Node* reshape2, + Node* reshape2_qkv_out, Node* scale, + Node* scale_out) { + auto scale_attr = BOOST_GET_CONST(float, scale->Op()->GetAttr("scale")); + // auto scale_bias = BOOST_GET_CONST(float, scale->Op()->GetAttr("bias")); + // bool after_scale = + // BOOST_GET_CONST(bool, scale->Op()->GetAttr("bias_after_scale")); + + // create multihead + OpDesc multihead_op_desc(mul0->Op()->Block()); + + // create tmp tensor + VarDesc k_var_desc(*mul1_out->Var()); + k_var_desc.SetName("K" + mul1_out->Name()); + auto* k_var_node = graph->CreateVarNode(&k_var_desc); + + VarDesc q_var_desc(*mul0_out->Var()); + q_var_desc.SetName("Q" + mul0_out->Name()); + auto* q_var_node = graph->CreateVarNode(&q_var_desc); + + VarDesc v_var_desc(*mul2_out->Var()); + v_var_desc.SetName("V" + mul2_out->Name()); + auto* v_var_node = graph->CreateVarNode(&v_var_desc); + + auto reshape_desc = reshape2->Op(); + int head_number = + BOOST_GET_CONST(std::vector, reshape_desc->GetAttr("shape")).at(2); + + ReplaceOutputVar(mul0, mul0_out, q_var_node); + ReplaceOutputVar(mul1, mul1_out, k_var_node); + ReplaceOutputVar(mul2, mul2_out, v_var_node); + + multihead_op_desc.SetType("multihead_matmul"); + multihead_op_desc.SetInput("Q", {q_var_node->Name()}); + multihead_op_desc.SetInput("K", {k_var_node->Name()}); + multihead_op_desc.SetInput("V", {v_var_node->Name()}); + + multihead_op_desc.SetInput("BiasQ", {eltadd0_b->Name()}); + multihead_op_desc.SetInput("BiasK", {eltadd1_b->Name()}); + multihead_op_desc.SetInput("BiasV", {eltadd2_b->Name()}); + multihead_op_desc.SetInput("BiasQK", {eltadd_qk_b->Name()}); + + multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()}); + multihead_op_desc.SetAttr("alpha", scale_attr); + multihead_op_desc.SetAttr("head_number", head_number); + + auto* multihead = graph->CreateOpNode(&multihead_op_desc); + IR_NODE_LINK_TO(q_var_node, multihead); + IR_NODE_LINK_TO(k_var_node, multihead); + IR_NODE_LINK_TO(v_var_node, multihead); + + IR_NODE_LINK_TO(eltadd0_b, multihead); + IR_NODE_LINK_TO(eltadd1_b, multihead); + IR_NODE_LINK_TO(eltadd2_b, multihead); + IR_NODE_LINK_TO(eltadd_qk_b, multihead); + + IR_NODE_LINK_TO(multihead, reshape2_qkv_out); + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_w, mul0_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0, reshape2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, reshape2_0_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0, transpose2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, transpose2_0_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(scale, scale, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(scale_out, scale_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul1, mul1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_out, mul1_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_w, mul1_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1, reshape2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1_out, reshape2_1_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1, transpose2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1_out, transpose2_1_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul2, mul2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul2_out, mul2_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul2_w, mul2_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2, reshape2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2_out, reshape2_2_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2, transpose2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2_out, transpose2_2_out, + multihead_pattern); + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH(eltadd0, eltadd0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_b, eltadd0_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_out, eltadd0_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd1, eltadd1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_b, eltadd1_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_out, eltadd1_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd2, eltadd2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_b, eltadd2_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_out, eltadd2_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk, matmul_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk_out, matmul_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk, eltadd_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_b, eltadd_qk_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_out, eltadd_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk, softmax_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv, matmul_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, matmul_qkv_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv, reshape2_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, reshape2_qkv_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, transpose2_qkv, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out, + multihead_pattern); + + fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, + eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b, reshape2_0, + reshape2_qkv_out, scale, scale_out); + + std::unordered_set marked_nodes( + {eltadd0, + eltadd1, + eltadd2, + eltadd0_out, + eltadd1_out, + eltadd2_out, + reshape2_0, + reshape2_1, + reshape2_2, + reshape2_0_out, + reshape2_1_out, + reshape2_2_out, + transpose2_0, + transpose2_1, + transpose2_2, + transpose2_0_out, + transpose2_1_out, + transpose2_2_out, + matmul_qk, + matmul_qk_out, + eltadd_qk, + eltadd_qk_out, + softmax_qk, + softmax_qk_out, // dropout_qk, dropout_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + mul0_out, + mul1_out, + mul2_out, + reshape2_qkv, + scale}); + // Remove unneeded nodes. + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + gpd(graph, handler); + + return fusion_count; +} + +PDNode* TrtMultiHeadMatmulPattern::operator()() { + auto* input0 = pattern->NewNode(input0_repr()); + input0->assert_is_op_input("mul"); + + // First path with scale + auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("mul"); + auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) + ->AsInput() + ->assert_is_op_input("mul", "Y"); + auto* mul0_out_var = + pattern->NewNode(mul0_out_repr())->assert_is_op_output("mul"); + + decltype(mul0) eltadd0; + decltype(mul0) eltadd0_b_var; + decltype(mul0) eltadd0_out_var; + + mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + eltadd0 = pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd0_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_0 = + pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + + auto* reshape2_0_out_var = + pattern->NewNode(reshape2_0_out_repr())->assert_is_op_output("reshape2"); + reshape2_0_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_0_out_var->AsIntermediate()->assert_is_op_input("scale"); + + auto* scale = pattern->NewNode(scale_repr())->assert_is_op("scale"); + auto* scale_out_var = + pattern->NewNode(scale_out_repr())->assert_is_op_output("scale"); + scale_out_var->AsIntermediate()->assert_is_op_input("matmul"); + + auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk_out_var = + pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto* eltadd_qk = + pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); + auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd_qk_out_var->AsIntermediate()->assert_is_op_input("softmax"); + + auto* softmax_qk = + pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = + pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax"); + softmax_qk_out_var->AsIntermediate()->assert_is_op_input("matmul"); + + auto* matmul_qkv = + pattern->NewNode(matmul_qkv_repr())->assert_is_op("matmul"); + auto* matmul_qkv_out_var = + pattern->NewNode(matmul_qkv_out_repr())->assert_is_op_output("matmul"); + matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_qkv = + pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2"); + reshape2_qkv_out_var->assert_is_op_input("mul"); + + // Second path to matmul + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("mul"); + auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) + ->AsInput() + ->assert_is_op_input("mul", "Y"); + auto* mul1_out_var = + pattern->NewNode(mul1_out_repr())->assert_is_op_output("mul"); + + decltype(mul1) eltadd1; + decltype(mul1) eltadd1_b_var; + decltype(mul1) eltadd1_out_var; + + mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); + eltadd1_b_var = pattern->NewNode(eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd1_out_var = pattern->NewNode(eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd1_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_1 = + pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2"); + + auto* reshape2_1_out_var = + pattern->NewNode(reshape2_1_out_repr())->assert_is_op_output("reshape2"); + reshape2_1_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_1 = + pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); + auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_1_out_var->AsIntermediate()->assert_is_op_input( + "matmul"); // link to matmul qk + + // Third path to matmul + auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("mul"); + auto* mul2_w_var = pattern->NewNode(mul2_w_repr()) + ->AsInput() + ->assert_is_op_input("mul", "Y"); + auto* mul2_out_var = + pattern->NewNode(mul2_out_repr())->assert_is_op_output("mul"); + + decltype(mul2) eltadd2; + decltype(mul2) eltadd2_b_var; + decltype(mul2) eltadd2_out_var; + + mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add"); + eltadd2_b_var = pattern->NewNode(eltadd2_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd2_out_var = pattern->NewNode(eltadd2_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd2_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_2 = + pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2"); + + auto* reshape2_2_out_var = + pattern->NewNode(reshape2_2_out_repr())->assert_is_op_output("reshape2"); + reshape2_2_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_2 = + pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); + auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_2_out_var->AsIntermediate()->assert_is_op_input( + "matmul"); // link to matmul qkv + + // Q path + mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var}); + eltadd0->LinksFrom({mul0_out_var, eltadd0_b_var}).LinksTo({eltadd0_out_var}); + + reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); + transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); + scale->LinksFrom({transpose2_0_out_var}).LinksTo({scale_out_var}); + // K path + mul1->LinksFrom({input0, mul1_w_var}).LinksTo({mul1_out_var}); + eltadd1->LinksFrom({mul1_out_var, eltadd1_b_var}).LinksTo({eltadd1_out_var}); + reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var}); + transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var}); + // compute q*k + matmul_qk->LinksFrom({scale_out_var, transpose2_1_out_var}) + .LinksTo({matmul_qk_out_var}); + eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) + .LinksTo({eltadd_qk_out_var}); + softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); + // V path + mul2->LinksFrom({input0, mul2_w_var}).LinksTo({mul2_out_var}); + eltadd2->LinksFrom({mul2_out_var, eltadd2_b_var}).LinksTo({eltadd2_out_var}); + reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var}); + transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var}); + // compute q*k*v + matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var}) + .LinksTo({matmul_qkv_out_var}); + transpose2_qkv->LinksFrom({matmul_qkv_out_var}) + .LinksTo({transpose2_qkv_out_var}); + reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) + .LinksTo({reshape2_qkv_out_var}); + + return transpose2_2_out_var; +} + +PDNode* TrtMultiHeadMatmulV3Pattern::operator()() { + std::unordered_set matmul_ops{"matmul", "matmul_v2"}; + auto* input0 = pattern->NewNode(input0_repr()); + input0->assert_is_ops_input(matmul_ops); + + // First path with scale + auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_ops(matmul_ops); + auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) + ->AsInput() + ->assert_is_ops_input(matmul_ops, "Y"); + auto* mul0_out_var = + pattern->NewNode(mul0_out_repr())->assert_is_ops_output(matmul_ops); + + decltype(mul0) eltadd0; + decltype(mul0) eltadd0_b_var; + decltype(mul0) eltadd0_out_var; + + mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + eltadd0 = pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd0_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_0 = + pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + + auto* reshape2_0_out_var = + pattern->NewNode(reshape2_0_out_repr())->assert_is_op_output("reshape2"); + reshape2_0_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_0_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops, "X"); + + auto* matmul_qk = + pattern->NewNode(matmul_qk_repr())->assert_is_ops(matmul_ops); + auto* matmul_qk_out_var = + pattern->NewNode(matmul_qk_out_repr())->assert_is_ops_output(matmul_ops); + matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto* eltadd_qk = + pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); + auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd_qk_out_var->AsIntermediate()->assert_is_op_input("softmax"); + + auto* softmax_qk = + pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = + pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax"); + softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); + + auto* matmul_qkv = + pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops); + auto* matmul_qkv_out_var = + pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops); + matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_qkv = + pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2"); + reshape2_qkv_out_var->assert_is_ops_input(matmul_ops); + // Second path to matmul + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_ops(matmul_ops); + auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) + ->AsInput() + ->assert_is_ops_input(matmul_ops, "Y"); + auto* mul1_out_var = + pattern->NewNode(mul1_out_repr())->assert_is_ops_output(matmul_ops); + + decltype(mul1) eltadd1; + decltype(mul1) eltadd1_b_var; + decltype(mul1) eltadd1_out_var; + + mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); + eltadd1_b_var = pattern->NewNode(eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd1_out_var = pattern->NewNode(eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd1_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_1 = + pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2"); + + auto* reshape2_1_out_var = + pattern->NewNode(reshape2_1_out_repr())->assert_is_op_output("reshape2"); + reshape2_1_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_1 = + pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); + auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_1_out_var->AsIntermediate()->assert_is_ops_input( + matmul_ops, "Y"); // link to matmul qk + + // Third path to matmul + auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_ops(matmul_ops); + auto* mul2_w_var = pattern->NewNode(mul2_w_repr()) + ->AsInput() + ->assert_is_ops_input(matmul_ops, "Y"); + auto* mul2_out_var = + pattern->NewNode(mul2_out_repr())->assert_is_ops_output(matmul_ops); + + decltype(mul2) eltadd2; + decltype(mul2) eltadd2_b_var; + decltype(mul2) eltadd2_out_var; + + mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add"); + eltadd2_b_var = pattern->NewNode(eltadd2_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd2_out_var = pattern->NewNode(eltadd2_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd2_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_2 = + pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2"); + + auto* reshape2_2_out_var = + pattern->NewNode(reshape2_2_out_repr())->assert_is_op_output("reshape2"); + reshape2_2_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_2 = + pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); + auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_2_out_var->AsIntermediate()->assert_is_ops_input( + matmul_ops); // link to matmul qkv + + // Q path + mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var}); + eltadd0->LinksFrom({mul0_out_var, eltadd0_b_var}).LinksTo({eltadd0_out_var}); + + reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); + transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); + // K path + mul1->LinksFrom({input0, mul1_w_var}).LinksTo({mul1_out_var}); + eltadd1->LinksFrom({mul1_out_var, eltadd1_b_var}).LinksTo({eltadd1_out_var}); + reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var}); + transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var}); + // compute q*k + matmul_qk->LinksFrom({transpose2_0_out_var, transpose2_1_out_var}) + .LinksTo({matmul_qk_out_var}); + eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) + .LinksTo({eltadd_qk_out_var}); + softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); + // V path + mul2->LinksFrom({input0, mul2_w_var}).LinksTo({mul2_out_var}); + eltadd2->LinksFrom({mul2_out_var, eltadd2_b_var}).LinksTo({eltadd2_out_var}); + reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var}); + transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var}); + // compute q*k*v + matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var}) + .LinksTo({matmul_qkv_out_var}); + transpose2_qkv->LinksFrom({matmul_qkv_out_var}) + .LinksTo({transpose2_qkv_out_var}); + reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) + .LinksTo({reshape2_qkv_out_var}); + + return transpose2_2_out_var; +} +} // namespace patterns + +void TrtMultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + + int fusion_count = patterns::BuildFusion(graph, name_scope_); + AddStatis(fusion_count); +} + +TrtMultiHeadMatmulV2FusePass::TrtMultiHeadMatmulV2FusePass() { + AddOpCompat(OpCompat("mul")) + .AddInput("X") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddInput("Y") // the shape shoule be (N*H, N*H) + .IsTensor() + .End() + .AddOutput("Out") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumEQ(2) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + // in bias, shape is (B, S, N*H), + // in biasqk, shape is (B, H, S, S) + .IsTensor() + .End() + .AddInput("Y") + // in bias, shape is (N*H) + // in biasqk, shape is (B, H, S, S) + .IsTensor() + .End() + // in bias, shape is (B, S, N*H) + // in biasqk, shape is (B, H, S, S) + .AddOutput("Out") + .IsTensor() + .End() + // in bias, it equal to 2 + // in biasqk, it equal to -1 or 0 + .AddAttr("axis") + .IsIntIn({2, -1, 0}) + .End(); + + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("shape") // -->(B, S, H, N) <--(B, S, N*H) + .IsType>() + .End(); + + // -->: (B, S, H, N) -> (B, H, S, N) + // <--: (B, H, S, N) -> (B, S, H, N) + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("axis") // {0, 2, 1, 3} + .IsType>() + .End(); + + AddOpCompat(OpCompat("scale")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("scale") + .IsType() // copy to new op. so unconstrained. + .End() + .AddAttr("bias") + .IsNumEQ(0.f) + .End() + .AddAttr("bias_after_scale") // bias is 0, so unconstrained. + .IsType() + .End(); + + // QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S) + // QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N) + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumEQ(1.0f) + .End() + .AddAttr("transpose_X") + .IsBoolEQ(false) + .End() + .AddAttr("transpose_Y") // QK(true) QKV(false) + .IsType() + .End(); + + AddOpCompat(OpCompat("softmax")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 3}) // shape is (B, H, S, S), so axis is -1 or 3 + .End(); +} + +int TrtMultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, + const std::string& name_scope, + Scope* scope) const { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. + patterns::TrtMultiHeadMatmulPattern multihead_pattern(pattern, name_scope); + + multihead_pattern(); + // Create New OpDesc + auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, + Node* mul0_out, Node* mul1_out, Node* mul2_out, + Node* mul0_w, Node* mul1_w, Node* mul2_w, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, + Node* eltadd_qk_b, Node* reshape2, + Node* reshape2_qkv_out, Node* scale, Node* scale_out, + Node* softmax_qk, Node* eltadd0, Node* eltadd1, + Node* eltadd2, Node* matmul_qk, Node* reshape2_qkv) { + auto scale_attr = BOOST_GET_CONST(float, scale->Op()->GetAttr("scale")); + + // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H) + // bias (B * S * 3 * N * H) + bias (3 * N * H) + // Transpose (B * S * 3 * N * H) -> (3 * B * N * S * H) + auto* wq_tensor = scope->FindVar(mul0_w->Name())->GetMutable(); + auto* wk_tensor = scope->FindVar(mul1_w->Name())->GetMutable(); + auto* wv_tensor = scope->FindVar(mul2_w->Name())->GetMutable(); + + auto* bq_tensor = + scope->FindVar(eltadd0_b->Name())->GetMutable(); + auto* bk_tensor = + scope->FindVar(eltadd1_b->Name())->GetMutable(); + auto* bv_tensor = + scope->FindVar(eltadd2_b->Name())->GetMutable(); + + auto* wq_data = wq_tensor->mutable_data(platform::CPUPlace()); + auto* wk_data = wk_tensor->mutable_data(platform::CPUPlace()); + auto* wv_data = wv_tensor->mutable_data(platform::CPUPlace()); + auto* bq_data = bq_tensor->mutable_data(platform::CPUPlace()); + auto* bk_data = bk_tensor->mutable_data(platform::CPUPlace()); + auto* bv_data = bv_tensor->mutable_data(platform::CPUPlace()); + + auto combined_w_dims = + phi::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + auto combined_bias_dims = phi::make_ddim({3, bq_tensor->dims()[0]}); + + // reuse the mul0_w and eltadd_0_b nodes for the combined nodes. + auto* combined_w_desc = mul0_w->Var(); + combined_w_desc->SetShape({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + combined_w_desc->SetPersistable(true); + + auto* combined_bias_desc = eltadd0_b->Var(); + combined_bias_desc->SetShape({3, bq_tensor->dims()[0]}); + combined_bias_desc->SetPersistable(true); + + framework::LoDTensor tmp_combined_w_tensor; + tmp_combined_w_tensor.Resize(combined_w_dims); + auto* tmp_combined_w_data = + tmp_combined_w_tensor.mutable_data(platform::CPUPlace()); + + std::vector w_vec = {wq_data, wk_data, wv_data}; + int dims_h = combined_w_dims[0], dims_w = combined_w_dims[2]; + // Combine the three fc weights together. + for (int i = 0; i < dims_h; i++) { + for (int j = 0; j < 3; j++) { + for (int k = 0; k < dims_w; k++) { + int out_index = i * (3 * dims_w) + j * dims_w + k; + int in_index = i * dims_w + k; + tmp_combined_w_data[out_index] = w_vec[j][in_index]; + } + } + } + + wq_tensor->Resize(combined_w_dims); + auto* new_combined_w_data = + wq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_w_data, tmp_combined_w_data, + sizeof(float) * wq_tensor->numel()); + + scope->EraseVars({mul1_w->Name(), mul2_w->Name()}); + + framework::LoDTensor tmp_combined_bias_tensor; + tmp_combined_bias_tensor.Resize(combined_bias_dims); + auto* tmp_combined_bias_data = + tmp_combined_bias_tensor.mutable_data(platform::CPUPlace()); + + size_t bias_size = bq_tensor->numel(); + memcpy(tmp_combined_bias_data, bq_data, sizeof(float) * bias_size); + memcpy(tmp_combined_bias_data + bias_size, bk_data, + sizeof(float) * bias_size); + memcpy(tmp_combined_bias_data + 2 * bias_size, bv_data, + sizeof(float) * bias_size); + + bq_tensor->Resize(combined_bias_dims); + auto* new_combined_bias_data = + bq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_bias_data, tmp_combined_bias_data, + sizeof(float) * bq_tensor->numel()); + + scope->EraseVars({eltadd1_b->Name(), eltadd2_b->Name()}); + + auto reshape_desc = reshape2->Op(); + int head_number = + BOOST_GET_CONST(std::vector, reshape_desc->GetAttr("shape")).at(2); + + OpDesc multihead_op_desc(mul0->Op()->Block()); + multihead_op_desc.SetType("multihead_matmul"); + + multihead_op_desc.SetInput("Input", {input0->Name()}); + multihead_op_desc.SetInput("W", {mul0_w->Name()}); + multihead_op_desc.SetInput("Bias", {eltadd0_b->Name()}); + multihead_op_desc.SetInput("BiasQK", {eltadd_qk_b->Name()}); + + multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()}); + multihead_op_desc.SetAttr("alpha", scale_attr); + multihead_op_desc.SetAttr("head_number", head_number); + + auto* mul0_op_desc = mul0->Op(); + + // all mul op has same input. + if (multihead_op_desc.HasAttr("Input_scale")) { + multihead_op_desc.SetAttr("Input_scale", + mul0_op_desc->GetAttr("Input_scale")); + } + auto* add0_op_desc = eltadd0->Op(); + auto* add1_op_desc = eltadd1->Op(); + auto* add2_op_desc = eltadd2->Op(); + if (add0_op_desc->HasAttr("out_threshold")) { + auto out_scale0 = + BOOST_GET_CONST(float, add0_op_desc->GetAttr("out_threshold")); + auto out_scale1 = + BOOST_GET_CONST(float, add1_op_desc->GetAttr("out_threshold")); + auto out_scale2 = + BOOST_GET_CONST(float, add2_op_desc->GetAttr("out_threshold")); + auto out_scale_max = std::max(out_scale0, out_scale1); + out_scale_max = std::max(out_scale_max, out_scale2); + multihead_op_desc.SetAttr("fc_out_threshold", out_scale_max); + } + + auto* softmax_qk_op_desc = softmax_qk->Op(); + auto* matmul_qk_op_desc = matmul_qk->Op(); + if (matmul_qk_op_desc->HasAttr("Input_scale")) { + multihead_op_desc.SetAttr("qkv2context_plugin_int8", true); + if (softmax_qk_op_desc->HasAttr("out_threshold")) { + auto qkv_plugin_scale = BOOST_GET_CONST( + float, softmax_qk_op_desc->GetAttr("out_threshold")); + multihead_op_desc.SetAttr("dp_probs", qkv_plugin_scale); + } + } + if (reshape2_qkv->Op()->HasAttr("out_threshold")) { + multihead_op_desc.SetAttr("out_threshold", + reshape2_qkv->Op()->GetAttr("out_threshold")); + } + auto* multihead = graph->CreateOpNode(&multihead_op_desc); + + IR_NODE_LINK_TO(input0, multihead); + IR_NODE_LINK_TO(mul0_w, multihead); + IR_NODE_LINK_TO(eltadd0_b, multihead); + IR_NODE_LINK_TO(eltadd_qk_b, multihead); + + IR_NODE_LINK_TO(multihead, reshape2_qkv_out); + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "Op compat check in trt_multihead_matmul_fuse_pass_v2 failed."; + return; + } + // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_w, mul0_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0, reshape2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, reshape2_0_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0, transpose2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, transpose2_0_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(scale, scale, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(scale_out, scale_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul1, mul1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_out, mul1_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_w, mul1_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1, reshape2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1_out, reshape2_1_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1, transpose2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1_out, transpose2_1_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul2, mul2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul2_out, mul2_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul2_w, mul2_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2, reshape2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2_out, reshape2_2_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2, transpose2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2_out, transpose2_2_out, + multihead_pattern); + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH(eltadd0, eltadd0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_b, eltadd0_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_out, eltadd0_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd1, eltadd1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_b, eltadd1_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_out, eltadd1_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd2, eltadd2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_b, eltadd2_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_out, eltadd2_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk, matmul_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk_out, matmul_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk, eltadd_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_b, eltadd_qk_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_out, eltadd_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk, softmax_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv, matmul_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, matmul_qkv_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv, reshape2_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, reshape2_qkv_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, transpose2_qkv, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out, + multihead_pattern); + + // If weights or biases in qkv's fc are shared by multiple multihead_matmul + // patterns, we do not support this kind of fusion, this pass will not take + // effect. + bool is_fc_params_shared = + mul0_w->outputs.size() > 1 || mul1_w->outputs.size() > 1 || + mul2_w->outputs.size() > 1 || eltadd0_b->outputs.size() > 1 || + eltadd1_b->outputs.size() > 1 || eltadd2_b->outputs.size() > 1; + if (is_fc_params_shared) { + return; + } + fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w, + mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b, + reshape2_0, reshape2_qkv_out, scale, scale_out, softmax_qk, + eltadd0, eltadd1, eltadd2, matmul_qk, reshape2_qkv); + + std::unordered_set marked_nodes({eltadd0, + eltadd1, + eltadd2, + eltadd1_b, + eltadd2_b, + eltadd0_out, + eltadd1_out, + eltadd2_out, + reshape2_0, + reshape2_1, + reshape2_2, + reshape2_0_out, + reshape2_1_out, + reshape2_2_out, + transpose2_0, + transpose2_1, + transpose2_2, + transpose2_0_out, + transpose2_1_out, + transpose2_2_out, + matmul_qk, + matmul_qk_out, + eltadd_qk, + eltadd_qk_out, + softmax_qk, + softmax_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + mul0, + mul1, + mul2, + mul0_out, + mul1_out, + mul2_out, + mul1_w, + mul2_w, + reshape2_qkv, + scale}); + // Remove unneeded nodes. + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + gpd(graph, handler); + + return fusion_count; +} + +void TrtMultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal( + "During the multiheadMatmul pass, The scope should not be null.")); + + int fusion_count = BuildFusionV2(graph, name_scope_, scope); + if (fusion_count > 0) { + bool use_varseqlen = Get("use_varseqlen"); + std::string pos_id = Get("tensorrt_transformer_posid"); + std::string mask_id = Get("tensorrt_transformer_maskid"); + + if (use_varseqlen && pos_id != "" && mask_id != "") { + if (graph->Has(framework::ir::kEmbEltwiseLayernormPass)) { + VLOG(3) << "start varseqlen trt_multihead_matmul_fuse_pass_v2"; + } else { + PADDLE_THROW(platform::errors::Fatal( + "Use transformer'varseqlen need " + "embedding_eltwise_layernorm_fuse_pass. please use no_varseqlen")); + } + } else if (!use_varseqlen && pos_id == "" && mask_id == "") { + VLOG(3) << "start no_varseqlen trt_multihead_matmul_fuse_pass_v2"; + } else { + PADDLE_THROW( + platform::errors::Fatal("Use transformer'varseqlen need config: " + "use_varseqlen, set pos_id, set " + "mask_id. Or not use varseqlen, do not set " + "pos_id, set mask_id. Please " + "reconfig")); + } + graph->Set(kMultiheadMatmulPass, new bool(true)); + } + AddStatis(fusion_count); +} + +TrtMultiHeadMatmulV3FusePass::TrtMultiHeadMatmulV3FusePass() { + AddOpCompat(OpCompat("mul")) + .AddInput("X") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddInput("Y") // the shape shoule be (N*H, N*H) + .IsTensor() + .End() + .AddOutput("Out") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumEQ(2) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + // in bias, shape is (B, S, N*H), + // in biasqk, shape is (B, H, S, S) + .IsTensor() + .End() + .AddInput("Y") + // in bias, shape is (N*H) + // in biasqk, shape is (B, H, S, S) + .IsTensor() + .End() + // in bias, shape is (B, S, N*H) + // in biasqk, shape is (B, H, S, S) + .AddOutput("Out") + .IsTensor() + .End() + // in bias, it equal to 2 + // in biasqk, it equal to -1 or 0 + .AddAttr("axis") + .IsIntIn({2, -1, 0}) + .End(); + + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("shape") // -->(B, S, H, N) <--(B, S, N*H) + .IsType>() + .End(); + + // -->: (B, S, H, N) -> (B, H, S, N) + // <--: (B, H, S, N) -> (B, S, H, N) + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("axis") // {0, 2, 1, 3} + .IsType>() + .End(); + + // QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S) + // QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N) + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsType() // QK(anyvalue, will copy to new op) QKV(1.0) + .End() + .AddAttr("transpose_X") + .IsBoolEQ(false) + .End() + .AddAttr("transpose_Y") // QK(true) QKV(false) + .IsType() + .End(); + + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsBoolEQ(false) + .End() + .AddAttr("trans_y") // QK(true) QKV(false) + .IsType() + .End(); + + AddOpCompat(OpCompat("softmax")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 3}) // shape is (B, H, S, S), so axis is -1 or 3 + .End(); +} + +int TrtMultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, + const std::string& name_scope, + Scope* scope) const { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. + patterns::TrtMultiHeadMatmulV3Pattern multihead_pattern(pattern, name_scope); + + multihead_pattern(); + // Create New OpDesc + auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, + Node* mul0_out, Node* mul1_out, Node* mul2_out, + Node* mul0_w, Node* mul1_w, Node* mul2_w, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, + Node* eltadd_qk_b, Node* reshape2, + Node* reshape2_qkv_out, Node* matmul_qk) { + auto scale_attr = BOOST_GET_CONST(float, matmul_qk->Op()->GetAttr("alpha")); + + // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H) + // bias (B * S * 3 * N * H) + bias (3 * N * H) + // Transpose (B * S * 3 * N * H) -> (3 * B * N * S * H) + auto* wq_tensor = scope->FindVar(mul0_w->Name())->GetMutable(); + auto* wk_tensor = scope->FindVar(mul1_w->Name())->GetMutable(); + auto* wv_tensor = scope->FindVar(mul2_w->Name())->GetMutable(); + + auto* bq_tensor = + scope->FindVar(eltadd0_b->Name())->GetMutable(); + auto* bk_tensor = + scope->FindVar(eltadd1_b->Name())->GetMutable(); + auto* bv_tensor = + scope->FindVar(eltadd2_b->Name())->GetMutable(); + + auto* wq_data = wq_tensor->mutable_data(platform::CPUPlace()); + auto* wk_data = wk_tensor->mutable_data(platform::CPUPlace()); + auto* wv_data = wv_tensor->mutable_data(platform::CPUPlace()); + auto* bq_data = bq_tensor->mutable_data(platform::CPUPlace()); + auto* bk_data = bk_tensor->mutable_data(platform::CPUPlace()); + auto* bv_data = bv_tensor->mutable_data(platform::CPUPlace()); + + auto combined_w_dims = + phi::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + auto combined_bias_dims = phi::make_ddim({3, bq_tensor->dims()[0]}); + + // reuse the mul0_w and eltadd_0_b nodes for the combined nodes. + auto* combined_w_desc = mul0_w->Var(); + combined_w_desc->SetShape({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + combined_w_desc->SetPersistable(true); + + auto* combined_bias_desc = eltadd0_b->Var(); + combined_bias_desc->SetShape({3, bq_tensor->dims()[0]}); + combined_bias_desc->SetPersistable(true); + + framework::LoDTensor tmp_combined_w_tensor; + tmp_combined_w_tensor.Resize(combined_w_dims); + auto* tmp_combined_w_data = + tmp_combined_w_tensor.mutable_data(platform::CPUPlace()); + + std::vector w_vec = {wq_data, wk_data, wv_data}; + int dims_h = combined_w_dims[0], dims_w = combined_w_dims[2]; + // Combine the three fc weights together. + for (int i = 0; i < dims_h; i++) { + for (int j = 0; j < 3; j++) { + for (int k = 0; k < dims_w; k++) { + int out_index = i * (3 * dims_w) + j * dims_w + k; + int in_index = i * dims_w + k; + tmp_combined_w_data[out_index] = w_vec[j][in_index]; + } + } + } + + wq_tensor->Resize(combined_w_dims); + auto* new_combined_w_data = + wq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_w_data, tmp_combined_w_data, + sizeof(float) * wq_tensor->numel()); + + scope->EraseVars({mul1_w->Name(), mul2_w->Name()}); + + framework::LoDTensor tmp_combined_bias_tensor; + tmp_combined_bias_tensor.Resize(combined_bias_dims); + auto* tmp_combined_bias_data = + tmp_combined_bias_tensor.mutable_data(platform::CPUPlace()); + + size_t bias_size = bq_tensor->numel(); + memcpy(tmp_combined_bias_data, bq_data, sizeof(float) * bias_size); + memcpy(tmp_combined_bias_data + bias_size, bk_data, + sizeof(float) * bias_size); + memcpy(tmp_combined_bias_data + 2 * bias_size, bv_data, + sizeof(float) * bias_size); + + bq_tensor->Resize(combined_bias_dims); + auto* new_combined_bias_data = + bq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_bias_data, tmp_combined_bias_data, + sizeof(float) * bq_tensor->numel()); + + scope->EraseVars({eltadd1_b->Name(), eltadd2_b->Name()}); + + auto reshape_desc = reshape2->Op(); + int head_number = + BOOST_GET_CONST(std::vector, reshape_desc->GetAttr("shape")).at(2); + + OpDesc multihead_op_desc(mul0->Op()->Block()); + multihead_op_desc.SetType("multihead_matmul"); + + multihead_op_desc.SetInput("Input", {input0->Name()}); + multihead_op_desc.SetInput("W", {mul0_w->Name()}); + multihead_op_desc.SetInput("Bias", {eltadd0_b->Name()}); + multihead_op_desc.SetInput("BiasQK", {eltadd_qk_b->Name()}); + + multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()}); + multihead_op_desc.SetAttr("alpha", scale_attr); + multihead_op_desc.SetAttr("head_number", head_number); + + auto* multihead = graph->CreateOpNode(&multihead_op_desc); + + IR_NODE_LINK_TO(input0, multihead); + IR_NODE_LINK_TO(mul0_w, multihead); + IR_NODE_LINK_TO(eltadd0_b, multihead); + IR_NODE_LINK_TO(eltadd_qk_b, multihead); + + IR_NODE_LINK_TO(multihead, reshape2_qkv_out); + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_w, mul0_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0, reshape2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, reshape2_0_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0, transpose2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, transpose2_0_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul1, mul1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_out, mul1_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_w, mul1_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1, reshape2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1_out, reshape2_1_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1, transpose2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1_out, transpose2_1_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul2, mul2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul2_out, mul2_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul2_w, mul2_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2, reshape2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2_out, reshape2_2_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2, transpose2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2_out, transpose2_2_out, + multihead_pattern); + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH(eltadd0, eltadd0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_b, eltadd0_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_out, eltadd0_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd1, eltadd1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_b, eltadd1_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_out, eltadd1_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd2, eltadd2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_b, eltadd2_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_out, eltadd2_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk, matmul_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk_out, matmul_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk, eltadd_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_b, eltadd_qk_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_out, eltadd_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk, softmax_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv, matmul_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, matmul_qkv_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv, reshape2_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, reshape2_qkv_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, transpose2_qkv, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out, + multihead_pattern); + + // If weights or biases in qkv's fc are shared by multiple multihead_matmul + // patterns, we do not support this kind of fusion, this pass will not take + // effect. + bool is_fc_params_shared = + mul0_w->outputs.size() > 1 || mul1_w->outputs.size() > 1 || + mul2_w->outputs.size() > 1 || eltadd0_b->outputs.size() > 1 || + eltadd1_b->outputs.size() > 1 || eltadd2_b->outputs.size() > 1; + if (is_fc_params_shared) { + return; + } + fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w, + mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b, + reshape2_0, reshape2_qkv_out, matmul_qk); + + std::unordered_set marked_nodes({eltadd0, + eltadd1, + eltadd2, + eltadd1_b, + eltadd2_b, + eltadd0_out, + eltadd1_out, + eltadd2_out, + reshape2_0, + reshape2_1, + reshape2_2, + reshape2_0_out, + reshape2_1_out, + reshape2_2_out, + transpose2_0, + transpose2_1, + transpose2_2, + transpose2_0_out, + transpose2_1_out, + transpose2_2_out, + matmul_qk, + matmul_qk_out, + eltadd_qk, + eltadd_qk_out, + softmax_qk, + softmax_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + mul0, + mul1, + mul2, + mul0_out, + mul1_out, + mul2_out, + mul1_w, + mul2_w, + reshape2_qkv}); + // Remove unneeded nodes. + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + gpd(graph, handler); + + return fusion_count; +} + +void TrtMultiHeadMatmulV3FusePass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal( + "During the multiheadMatmul pass, The scope should not be null.")); + + int fusion_count = BuildFusionV3(graph, name_scope_, scope); + if (fusion_count > 0) { + bool use_varseqlen = Get("use_varseqlen"); + std::string pos_id = Get("tensorrt_transformer_posid"); + std::string mask_id = Get("tensorrt_transformer_maskid"); + + if (use_varseqlen && pos_id != "" && mask_id != "") { + if (graph->Has(framework::ir::kEmbEltwiseLayernormPass)) { + VLOG(3) << "start varseqlen trt_multihead_matmul_fuse_pass_v3"; + } else { + PADDLE_THROW(platform::errors::Fatal( + "Use transformer'varseqlen need " + "embedding_eltwise_layernorm_fuse_pass. please use no_varseqlen")); + } + } else if (!use_varseqlen && pos_id == "" && mask_id == "") { + VLOG(3) << "start no_varseqlen trt_multihead_matmul_fuse_pass_v3"; + } else { + PADDLE_THROW( + platform::errors::Fatal("Use transformer'varseqlen need config: " + "use_varseqlen, set pos_id, set " + "mask_id. Or not use varseqlen, do not set " + "pos_id, set mask_id. Please " + "reconfig")); + } + graph->Set(kMultiheadMatmulPass, new bool(true)); + } + AddStatis(fusion_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(trt_multihead_matmul_fuse_pass, + paddle::framework::ir::TrtMultiHeadMatmulFusePass); + +REGISTER_PASS(trt_multihead_matmul_fuse_pass_v2, + paddle::framework::ir::TrtMultiHeadMatmulV2FusePass); +REGISTER_PASS(trt_multihead_matmul_fuse_pass_v3, + paddle::framework::ir::TrtMultiHeadMatmulV3FusePass); +REGISTER_PASS_CAPABILITY(trt_multihead_matmul_fuse_pass_v2) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("mul", 0) + .LE("elementwise_add", 1) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .LE("matmul", 1) + .EQ("softmax", 0)); +REGISTER_PASS_CAPABILITY(trt_multihead_matmul_fuse_pass_v3) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .LE("matmul", 1) + .EQ("matmul_v2", 0) + .EQ("softmax", 0)); diff --git a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.h b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.h new file mode 100644 index 0000000000000..467e803b4974c --- /dev/null +++ b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.h @@ -0,0 +1,179 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct TrtMultiHeadMatmulPattern : public PatternBase { + TrtMultiHeadMatmulPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "multihead_matmul") {} + + PDNode* operator()(); + + // declare operator node's name + PATTERN_DECL_NODE(input0); + PATTERN_DECL_NODE(mul0); + PATTERN_DECL_NODE(mul1); + PATTERN_DECL_NODE(mul2); + PATTERN_DECL_NODE(mul0_w); + PATTERN_DECL_NODE(mul1_w); + PATTERN_DECL_NODE(mul2_w); + PATTERN_DECL_NODE(mul0_out); + PATTERN_DECL_NODE(mul1_out); + PATTERN_DECL_NODE(mul2_out); + PATTERN_DECL_NODE(eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_out); + PATTERN_DECL_NODE(eltadd1_out); + PATTERN_DECL_NODE(eltadd2_out); + PATTERN_DECL_NODE(reshape2_0); + PATTERN_DECL_NODE(reshape2_1); + PATTERN_DECL_NODE(reshape2_2); + PATTERN_DECL_NODE(reshape2_qkv); + PATTERN_DECL_NODE(reshape2_0_out); + PATTERN_DECL_NODE(reshape2_1_out); + PATTERN_DECL_NODE(reshape2_2_out); + PATTERN_DECL_NODE(reshape2_qkv_out); + PATTERN_DECL_NODE(transpose2_0); + PATTERN_DECL_NODE(transpose2_1); + PATTERN_DECL_NODE(transpose2_2); + PATTERN_DECL_NODE(transpose2_qkv); + PATTERN_DECL_NODE(transpose2_0_out); + PATTERN_DECL_NODE(transpose2_1_out); + PATTERN_DECL_NODE(transpose2_2_out); + PATTERN_DECL_NODE(transpose2_qkv_out); + PATTERN_DECL_NODE(scale); + PATTERN_DECL_NODE(scale_out); + PATTERN_DECL_NODE(matmul_qk); + PATTERN_DECL_NODE(matmul_qk_out); + PATTERN_DECL_NODE(eltadd_qk); + PATTERN_DECL_NODE(eltadd_qk_b); + PATTERN_DECL_NODE(eltadd_qk_out); + PATTERN_DECL_NODE(softmax_qk); + PATTERN_DECL_NODE(softmax_qk_out); + + PATTERN_DECL_NODE(matmul_qkv); + PATTERN_DECL_NODE(matmul_qkv_out); +}; + +struct TrtMultiHeadMatmulV3Pattern : public PatternBase { + TrtMultiHeadMatmulV3Pattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "multihead_matmul_v3") {} + + PDNode* operator()(); + + // declare operator node's name + PATTERN_DECL_NODE(input0); + PATTERN_DECL_NODE(mul0); + PATTERN_DECL_NODE(mul1); + PATTERN_DECL_NODE(mul2); + PATTERN_DECL_NODE(mul0_w); + PATTERN_DECL_NODE(mul1_w); + PATTERN_DECL_NODE(mul2_w); + PATTERN_DECL_NODE(mul0_out); + PATTERN_DECL_NODE(mul1_out); + PATTERN_DECL_NODE(mul2_out); + PATTERN_DECL_NODE(eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_out); + PATTERN_DECL_NODE(eltadd1_out); + PATTERN_DECL_NODE(eltadd2_out); + PATTERN_DECL_NODE(reshape2_0); + PATTERN_DECL_NODE(reshape2_1); + PATTERN_DECL_NODE(reshape2_2); + PATTERN_DECL_NODE(reshape2_qkv); + PATTERN_DECL_NODE(reshape2_0_out); + PATTERN_DECL_NODE(reshape2_1_out); + PATTERN_DECL_NODE(reshape2_2_out); + PATTERN_DECL_NODE(reshape2_qkv_out); + PATTERN_DECL_NODE(transpose2_0); + PATTERN_DECL_NODE(transpose2_1); + PATTERN_DECL_NODE(transpose2_2); + PATTERN_DECL_NODE(transpose2_qkv); + PATTERN_DECL_NODE(transpose2_0_out); + PATTERN_DECL_NODE(transpose2_1_out); + PATTERN_DECL_NODE(transpose2_2_out); + PATTERN_DECL_NODE(transpose2_qkv_out); + PATTERN_DECL_NODE(matmul_qk); + PATTERN_DECL_NODE(matmul_qk_out); + PATTERN_DECL_NODE(eltadd_qk); + PATTERN_DECL_NODE(eltadd_qk_b); + PATTERN_DECL_NODE(eltadd_qk_out); + PATTERN_DECL_NODE(softmax_qk); + PATTERN_DECL_NODE(softmax_qk_out); + + PATTERN_DECL_NODE(matmul_qkv); + PATTERN_DECL_NODE(matmul_qkv_out); +}; + +} // namespace patterns + +class TrtMultiHeadMatmulFusePass : public FusePassBase { + public: + virtual ~TrtMultiHeadMatmulFusePass() {} + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{"trt_multihead_matmul_fuse"}; +}; + +class TrtMultiHeadMatmulV2FusePass : public FusePassBase { + public: + TrtMultiHeadMatmulV2FusePass(); + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{"trt_multihead_matmul_fuse_v2"}; + + private: + int BuildFusionV2(Graph* graph, const std::string& name_scope, + Scope* scope) const; +}; + +class TrtMultiHeadMatmulV3FusePass : public FusePassBase { + public: + TrtMultiHeadMatmulV3FusePass(); + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{"trt_multihead_matmul_fuse_v3"}; + + private: + int BuildFusionV3(Graph* graph, const std::string& name_scope, + Scope* scope) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc new file mode 100644 index 0000000000000..13883909435f7 --- /dev/null +++ b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc @@ -0,0 +1,232 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.h" + +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct TrtSkipLayerNorm : public PatternBase { + TrtSkipLayerNorm(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "skip_layernorm") {} + + PDNode *operator()(PDNode *x, PDNode *y); + + // declare operator node's name + PATTERN_DECL_NODE(elementwise); + PATTERN_DECL_NODE(layer_norm); + // declare variable node's name + PATTERN_DECL_NODE( + elementwise_out); // (elementwise_input_x,elementwise_input_y) + // -> elementwise_out + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_out); + PATTERN_DECL_NODE(layer_norm_mean); + PATTERN_DECL_NODE(layer_norm_variance); +}; + +PDNode *TrtSkipLayerNorm::operator()(PDNode *x, PDNode *y) { + // Create nodes for elementwise add op. + x->assert_is_op_input("elementwise_add", "X"); + y->assert_is_op_input("elementwise_add", "Y"); + auto *elementwise = + pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add"); + auto *elementwise_out_var = + pattern->NewNode(elementwise_out_repr()) + ->AsOutput() + ->assert_is_only_output_of_op("elementwise_add"); + + // Add links for elementwise_add op. + elementwise->LinksFrom({x, y}).LinksTo({elementwise_out_var}); + + // Create nodes for layer_norm op. + elementwise_out_var->AsIntermediate()->assert_is_op_input("layer_norm"); + auto *layer_norm = + pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm"); + auto *layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto *layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + + auto *layer_norm_out_var = pattern->NewNode(layer_norm_out_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Y"); + auto *layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Mean"); + auto *layer_norm_variance_var = + pattern->NewNode(layer_norm_variance_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Variance"); + + // Add links for layer_norm op. + layer_norm + ->LinksFrom( + {elementwise_out_var, layer_norm_bias_var, layer_norm_scale_var}) + .LinksTo( + {layer_norm_out_var, layer_norm_mean_var, layer_norm_variance_var}); + return layer_norm_out_var; +} + +} // namespace patterns + +void TrtSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + FusePassBase::Init("skip_layernorm_fuse", graph); + int found_subgraph_count = 0; + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode("skip_layernorm_fuse/x") + ->AsInput() + ->assert_is_op_input("elementwise_add", "X") + ->assert_var_not_persistable(); + auto *y = gpd.mutable_pattern() + ->NewNode("skip_layernorm_fuse/y") + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y") + ->assert_var_not_persistable(); + patterns::TrtSkipLayerNorm fused_pattern(gpd.mutable_pattern(), + "skip_layernorm_fuse"); + fused_pattern(x, y); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *graph) { + if (subgraph.count(x) <= 0 || subgraph.count(y) <= 0) { + LOG(WARNING) << "The subgraph is empty."; + return; + } + + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "skip_layernorm pass in op compat failed."; + return; + } + + VLOG(4) << "handle TrtSkipLayerNorm fuse"; + GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm, layer_norm, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_bias, layer_norm_bias, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_scale, layer_norm_scale, + fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, layer_norm_out, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, layer_norm_mean, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, + fused_pattern); + + std::unordered_set del_node_set; + + // Create an TrtSkipLayerNorm op node + OpDesc new_desc(elementwise->Op()->Block()); + new_desc.SetType("skip_layernorm"); + + // inputs + new_desc.SetInput("X", {subgraph.at(x)->Name()}); + new_desc.SetInput("Y", {subgraph.at(y)->Name()}); + new_desc.SetInput("Scale", {layer_norm_scale->Name()}); + new_desc.SetInput("Bias", {layer_norm_bias->Name()}); + + if (layer_norm->Op()->HasAttr("out_threshold")) { + new_desc.SetAttr("enable_int8", true); + new_desc.SetAttr("out_threshold", + layer_norm->Op()->GetAttr("out_threshold")); + } + + // outputs + new_desc.SetOutput("Out", {layer_norm_out->Name()}); + + // attrs + new_desc.SetAttr("epsilon", layer_norm->Op()->GetAttr("epsilon")); + new_desc.SetAttr("begin_norm_axis", + layer_norm->Op()->GetAttr("begin_norm_axis")); + + auto fused_node = graph->CreateOpNode(&new_desc); // OpDesc will be copied. + + del_node_set.insert(elementwise); + del_node_set.insert(layer_norm); + del_node_set.insert(elementwise_out); + del_node_set.insert(layer_norm_mean); + del_node_set.insert(layer_norm_variance); + GraphSafeRemoveNodes(graph, del_node_set); + + IR_NODE_LINK_TO(subgraph.at(x), fused_node); + IR_NODE_LINK_TO(subgraph.at(y), fused_node); + IR_NODE_LINK_TO(layer_norm_scale, fused_node); + IR_NODE_LINK_TO(layer_norm_bias, fused_node); + IR_NODE_LINK_TO(fused_node, layer_norm_out); + + found_subgraph_count++; + }; + + gpd(graph, handler); + if (found_subgraph_count > 0) { + bool use_varseqlen = Get("use_varseqlen"); + std::string pos_id = Get("tensorrt_transformer_posid"); + std::string mask_id = Get("tensorrt_transformer_maskid"); + + if (use_varseqlen && pos_id != "" && mask_id != "") { + if (graph->Has(framework::ir::kEmbEltwiseLayernormPass) && + graph->Has(framework::ir::kMultiheadMatmulPass)) { + VLOG(3) << "start varseqlen trt_skip_layernorm_fuse_pass"; + } else { + PADDLE_THROW(platform::errors::Fatal( + "Use transformer'varseqlen need " + "embedding_eltwise_layernorm_fuse_pass. please use no_varseqlen")); + } + } else if (!use_varseqlen && pos_id == "" && mask_id == "") { + VLOG(3) << "start no_varseqlen trt_skip_layernorm_fuse_pass"; + } else { + PADDLE_THROW( + platform::errors::Fatal("Use transformer'varseqlen need config: " + "use_varseqlen, set pos_id, set " + "mask_id. Or not use varseqlen, do not set " + "pos_id, set mask_id. Please " + "reconfig")); + } + } + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(trt_skip_layernorm_fuse_pass, + paddle::framework::ir::TrtSkipLayerNormFusePass); +REGISTER_PASS_CAPABILITY(trt_skip_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("layer_norm", 0)); diff --git a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.h new file mode 100644 index 0000000000000..a299493efa0e9 --- /dev/null +++ b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +// | | | | +// other_op1 other_op2 other_op1 other_op2 +// | | fuse \ / +// |------elementwise_add -> skip_layernorm +// | | +// layer_norm other_op3 +// | | +// other_op3 +// | +class Graph; + +class TrtSkipLayerNormFusePass : public FusePassBase { + public: + TrtSkipLayerNormFusePass() { + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({0, -1}) + .End(); + + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); + } + + virtual ~TrtSkipLayerNormFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass_tester.cc b/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass_tester.cc index 067a37c611a73..3ebd61ff575e3 100644 --- a/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass_tester.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass.h" - #include + #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc index 20075a49749f7..19836b69ae9bf 100644 --- a/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc +++ b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/yolo_box_fuse_pass.h" + #include + #include "glog/logging.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 1c5c12b3d57df..dd316a0979cc7 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h index 36a5c3c5d6013..7aa180ed75ce2 100644 --- a/paddle/fluid/framework/lod_tensor_array.h +++ b/paddle/fluid/framework/lod_tensor_array.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/lod_tensor.h" namespace paddle { diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index a89baac3e7a10..254e70231ea4e 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/lod_tensor.h" + #include #include -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/phi/core/lod_utils.h" namespace paddle { diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index dba3b3ff1e690..1c2740c2b2ee7 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/naive_executor.h" + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/denormal.h" diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc index 2f3c3f3d06e32..763e314d226e6 100644 --- a/paddle/fluid/framework/naive_executor_test.cc +++ b/paddle/fluid/framework/naive_executor_test.cc @@ -13,8 +13,11 @@ // limitations under the License. #include "paddle/fluid/framework/naive_executor.h" + #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index 6046000739976..44d540769f2da 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -1,76 +1,136 @@ -set(INTERPRETERCORE_DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog -lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method -graph_to_program_pass variable_helper timer monitor nan_inf_utils) - +set(INTERPRETERCORE_DEPS + op_registry + device_context + scope + framework_proto + data_feed_proto + heter_service_proto + trainer_desc_proto + glog + lod_rank_table + fs + shell + fleet_wrapper + heter_wrapper + ps_gpu_wrapper + box_wrapper + lodtensor_printer + feed_fetch_method + graph_to_program_pass + variable_helper + timer + monitor + nan_inf_utils) add_subdirectory(workqueue) add_subdirectory(garbage_collector) -cc_library(data_transfer SRCS data_transfer.cc DEPS enforce scope glog) -cc_library(new_executor_defs SRCS new_executor_defs.cc DEPS enforce glog scope) -cc_library(interpretercore_util SRCS interpretercore_util.cc DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs data_transfer) -cc_library(event_manager SRCS event_manager.cc DEPS ${DEVICE_EVENT_LIBS} glog new_executor_defs) -cc_library(stream_analyzer SRCS stream_analyzer.cc DEPS ${DEVICE_EVENT_LIBS} glog device_context new_executor_defs) +cc_library( + data_transfer + SRCS data_transfer.cc + DEPS enforce scope glog) +cc_library( + new_executor_defs + SRCS new_executor_defs.cc + DEPS enforce glog scope) +cc_library( + interpretercore_util + SRCS interpretercore_util.cc + DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs data_transfer) +cc_library( + event_manager + SRCS event_manager.cc + DEPS ${DEVICE_EVENT_LIBS} glog new_executor_defs) +cc_library( + stream_analyzer + SRCS stream_analyzer.cc + DEPS ${DEVICE_EVENT_LIBS} glog device_context new_executor_defs) if(WITH_GPU OR WITH_ROCM) -cc_library(interpretercore SRCS interpretercore.cc DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util interpretercore_event_garbage_collector interpretercore_fast_garbage_collector stream_analyzer event_manager) + cc_library( + interpretercore + SRCS interpretercore.cc + DEPS workqueue + ${DEVICE_EVENT_LIBS} + interpretercore_util + interpretercore_event_garbage_collector + interpretercore_fast_garbage_collector + stream_analyzer + event_manager) else() -cc_library(interpretercore SRCS interpretercore.cc DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util interpretercore_event_garbage_collector stream_analyzer event_manager) + cc_library( + interpretercore + SRCS interpretercore.cc + DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util + interpretercore_event_garbage_collector stream_analyzer event_manager) endif() -cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore) +cc_library( + standalone_executor + SRCS standalone_executor.cc + DEPS interpretercore) -cc_library(staticgraph_executor_statistics SRCS executor_statistics.cc DEPS enforce glog os_info) +cc_library( + staticgraph_executor_statistics + SRCS executor_statistics.cc + DEPS enforce glog os_info) # cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) # skip win32 since wget is not installed by default on windows machine. -if (WITH_GPU AND WITH_TESTING AND NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") - add_custom_target( - download_program - COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_main_program - COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_startup_program - ) - - # all operators used in the program - set(OPS - fill_constant_op - uniform_random_op - lookup_table_op - transpose_op - reshape_op - split_op - slice_op - concat_op - matmul_op - elementwise_add_op - elementwise_mul_op - softmax_with_cross_entropy_op - reduce_mean_op - reduce_sum_op - activation_op - sum_op - elementwise_max_op - elementwise_div_op - sgd_op - squared_l2_norm_op - memcpy_h2d_op - memcpy_d2h_op) - - # All deps of the operators above, part of GLOB_OPERATOR_DEPS. - set(OP_DEPS - generator - softmax - selected_rows_functor - jit_kernel_helper - concat_and_split - cross_entropy) +if(WITH_GPU + AND WITH_TESTING + AND NOT WIN32 + AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + add_custom_target( + download_program + COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_main_program + COMMAND wget -nc + https://paddle-ci.gz.bcebos.com/new_exec/lm_startup_program) + + # all operators used in the program + set(OPS + fill_constant_op + uniform_random_op + lookup_table_op + transpose_op + reshape_op + split_op + slice_op + concat_op + matmul_op + elementwise_add_op + elementwise_mul_op + softmax_with_cross_entropy_op + reduce_mean_op + reduce_sum_op + activation_op + sum_op + elementwise_max_op + elementwise_div_op + sgd_op + squared_l2_norm_op + memcpy_h2d_op + memcpy_d2h_op) + + # All deps of the operators above, part of GLOB_OPERATOR_DEPS. + set(OP_DEPS generator softmax selected_rows_functor jit_kernel_helper + concat_and_split cross_entropy) - cc_test(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${OPS} ${OP_DEPS}) - set_tests_properties(standalone_executor_test PROPERTIES TIMEOUT 100) + cc_test( + standalone_executor_test + SRCS standalone_executor_test.cc + DEPS interpretercore + standalone_executor + operator + op_registry + executor + ${OPS} + ${OP_DEPS}) + set_tests_properties(standalone_executor_test PROPERTIES TIMEOUT 100) - add_dependencies(standalone_executor_test download_program) - if (WITH_PROFILER) - target_link_libraries(standalone_executor_test profiler) - add_dependencies(standalone_executor_test profiler) - endif() + add_dependencies(standalone_executor_test download_program) + if(WITH_PROFILER) + target_link_libraries(standalone_executor_test profiler) + add_dependencies(standalone_executor_test profiler) + endif() endif() diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index d0e5565139c54..171e15162fb45 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/data_transfer.h" + #include "paddle/fluid/framework/convert_utils.h" namespace paddle { @@ -276,9 +277,9 @@ std::shared_ptr TransferDevice(const std::string& var_name, // 2. Construct VariableNameMap VariableNameMap in_name_map = {{"X", {var_name}}}; VariableNameMap out_name_map = {{"Out", {*new_var_name}}}; - int dst_place_type = platform::is_cpu_place(dst_place) - ? 0 - : platform::is_gpu_place(dst_place) ? 1 : -1; + int dst_place_type = platform::is_cpu_place(dst_place) ? 0 + : platform::is_gpu_place(dst_place) ? 1 + : -1; AttributeMap attr_map = {{"dst_place_type", dst_place_type}}; // 3. Create memcpy_d2h_op or memcpy_h2d_op diff --git a/paddle/fluid/framework/new_executor/event_manager.cc b/paddle/fluid/framework/new_executor/event_manager.cc index bca2264b66afc..0bfa00494d611 100644 --- a/paddle/fluid/framework/new_executor/event_manager.cc +++ b/paddle/fluid/framework/new_executor/event_manager.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/event_manager.h" + #include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc index fb79712d47d9e..f6afcf2f24d18 100644 --- a/paddle/fluid/framework/new_executor/executor_statistics.cc +++ b/paddle/fluid/framework/new_executor/executor_statistics.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/executor_statistics.h" + #include #include #include @@ -21,6 +22,7 @@ #include #include #include + #include "glog/logging.h" #include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/os_info.h" @@ -520,7 +522,7 @@ void StatisticsEngine::MergeEvents(std::function merger, int StatisticsEngine::MergeInnerthreadEvents( std::vector>* all_evts) { - auto merger = [& priorities = priorities_](size_t idx1, size_t idx2) { + auto merger = [&priorities = priorities_](size_t idx1, size_t idx2) { return priorities[idx1].innerthread_priority <= priorities[idx2].innerthread_priority ? idx1 @@ -541,7 +543,7 @@ int StatisticsEngine::MergeInnerthreadEvents( int StatisticsEngine::MergeInterthreadEvents( std::vector>* all_evts) { - auto merger = [& priorities = priorities_](size_t idx1, size_t idx2) { + auto merger = [&priorities = priorities_](size_t idx1, size_t idx2) { return priorities[idx1].interthread_priority <= priorities[idx2].interthread_priority ? idx1 diff --git a/paddle/fluid/framework/new_executor/executor_statistics.h b/paddle/fluid/framework/new_executor/executor_statistics.h index 530e9455968a8..ebe9d3a2e7925 100644 --- a/paddle/fluid/framework/new_executor/executor_statistics.h +++ b/paddle/fluid/framework/new_executor/executor_statistics.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/platform/profiler/event_node.h" namespace paddle { diff --git a/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt b/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt index 2033eba88f9d1..359c56c561a4d 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt @@ -1,10 +1,22 @@ -cc_library(interpretercore_garbage_collector SRCS garbage_collector.cc DEPS garbage_collector) -cc_library(interpretercore_event_garbage_collector SRCS event_garbage_collector.cc DEPS interpretercore_garbage_collector) +cc_library( + interpretercore_garbage_collector + SRCS garbage_collector.cc + DEPS garbage_collector) +cc_library( + interpretercore_event_garbage_collector + SRCS event_garbage_collector.cc + DEPS interpretercore_garbage_collector) if(WITH_GPU OR WITH_ROCM) - if(WITH_GPU) - nv_library(interpretercore_fast_garbage_collector SRCS fast_garbage_collector.cc DEPS interpretercore_garbage_collector) - elseif(WITH_ROCM) - hip_library(interpretercore_fast_garbage_collector SRCS fast_garbage_collector.cc DEPS interpretercore_garbage_collector) - endif() + if(WITH_GPU) + nv_library( + interpretercore_fast_garbage_collector + SRCS fast_garbage_collector.cc + DEPS interpretercore_garbage_collector) + elseif(WITH_ROCM) + hip_library( + interpretercore_fast_garbage_collector + SRCS fast_garbage_collector.cc + DEPS interpretercore_garbage_collector) + endif() endif() diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc index 46c85a22dc3a3..1ae9f4223d3d9 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc @@ -110,7 +110,7 @@ void InterpreterCoreEventGarbageCollector::Free( const platform::DeviceContext* ctx) { event->Record(ctx); event->SetFininshed(); // Only for CPU Event - queue_->AddTask([ container = garbages, event = event ]() { + queue_->AddTask([container = garbages, event = event]() { while (!event->Query()) { #if defined(_WIN32) SleepEx(50, FALSE); @@ -128,7 +128,7 @@ void InterpreterCoreEventGarbageCollector::Free( const platform::DeviceContext* ctx) { event->Record(ctx); event->SetFininshed(); // Only for CPU Event - queue_->AddTask([ container = garbage, event = event ]() { + queue_->AddTask([container = garbage, event = event]() { while (!event->Query()) { #if defined(_WIN32) SleepEx(50, FALSE); diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h index 33954713d4e9f..57963269663d0 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h +++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h" #include "paddle/fluid/framework/new_executor/workqueue/workqueue.h" diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc index a20cd27539848..8e849c79bd235 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h" + #include "paddle/fluid/framework/garbage_collector.h" namespace paddle { diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h index 34f95eee7316d..d0159c0ca83e5 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h +++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/device_event.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index da2fd0c8c6114..fe0c7fe072178 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/interpretercore.h" + #include + #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" #include "paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h" @@ -585,10 +587,12 @@ void InterpreterCore::ExecuteInstructionList( for (size_t i = 0; i < dependecy_count_.size(); ++i) { if (dependecy_count_[i] == 0) { - async_work_queue_->AddTask(vec_instr.at(i).KernelType(), [ - this, i, atomic_deps = atomic_deps.get(), - atomic_var_ref = atomic_var_ref.get() - ] { RunInstructionAsync(i, atomic_deps, atomic_var_ref); }); + async_work_queue_->AddTask(vec_instr.at(i).KernelType(), + [this, i, atomic_deps = atomic_deps.get(), + atomic_var_ref = atomic_var_ref.get()] { + RunInstructionAsync(i, atomic_deps, + atomic_var_ref); + }); } } @@ -692,10 +696,10 @@ void InterpreterCore::RunInstructionAsync( ready_ops.pop(); auto& instr_node = vec_instruction_.at(instr_id); VLOG(5) << __func__ << " OP id:" << instr_node.Id() - << " name:" << instr_node.OpBase()->Type() - << " type:" << (instr_node.KernelType() == OpFuncType::kQueueSync - ? "kQueueSync" - : "kQueueAsync") + << " name:" << instr_node.OpBase()->Type() << " type:" + << (instr_node.KernelType() == OpFuncType::kQueueSync + ? "kQueueSync" + : "kQueueAsync") << " runs on " << platform::GetCurrentThreadName(); auto* op = instr_node.OpBase(); diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index f601a4ad28bd7..0b75964b94e91 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/new_executor/interpretercore_util.h" + #include #include "paddle/fluid/framework/executor_gc_helper.h" @@ -398,9 +399,10 @@ void build_op_func_list(const platform::Place& place, // But some OPs do have such behavior (e.g., cinn_launch OP). Here special // treatment for them. if (op_with_kernel->Type() == "cinn_launch") { - VLOG(6) << "OP(" << op_with_kernel->Type() << ") use scope in kernel, " - "so pass a real scope to " - "ExecutionContext"; + VLOG(6) << "OP(" << op_with_kernel->Type() + << ") use scope in kernel, " + "so pass a real scope to " + "ExecutionContext"; runtime_scope = local_scope; } @@ -747,8 +749,9 @@ std::map> get_downstream_map( std::map> build_op_downstream_map( const std::vector& vec_instruction, std::vector>* op_happens_before) { - auto var2min_rw_op = std::map< - int, std::list>(); // # map from variable id to read / write op id. + auto var2min_rw_op = + std::map>(); // # map from variable id to read / + // write op id. auto var2recent_write_op = std::map(); // # map from variable to recent write op. auto op2dependences = @@ -825,8 +828,14 @@ std::map> build_op_downstream_map( // add dependences for random op, make sure that the random op is scheduled // sequentially const std::set random_op_set = { - "bernoulli", "poisson", "multinomial", "gaussian_random", - "truncated_gaussian_random", "uniform_random", "randint", "randperm", + "bernoulli", + "poisson", + "multinomial", + "gaussian_random", + "truncated_gaussian_random", + "uniform_random", + "randint", + "randperm", "exponential", "sampling_id" "dropout", @@ -846,7 +855,10 @@ std::map> build_op_downstream_map( // add dependency for communication op auto is_comm_op = [](std::string op) -> bool { const std::set special_comm_op_set = { - "send", "recv", "send_v2", "recv_v2", + "send", + "recv", + "send_v2", + "recv_v2", }; const std::string communication_op_prefix = "c_"; if (op.find(communication_op_prefix) != std::string::npos || diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index 60ac3702f4b3c..3d5b067c18792 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -22,10 +22,9 @@ #include #include -#include - #include #include +#include #include #include diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index c75a7871d63e9..1a4dd2edf2793 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/new_executor/new_executor_defs.h" + #include #include #include #include -#include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/phi/core/utils/rw_lock.h" // When in inference scenario, the scopes will not be written by two threads in @@ -385,10 +386,11 @@ InterpretercoreInferShapeContext::GetOutputsVarType( void InterpretercoreInferShapeContext::SetOutputDim(const std::string& name, const DDim& dim) { auto& vars = OutputVars(name); - PADDLE_ENFORCE_EQ(vars.size(), 1UL, platform::errors::InvalidArgument( - "Output(%s) should hold one element, " - "but now it holds %zu elements.", - name, vars.size())); + PADDLE_ENFORCE_EQ( + vars.size(), 1UL, + platform::errors::InvalidArgument("Output(%s) should hold one element, " + "but now it holds %zu elements.", + name, vars.size())); SetDim(vars[0], dim); } @@ -653,8 +655,9 @@ void VariableScope::CheckExist(int id) const { } void VariableScope::CheckExist(const std::string& name) const { - PADDLE_ENFORCE_EQ(HasVar(name), true, platform::errors::NotFound( - "%s not in VariableScope.", name)); + PADDLE_ENFORCE_EQ( + HasVar(name), true, + platform::errors::NotFound("%s not in VariableScope.", name)); } void VariableScope::ClearListener() { @@ -709,8 +712,9 @@ void VariableScopeListener::onClear() {} Instruction::Instruction(size_t id, OpFuncNode&& op_func_node, const platform::DeviceContext& dev_ctx) : id_(id), op_func_node_(op_func_node), dev_ctx_(dev_ctx) { - PADDLE_ENFORCE_GE(id, 0, platform::errors::PreconditionNotMet( - "Required id >= 0, but received id = %d", id)); + PADDLE_ENFORCE_GE(id, 0, + platform::errors::PreconditionNotMet( + "Required id >= 0, but received id = %d", id)); } size_t Instruction::Id() const { return id_; } diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 31315df5701e5..64332d7fc90b0 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/new_executor/standalone_executor.h" + #include "paddle/fluid/framework/new_executor/interpretercore_util.h" #include "paddle/fluid/platform/profiler/event_tracing.h" diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 23bd777fae1d5..60d59899549fa 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include #include #include diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc index fdcd19b03098c..6c689c8548b90 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/stream_analyzer.h" + #include namespace paddle { diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.h b/paddle/fluid/framework/new_executor/stream_analyzer.h index 2a276c6f5097a..8a6552c6883c5 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.h +++ b/paddle/fluid/framework/new_executor/stream_analyzer.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_event.h" diff --git a/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt b/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt index 2690b29e01b9d..781ef9a64a253 100644 --- a/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt @@ -1,3 +1,12 @@ -cc_library(workqueue_utils SRCS workqueue_utils.cc events_waiter.cc DEPS enforce glog) -cc_library(workqueue SRCS workqueue.cc DEPS workqueue_utils enforce glog os_info) -cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue) +cc_library( + workqueue_utils + SRCS workqueue_utils.cc events_waiter.cc + DEPS enforce glog) +cc_library( + workqueue + SRCS workqueue.cc + DEPS workqueue_utils enforce glog os_info) +cc_test( + workqueue_test + SRCS workqueue_test.cc + DEPS workqueue) diff --git a/paddle/fluid/framework/new_executor/workqueue/event_count.h b/paddle/fluid/framework/new_executor/workqueue/event_count.h index 7a826c3990713..7c20e12ff1f94 100644 --- a/paddle/fluid/framework/new_executor/workqueue/event_count.h +++ b/paddle/fluid/framework/new_executor/workqueue/event_count.h @@ -54,6 +54,7 @@ #include #include #include + #include "glog/logging.h" namespace paddle { diff --git a/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc b/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc index 346e20d811e84..dbe609427adcf 100644 --- a/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc +++ b/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/workqueue/events_waiter.h" + #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/framework/new_executor/workqueue/events_waiter.h b/paddle/fluid/framework/new_executor/workqueue/events_waiter.h index 9d85f4a27242c..9284ffa853a85 100644 --- a/paddle/fluid/framework/new_executor/workqueue/events_waiter.h +++ b/paddle/fluid/framework/new_executor/workqueue/events_waiter.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/framework/new_executor/workqueue/event_count.h" #include "paddle/fluid/memory/allocation/spin_lock.h" diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index 559eb6a7490cd..20aebfba8e8f8 100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -12,6 +12,7 @@ #include #include #include + #include "glog/logging.h" #include "paddle/fluid/framework/new_executor/workqueue/event_count.h" #include "paddle/fluid/framework/new_executor/workqueue/run_queue.h" diff --git a/paddle/fluid/framework/new_executor/workqueue/run_queue.h b/paddle/fluid/framework/new_executor/workqueue/run_queue.h index 2fc42cf308ab8..7644425a48491 100644 --- a/paddle/fluid/framework/new_executor/workqueue/run_queue.h +++ b/paddle/fluid/framework/new_executor/workqueue/run_queue.h @@ -42,6 +42,7 @@ #include #include #include + #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" #include "paddle/fluid/memory/allocation/spin_lock.h" @@ -76,9 +77,8 @@ class RunQueue { unsigned front = front_.load(std::memory_order_relaxed); Elem* e = &array_[front & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); - if (s != kEmpty || - !e->state.compare_exchange_strong(s, kBusy, - std::memory_order_acquire)) { + if (s != kEmpty || !e->state.compare_exchange_strong( + s, kBusy, std::memory_order_acquire)) { return w; } front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed); @@ -93,9 +93,8 @@ class RunQueue { unsigned front = front_.load(std::memory_order_relaxed); Elem* e = &array_[(front - 1) & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); - if (s != kReady || - !e->state.compare_exchange_strong(s, kBusy, - std::memory_order_acquire)) { + if (s != kReady || !e->state.compare_exchange_strong( + s, kBusy, std::memory_order_acquire)) { return Work(); } Work w = std::move(e->w); @@ -112,9 +111,8 @@ class RunQueue { unsigned back = back_.load(std::memory_order_relaxed); Elem* e = &array_[(back - 1) & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); - if (s != kEmpty || - !e->state.compare_exchange_strong(s, kBusy, - std::memory_order_acquire)) { + if (s != kEmpty || !e->state.compare_exchange_strong( + s, kBusy, std::memory_order_acquire)) { return w; } back = ((back - 1) & kMask2) | (back & ~kMask2); @@ -134,9 +132,8 @@ class RunQueue { unsigned back = back_.load(std::memory_order_relaxed); Elem* e = &array_[back & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); - if (s != kReady || - !e->state.compare_exchange_strong(s, kBusy, - std::memory_order_acquire)) { + if (s != kReady || !e->state.compare_exchange_strong( + s, kBusy, std::memory_order_acquire)) { return Work(); } Work w = std::move(e->w); @@ -163,9 +160,8 @@ class RunQueue { Elem* e = &array_[mid & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); if (n == 0) { - if (s != kReady || - !e->state.compare_exchange_strong(s, kBusy, - std::memory_order_acquire)) + if (s != kReady || !e->state.compare_exchange_strong( + s, kBusy, std::memory_order_acquire)) continue; start = mid; } else { diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc index 0f0de8ef9b05d..b06c540b756da 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc @@ -5,6 +5,7 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #include "paddle/fluid/framework/new_executor/workqueue/workqueue.h" + #include "paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h" #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" #include "paddle/fluid/platform/enforce.h" @@ -64,11 +65,8 @@ class WorkQueueImpl : public WorkQueue { platform::TracerEventType::UserDefined, 10 /*level*/); if (tracker_ != nullptr) { - fn = [ - task = std::move(fn), raii = CounterGuard(tracker_) - ]() mutable { - task(); - }; + fn = [task = std::move(fn), + raii = CounterGuard(tracker_)]() mutable { task(); }; } queue_->AddTask(std::move(fn)); } @@ -158,11 +156,8 @@ void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function fn) { 10 /*level*/); assert(queue_idx < queues_.size()); if (queues_options_.at(queue_idx).track_task) { - fn = [ - task = std::move(fn), raii = CounterGuard(tracker_) - ]() mutable { - task(); - }; + fn = [task = std::move(fn), + raii = CounterGuard(tracker_)]() mutable { task(); }; } queues_[queue_idx]->AddTask(std::move(fn)); } diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.h b/paddle/fluid/framework/new_executor/workqueue/workqueue.h index 2c2576528fe0e..1a1900c56872d 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.h +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -118,10 +119,10 @@ class WorkQueue { std::bind(std::forward(f), std::forward(args)...); std::promise prom; std::future res = prom.get_future(); - AddTask([ - t = std::move(task), - p = FakeCopyable>(std::move(prom)) - ]() mutable { p.Get().set_value(t()); }); + AddTask([t = std::move(task), p = FakeCopyable>( + std::move(prom))]() mutable { + p.Get().set_value(t()); + }); return res; } @@ -158,10 +159,9 @@ class WorkQueueGroup { std::bind(std::forward(f), std::forward(args)...); std::promise prom; std::future res = prom.get_future(); - AddTask(queue_idx, [ - t = std::move(task), - p = FakeCopyable>(std::move(prom)) - ]() mutable { p.Get().set_value(t()); }); + AddTask(queue_idx, [t = std::move(task), + p = FakeCopyable>(std::move( + prom))]() mutable { p.Get().set_value(t()); }); return res; } diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc index 857eaead5b658..3e38d0dbbf9a3 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/workqueue/workqueue.h" + #include #include + #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" @@ -37,10 +39,10 @@ TEST(WorkQueueUtils, TestEventsWaiter) { TEST(WorkQueue, TestSingleThreadedWorkQueue) { VLOG(1) << "In Test"; - using paddle::framework::WorkQueueOptions; - using paddle::framework::WorkQueue; using paddle::framework::CreateSingleThreadedWorkQueue; using paddle::framework::EventsWaiter; + using paddle::framework::WorkQueue; + using paddle::framework::WorkQueueOptions; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kLoopNum = 1000000; @@ -83,10 +85,10 @@ TEST(WorkQueue, TestSingleThreadedWorkQueue) { TEST(WorkQueue, TestMultiThreadedWorkQueue) { VLOG(1) << "In Test"; - using paddle::framework::WorkQueueOptions; - using paddle::framework::WorkQueue; using paddle::framework::CreateMultiThreadedWorkQueue; using paddle::framework::EventsWaiter; + using paddle::framework::WorkQueue; + using paddle::framework::WorkQueueOptions; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kExternalLoopNum = 100; @@ -136,10 +138,10 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) { } TEST(WorkQueue, TestWorkQueueGroup) { - using paddle::framework::WorkQueueOptions; - using paddle::framework::WorkQueueGroup; using paddle::framework::CreateWorkQueueGroup; using paddle::framework::EventsWaiter; + using paddle::framework::WorkQueueGroup; + using paddle::framework::WorkQueueOptions; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kExternalLoopNum = 100; diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc index 82dcbbd509dd5..152f89d9ef0b5 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" + #include #include diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h index b6e6ede8c334f..380746c05d604 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h @@ -21,6 +21,7 @@ #include #include #include + #include "paddle/fluid/framework/new_executor/workqueue/events_waiter.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference.cc b/paddle/fluid/framework/no_need_buffer_vars_inference.cc index 25f64838c6d39..665c9b811faee 100644 --- a/paddle/fluid/framework/no_need_buffer_vars_inference.cc +++ b/paddle/fluid/framework/no_need_buffer_vars_inference.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" + #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/imperative/saved_variable_wrapper_list.h" diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference_test.cc b/paddle/fluid/framework/no_need_buffer_vars_inference_test.cc index a92d52fd2e9ea..a2c7df763a7ef 100644 --- a/paddle/fluid/framework/no_need_buffer_vars_inference_test.cc +++ b/paddle/fluid/framework/no_need_buffer_vars_inference_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" + #include "gtest/gtest.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/imperative/layer.h" diff --git a/paddle/fluid/framework/op_def_api.cc b/paddle/fluid/framework/op_def_api.cc index 73f1409ae690e..b62f17987e651 100644 --- a/paddle/fluid/framework/op_def_api.cc +++ b/paddle/fluid/framework/op_def_api.cc @@ -17,6 +17,7 @@ #define _LINUX #endif #include "paddle/fluid/framework/op_def_api.h" + #include #include #include @@ -28,6 +29,7 @@ #endif #include #include + #include "glog/logging.h" #include "paddle/fluid/framework/op_def.pb.h" diff --git a/paddle/fluid/framework/op_def_api.h b/paddle/fluid/framework/op_def_api.h index 1ef2254d0da36..754b76663df1a 100644 --- a/paddle/fluid/framework/op_def_api.h +++ b/paddle/fluid/framework/op_def_api.h @@ -21,5 +21,5 @@ namespace framework { const proto::OpDef& GetOpDef(const std::string& op_name); bool HasOpDef(const std::string& op_name); -} -} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 87d3a048d0be0..db2a411da0086 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -495,8 +495,9 @@ bool OpDesc::HasProtoAttr(const std::string &name) const { proto::AttrType OpDesc::GetAttrType(const std::string &name) const { auto it = attrs_.find(name); - PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound( - "Attribute %s is not found.", name)); + PADDLE_ENFORCE_NE( + it, attrs_.end(), + platform::errors::NotFound("Attribute %s is not found.", name)); return static_cast(it->second.which() - 1); } @@ -599,8 +600,9 @@ void OpDesc::SetAttrMap( Attribute OpDesc::GetAttr(const std::string &name) const { auto it = attrs_.find(name); - PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound( - "Attribute %s is not found.", name)); + PADDLE_ENFORCE_NE( + it, attrs_.end(), + platform::errors::NotFound("Attribute %s is not found.", name)); return it->second; } @@ -854,10 +856,11 @@ bool CompileTimeInferShapeContext::HasInput(const std::string &name) const { if (length == 0) { return false; } - PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument( - "Input(%s) should have only one value, " - "but it has %d values now.", - name, length)); + PADDLE_ENFORCE_EQ( + length, 1UL, + platform::errors::InvalidArgument("Input(%s) should have only one value, " + "but it has %d values now.", + name, length)); return block_.HasVarRecursive(input_names[0]); } @@ -870,10 +873,11 @@ bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const { if (length == 0) { return false; } - PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument( - "Output(%s) should have only one value, " - "but it has %d values now.", - name, length)); + PADDLE_ENFORCE_EQ(length, 1UL, + platform::errors::InvalidArgument( + "Output(%s) should have only one value, " + "but it has %d values now.", + name, length)); return block_.HasVarRecursive(output_names[0]); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 903ee73b2c013..51aeed2e5d734 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "glog/logging.h" #include "paddle/fluid/framework/attribute.h" namespace paddle { diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc index 889b6b0c86b2f..8b77b1d260c42 100644 --- a/paddle/fluid/framework/op_registry_test.cc +++ b/paddle/fluid/framework/op_registry_test.cc @@ -12,11 +12,11 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/op_registry.h" + #include #include -#include "paddle/fluid/framework/op_registry.h" - namespace pd = paddle::framework; namespace paddle { @@ -58,8 +58,9 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { AddInput("input", "input of cosine op").AsDuplicable(); AddOutput("output", "output of cosine op").AsIntermediate(); auto my_checker = [](int i) { - PADDLE_ENFORCE_EQ(i % 2, 0, platform::errors::InvalidArgument( - "'test_attr' must be even!")); + PADDLE_ENFORCE_EQ( + i % 2, 0, + platform::errors::InvalidArgument("'test_attr' must be even!")); }; AddAttr("test_attr", "a simple test attribute") .AddCustomChecker(my_checker); diff --git a/paddle/fluid/framework/op_version_proto.h b/paddle/fluid/framework/op_version_proto.h index 9b70bb93bb967..022531d53de1c 100644 --- a/paddle/fluid/framework/op_version_proto.h +++ b/paddle/fluid/framework/op_version_proto.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc index e66d0dc5a1f79..8f83631c272ee 100644 --- a/paddle/fluid/framework/op_version_registry_test.cc +++ b/paddle/fluid/framework/op_version_registry_test.cc @@ -12,10 +12,10 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "paddle/fluid/framework/op_version_registry.h" +#include + namespace paddle { namespace framework { namespace compatible { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 69f14d7903c0b..7395a8e0da8e8 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include + #include #include @@ -1205,10 +1206,11 @@ bool OperatorWithKernel::SupportsMKLDNN( const proto::VarType::Type data_type) const { auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_); if (op_kernel_iter == OperatorWithKernel::AllOpKernels().end()) { - VLOG(6) << "Warning: " << type_ << " don't find its MKLDNN Kernel in Fluid " - "Registered Kernels. And We don't " - "search its kernels in phi lib, " - "SupportsMKLDNN() return false."; + VLOG(6) << "Warning: " << type_ + << " don't find its MKLDNN Kernel in Fluid " + "Registered Kernels. And We don't " + "search its kernels in phi lib, " + "SupportsMKLDNN() return false."; return false; } auto& op_kernels = op_kernel_iter->second; @@ -1440,7 +1442,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #if defined(PADDLE_WITH_XPU_KP) && (!is_xpu_unsupport || use_phi_xpu_kp) #endif - ) { + ) { run_phi_kernel_ = true; } else { auto& all_op_kernels = AllOpKernels(); @@ -1464,7 +1466,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #if defined(PADDLE_WITH_XPU_KP) || (is_xpu_unsupport && !is_xpu_kp_support) #endif - ) { + ) { auto pt_cpu_kernel_key = FallBackToCpu(*kernel_type_.get(), pt_kernel_key, *this); pt_kernel_.reset( @@ -2238,8 +2240,9 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( if (arg_map_fn) { arg_map_fn_.reset(new phi::ArgumentMappingFn(*arg_map_fn)); } else { - auto func = [this]( - const phi::ArgumentMappingContext& ctx) -> phi::KernelSignature { + auto func = + [this]( + const phi::ArgumentMappingContext& ctx) -> phi::KernelSignature { return phi::DefaultKernelSignatureMap::Instance().Get(type_); }; arg_map_fn_.reset(new phi::ArgumentMappingFn(func)); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 2efa2e4bd8a75..dc13287b5aad3 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -27,6 +27,7 @@ limitations under the License. */ #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" @@ -38,12 +39,10 @@ limitations under the License. */ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/variant.h" -#include "paddle/utils/flat_hash_map.h" - -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/core/kernel_factory.h" +#include "paddle/utils/flat_hash_map.h" namespace paddle { namespace framework { @@ -610,12 +609,12 @@ class OperatorWithKernel : public OperatorBase { /* member functions for adapting to phi lib */ /** In the Tensor calculation library, the new Kernel adopts a clearer and - * more streamlined design. The arguments of the Kernel and the input and - * output arguments registered in the original OpMaker do not match in some - * cases, so we use map to record the arguments required by the kernel. - * When selecting Kernel during Op execution, select the arguments of the - * original Op according to the GetExpectedPhiKernelArgs returned arguments. - */ + * more streamlined design. The arguments of the Kernel and the input and + * output arguments registered in the original OpMaker do not match in some + * cases, so we use map to record the arguments required by the kernel. + * When selecting Kernel during Op execution, select the arguments of the + * original Op according to the GetExpectedPhiKernelArgs returned arguments. + */ phi::KernelSignature GetExpectedPhiKernelArgs( const ExecutionContext& ctx) const; diff --git a/paddle/fluid/framework/operator_exception_test.cc b/paddle/fluid/framework/operator_exception_test.cc index 7b513996fb40e..0f635d170de2f 100644 --- a/paddle/fluid/framework/operator_exception_test.cc +++ b/paddle/fluid/framework/operator_exception_test.cc @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/operator.h" #include #include #include #include + #include "gtest/gtest.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h index ab812a30981f0..57d377f1389cf 100644 --- a/paddle/fluid/framework/operator_kernel_configs.h +++ b/paddle/fluid/framework/operator_kernel_configs.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "glog/logging.h" namespace paddle { diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index 24e09bcd463dc..3dda60de12ad4 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "gtest/gtest.h" +#include "paddle/fluid/framework/operator.h" +#include "gtest/gtest.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/init.h" diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt index 75e258d14764c..7cb9cf254fb1a 100644 --- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -1,29 +1,85 @@ -cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper lod_tensor proto_desc) -cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector graph_pattern_detector cinn_compiler errors enforce) -cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn) -cc_library(transform_type SRCS transform_type.cc DEPS errors enforce cinn) -cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph transform_desc cinn) -cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn cinn_launch_context) +cc_library( + cinn_cache_key + SRCS cinn_cache_key.cc + DEPS boost graph graph_helper lod_tensor proto_desc) +cc_library( + build_cinn_pass + SRCS build_cinn_pass.cc + DEPS pass subgraph_detector graph_pattern_detector cinn_compiler errors + enforce) +cc_library( + transform_desc + SRCS transform_desc.cc + DEPS proto_desc cinn) +cc_library( + transform_type + SRCS transform_type.cc + DEPS errors enforce cinn) +cc_library( + cinn_graph_symbolization + SRCS cinn_graph_symbolization.cc + DEPS lod_tensor graph transform_desc cinn) +cc_library( + cinn_compiler + SRCS cinn_compiler.cc + DEPS framework_proto + graph + lod_tensor + cinn_cache_key + cinn_graph_symbolization + cinn + cinn_launch_context) -if (WITH_TESTING) - cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn) +if(WITH_TESTING) + cc_test( + cinn_lib_test + SRCS cinn_lib_test.cc + DEPS cinn) set_tests_properties(cinn_lib_test PROPERTIES LABELS "RUN_TYPE=CINN") - cc_test(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS cinn_cache_key) + cc_test( + cinn_cache_key_test + SRCS cinn_cache_key_test.cc + DEPS cinn_cache_key) set_tests_properties(cinn_cache_key_test PROPERTIES LABELS "RUN_TYPE=CINN") - cc_test(build_cinn_pass_test SRCS build_cinn_pass_test.cc DEPS build_cinn_pass cinn_compiler op_registry mul_op activation_op elementwise_add_op) + cc_test( + build_cinn_pass_test + SRCS build_cinn_pass_test.cc + DEPS build_cinn_pass cinn_compiler op_registry mul_op activation_op + elementwise_add_op) set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN") - cc_test(transform_desc_test SRCS transform_desc_test.cc DEPS transform_desc) + cc_test( + transform_desc_test + SRCS transform_desc_test.cc + DEPS transform_desc) set_tests_properties(transform_desc_test PROPERTIES LABELS "RUN_TYPE=CINN") - cc_test(transform_type_test SRCS transform_type_test.cc DEPS transform_type) + cc_test( + transform_type_test + SRCS transform_type_test.cc + DEPS transform_type) set_tests_properties(transform_type_test PROPERTIES LABELS "RUN_TYPE=CINN") - cc_test(cinn_graph_symbolization_test SRCS cinn_graph_symbolization_test.cc DEPS cinn_graph_symbolization) - set_tests_properties(cinn_graph_symbolization_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test( + cinn_graph_symbolization_test + SRCS cinn_graph_symbolization_test.cc + DEPS cinn_graph_symbolization) + set_tests_properties(cinn_graph_symbolization_test PROPERTIES LABELS + "RUN_TYPE=CINN") - cc_test(cinn_compiler_test SRCS cinn_compiler_test.cc DEPS cinn_compiler place proto_desc graph_viz_pass build_cinn_pass cinn mul_op activation_op elementwise_add_op) + cc_test( + cinn_compiler_test + SRCS cinn_compiler_test.cc + DEPS cinn_compiler + place + proto_desc + graph_viz_pass + build_cinn_pass + cinn + mul_op + activation_op + elementwise_add_op) set_tests_properties(cinn_compiler_test PROPERTIES LABELS "RUN_TYPE=CINN") endif() diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index 295510cdb1cf2..a2bdd2bc4c105 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -334,7 +334,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, } GraphNodeSet need_feed_vars; - std::unordered_set param_vars, output_vars; + std::unordered_set param_vars, output_vars; // the subgraph is independently, so here we only need link // to the node in new subgraph, and discard the link to // out-graph. @@ -386,18 +386,18 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, subgraph.get()); // Save lists of input variables, internal variables and output variables // of the cluster as attributes of the subgraph for convenience. - auto collect_names_fn = []( - const GraphNodeSet& nodes, - const std::unordered_set& ignore_names) { - auto result = std::make_unique>(); - for (auto* node : nodes) { - if (!node->Var() || ignore_names.count(node->Name())) { - continue; - } - result->emplace_back(node->Name()); - } - return result; - }; + auto collect_names_fn = + [](const GraphNodeSet& nodes, + const std::unordered_set& ignore_names) { + auto result = std::make_unique>(); + for (auto* node : nodes) { + if (!node->Var() || ignore_names.count(node->Name())) { + continue; + } + result->emplace_back(node->Name()); + } + return result; + }; subgraph->Set>( kInternalVars, collect_names_fn(cluster_internals, {}).release()); subgraph->Set>( diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index d593aadc02c73..e9c517af2c395 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc index 9b5ce876c256f..585f9edce868a 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc @@ -100,7 +100,7 @@ size_t CinnCacheKeyByStructure::HashGraph(const ir::Graph& graph) { // graph.Nodes() return unordered_set, here using set to avoid the same graph // may return different result - std::set node_set(compare), + std::set node_set(compare), output_set(compare); node_set.insert(graph.Nodes().begin(), graph.Nodes().end()); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc index 1ebeecbff954a..24e65599018fa 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// clang-format off #include #include @@ -21,6 +22,7 @@ #include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/phi/core/ddim.h" +// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 12f603542066f..2a6a51d73f2b8 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -51,14 +51,14 @@ namespace paddle { namespace framework { namespace paddle2cinn { -using ir::Graph; -using ir::Node; -using inference::analysis::Dot; using ::cinn::auto_schedule::AutoTuner; using ::cinn::common::Target; using ::cinn::frontend::Optimize; using ::cinn::hlir::framework::BuildScope; using ::cinn::hlir::framework::GraphCompiler; +using inference::analysis::Dot; +using ir::Graph; +using ir::Node; CinnCompiler* CinnCompiler::GetInstance() { static CinnCompiler* instance = new CinnCompiler(); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index a38e8b4c5f674..91c559767642a 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index 255e318c9fa69..5a84a97ee8da7 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -44,8 +44,8 @@ DECLARE_string(deny_cinn_ops); namespace paddle { namespace framework { namespace paddle2cinn { -using ir::Graph; using ::cinn::common::Target; +using ir::Graph; namespace { template > diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc index 31bf8d9b726d8..4e362057c915f 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +// clang-format off #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" #include @@ -30,6 +31,7 @@ limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" +// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h index 526eb65a56ede..4155147da4b8f 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +// clang-format off #include #include #include @@ -26,6 +27,7 @@ limitations under the License. */ #include "cinn/frontend/net_builder.h" #include "cinn/frontend/op_mapper_registry.h" +// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc index c0e1ca8f0d123..8a6f92a6f45d0 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc @@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +// clang-format off #include "gtest/gtest.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" +// clang-format on namespace paddle { namespace framework { namespace paddle2cinn { +using ::cinn::frontend::NetBuilder; using ir::Graph; using ir::Node; -using ::cinn::frontend::NetBuilder; using CinnTensor = ::cinn::hlir::framework::Tensor; using OpMapperContext = CinnGraphSymbolization::OpMapperContext; using CinnOpDesc = CinnGraphSymbolization::CinnOpDesc; diff --git a/paddle/fluid/framework/paddle2cinn/transform_desc.h b/paddle/fluid/framework/paddle2cinn/transform_desc.h index 76a4f812730df..6f0931b6d038d 100644 --- a/paddle/fluid/framework/paddle2cinn/transform_desc.h +++ b/paddle/fluid/framework/paddle2cinn/transform_desc.h @@ -14,6 +14,8 @@ #pragma once +// The headers cant be sorted by clang-format or compilint error occurs. +// clang-format off #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" @@ -24,6 +26,7 @@ #include "cinn/frontend/paddle/cpp/op_desc.h" #include "cinn/frontend/paddle/cpp/program_desc.h" #include "cinn/frontend/paddle/cpp/var_desc.h" +// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc b/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc index ba324295cad72..ae9f51c3f6790 100644 --- a/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc +++ b/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +// clang-format off #include #include "gtest/gtest.h" #include "paddle/fluid/framework/paddle2cinn/transform_desc.h" +// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/transform_type.cc b/paddle/fluid/framework/paddle2cinn/transform_type.cc index 0e348084d254e..60502edd99acf 100644 --- a/paddle/fluid/framework/paddle2cinn/transform_type.cc +++ b/paddle/fluid/framework/paddle2cinn/transform_type.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/paddle2cinn/transform_type.h" + #include "cinn/common/type.h" #include "cinn/runtime/cinn_runtime.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/paddle2cinn/transform_type.h b/paddle/fluid/framework/paddle2cinn/transform_type.h index e44960abbd98d..f0b08ba1e00a4 100644 --- a/paddle/fluid/framework/paddle2cinn/transform_type.h +++ b/paddle/fluid/framework/paddle2cinn/transform_type.h @@ -19,7 +19,7 @@ struct cinn_type_t; namespace cinn::common { struct Type; -} // ::cinn::common +} // namespace cinn::common namespace paddle::framework::paddle2cinn { diff --git a/paddle/fluid/framework/paddle2cinn/transform_type_test.cc b/paddle/fluid/framework/paddle2cinn/transform_type_test.cc index 6c5d360d34cdd..4456642b3e9a0 100644 --- a/paddle/fluid/framework/paddle2cinn/transform_type_test.cc +++ b/paddle/fluid/framework/paddle2cinn/transform_type_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/paddle2cinn/transform_type.h" + #include "cinn/common/type.h" #include "cinn/runtime/cinn_runtime.h" #include "gtest/gtest.h" diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b088a535a1232..00d48098a13f6 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -666,8 +666,9 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { PADDLE_ENFORCE_EQ(places.size() > 0 && !platform::is_npu_place(places[0]), - true, platform::errors::Unavailable( - "NPU is not supported in ParallelExecutor.")); + true, + platform::errors::Unavailable( + "NPU is not supported in ParallelExecutor.")); InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 18d0ee78ffbbc..3dc9fbcfbf312 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -42,9 +42,9 @@ namespace framework { class ParallelExecutorPrivate; -using details::VariableInfo; using details::BuildStrategy; using details::ExecutionStrategy; +using details::VariableInfo; namespace p = paddle::platform; using DeviceType = paddle::platform::DeviceType; diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 3eda00006f959..19f7b024b27f2 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/phi_utils.h" + #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/phi_utils.h" - #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/selected_rows_utils.h" diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 785ede5c60175..535672f2e1288 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -21,11 +21,10 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" - -#include "paddle/fluid/framework/operator.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/core/compat/arg_map_context.h" diff --git a/paddle/fluid/framework/phi_utils_test.cc b/paddle/fluid/framework/phi_utils_test.cc index cbcdf24c9f32b..02eb23f8ac17b 100644 --- a/paddle/fluid/framework/phi_utils_test.cc +++ b/paddle/fluid/framework/phi_utils_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/phi_utils.h" + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows_utils.h" diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 4a31adcca65ec..88738255af78e 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" + #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/version.h" diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index 4ceb0c5c82481..7e1c12f4ac5b1 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/framework/program_processing.cc b/paddle/fluid/framework/program_processing.cc index 3bcf6f8f3855f..95b28b79dcf36 100644 --- a/paddle/fluid/framework/program_processing.cc +++ b/paddle/fluid/framework/program_processing.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/program_processing.h" + #include "paddle/fluid/framework/block_desc.h" namespace paddle { diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc index 4c95f01ae569f..fbeedcc311ac7 100644 --- a/paddle/fluid/framework/prune.cc +++ b/paddle/fluid/framework/prune.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc index 64b30878150d0..5fbfda716b437 100644 --- a/paddle/fluid/framework/prune_test.cc +++ b/paddle/fluid/framework/prune_test.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/prune.h" #include + #include #include "paddle/fluid/framework/block_desc.h" diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index aec40a5a7ebdd..c86bfbc43bfb9 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include #include diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index a12079a135dbd..7a0fe65182d13 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/device_worker.h" namespace phi { diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index b418339bf3296..27940f726dca1 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/reader.h" + #include namespace paddle { diff --git a/paddle/fluid/framework/save_load_util.cc b/paddle/fluid/framework/save_load_util.cc index 44488fca01c02..284965fdfe9a8 100644 --- a/paddle/fluid/framework/save_load_util.cc +++ b/paddle/fluid/framework/save_load_util.cc @@ -342,8 +342,9 @@ bool LoadTensorFromDisk( uint32_t version; fin.read(reinterpret_cast(&version), sizeof(version)); CheckInStreamState(fin, sizeof(version)); - PADDLE_ENFORCE_EQ(version, 0U, platform::errors::InvalidArgument( - "Only version 0 tensor is supported.")); + PADDLE_ENFORCE_EQ(version, 0U, + platform::errors::InvalidArgument( + "Only version 0 tensor is supported.")); proto::VarType::TensorDesc desc; { // int32_t size diff --git a/paddle/fluid/framework/save_load_util_test.cc b/paddle/fluid/framework/save_load_util_test.cc index 10a34d7ce91ad..623f0f27bdaa2 100644 --- a/paddle/fluid/framework/save_load_util_test.cc +++ b/paddle/fluid/framework/save_load_util_test.cc @@ -11,11 +11,12 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/save_load_util.h" + #include #include #include "gtest/gtest.h" -#include "paddle/fluid/framework/save_load_util.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/scope_guard.h b/paddle/fluid/framework/scope_guard.h index 83387842e94ef..9c741f7bfc573 100644 --- a/paddle/fluid/framework/scope_guard.h +++ b/paddle/fluid/framework/scope_guard.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/platform/macros.h" namespace paddle { @@ -41,12 +42,12 @@ class ScopeGuard { #define _PADDLE_CONCAT_TOKEN(x, y) x##y #define PADDLE_CONCAT_TOKEN(x, y) _PADDLE_CONCAT_TOKEN(x, y) -#define DEFINE_PADDLE_SCOPE_GUARD(...) \ - auto PADDLE_CONCAT_TOKEN(__scope_guard_func, __LINE__) = __VA_ARGS__; \ - ::paddle::framework::ScopeGuard::type> \ - PADDLE_CONCAT_TOKEN(__scope_guard, __LINE__)( \ - PADDLE_CONCAT_TOKEN(__scope_guard_func, __LINE__)) +#define DEFINE_PADDLE_SCOPE_GUARD(...) \ + auto PADDLE_CONCAT_TOKEN(__scope_guard_func, __LINE__) = __VA_ARGS__; \ + ::paddle::framework::ScopeGuard::type> \ + PADDLE_CONCAT_TOKEN(__scope_guard, __LINE__)( \ + PADDLE_CONCAT_TOKEN(__scope_guard_func, __LINE__)) } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/scope_guard_test.cc b/paddle/fluid/framework/scope_guard_test.cc index d7a7a6168a368..793b3a1652a1c 100644 --- a/paddle/fluid/framework/scope_guard_test.cc +++ b/paddle/fluid/framework/scope_guard_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/scope_guard.h" + #include "gtest/gtest.h" namespace paddle { diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 1f821720d64d2..7bb8550926d63 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -12,6 +12,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_ASCEND_CL) #include + #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/framework/selected_rows_utils.h b/paddle/fluid/framework/selected_rows_utils.h index 8606295c45199..9ecff5719fb91 100644 --- a/paddle/fluid/framework/selected_rows_utils.h +++ b/paddle/fluid/framework/selected_rows_utils.h @@ -21,10 +21,9 @@ limitations under the License. */ #include #include -#include "paddle/phi/core/selected_rows.h" - #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/selected_rows.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/selected_rows_utils_test.cc b/paddle/fluid/framework/selected_rows_utils_test.cc index f23510c721e24..db2c6c1f991b7 100644 --- a/paddle/fluid/framework/selected_rows_utils_test.cc +++ b/paddle/fluid/framework/selected_rows_utils_test.cc @@ -9,11 +9,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/selected_rows_utils.h" + #include + #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/framework/selected_rows_utils.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/string_array.cc b/paddle/fluid/framework/string_array.cc old mode 100755 new mode 100644 index 3071e6bf4cff3..f6aee9b82f2c6 --- a/paddle/fluid/framework/string_array.cc +++ b/paddle/fluid/framework/string_array.cc @@ -12,12 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/string_array.h" + #include #include #include "glog/logging.h" -#include "paddle/fluid/framework/string_array.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 57eddf782f06b..7ad9839d79dca 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -23,15 +23,14 @@ limitations under the License. */ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/stream.h" - -#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/stream.h" namespace paddle { diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index f5e230773fb2f..946b119ecb39f 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -24,12 +24,13 @@ namespace framework { inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { int rank = src.dims().size(); PADDLE_ENFORCE_GE( - rank, 2, platform::errors::InvalidArgument( - "'ReshapeToMatrix()' is only used for flatten high rank " - "tensors to matrixs. The dimensions of Tensor must be " - "greater or equal than 2. " - "But received dimensions of Tensor is %d", - rank)); + rank, 2, + platform::errors::InvalidArgument( + "'ReshapeToMatrix()' is only used for flatten high rank " + "tensors to matrixs. The dimensions of Tensor must be " + "greater or equal than 2. " + "But received dimensions of Tensor is %d", + rank)); if (rank == 2) { return src; } diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 3e104807535e9..05dd41eb6ffc5 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/tensor.h" #include + #include namespace framework = paddle::framework; diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 1159280762f5a..1e25acb2c4ecb 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/tensor_util.h" + #include #include #include @@ -21,10 +23,8 @@ limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/profiler/event_tracing.h" - #include "paddle/phi/core/dense_tensor.h" #ifdef PADDLE_WITH_MKLDNN @@ -1249,10 +1249,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor, // proto buffer int32_t size = -1; is.read(reinterpret_cast(&size), sizeof(size)); - PADDLE_ENFORCE_EQ(is.good(), true, platform::errors::Unavailable( - "Cannot read tensor desc size")); - PADDLE_ENFORCE_GE(size, 0, platform::errors::InvalidArgument( - "Tensor desc size should >= 0")); + PADDLE_ENFORCE_EQ( + is.good(), true, + platform::errors::Unavailable("Cannot read tensor desc size")); + PADDLE_ENFORCE_GE( + size, 0, + platform::errors::InvalidArgument("Tensor desc size should >= 0")); std::unique_ptr buf(new char[size]); is.read(reinterpret_cast(buf.get()), size); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 5e6e1227b1aac..2511fdf27ce69 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/tensor_util.h" #include + #include namespace paddle { @@ -254,64 +255,61 @@ TEST(TensorToVector, Tensor) { #endif } -TEST(TensorToVector, Tensor_bool) { - { - paddle::framework::Tensor src; - bool* src_ptr = - src.mutable_data({3, 3}, paddle::platform::CPUPlace()); - for (int i = 0; i < 3 * 3; ++i) { - src_ptr[i] = static_cast(i % 2); - } +TEST(TensorToVector, Tensor_bool){{paddle::framework::Tensor src; +bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); +for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); +} - paddle::platform::CPUPlace place; - std::vector dst; - paddle::framework::TensorToVector(src, &dst); +paddle::platform::CPUPlace place; +std::vector dst; +paddle::framework::TensorToVector(src, &dst); - for (int i = 0; i < 3 * 3; ++i) { - EXPECT_EQ(src_ptr[i], dst[i]); - } - } +for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); +} +} // namespace framework #ifdef PADDLE_WITH_CUDA - { - std::vector src_vec = { - false, true, false, true, false, true, false, true, false, - }; - paddle::framework::Tensor gpu_tensor; - paddle::platform::CUDAPlace place; - paddle::platform::CUDADeviceContext gpu_ctx(place); - gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(place, gpu_ctx.stream()) - .get()); - gpu_ctx.PartialInitWithAllocator(); - paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); - - std::vector dst; - paddle::framework::TensorToVector(gpu_tensor, gpu_ctx, &dst); - - for (int i = 0; i < 3 * 3; ++i) { - EXPECT_EQ(src_vec[i], dst[i]); - } +{ + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor gpu_tensor; + paddle::platform::CUDAPlace place; + paddle::platform::CUDADeviceContext gpu_ctx(place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); } +} #endif #ifdef PADDLE_WITH_ASCEND_CL - { - std::vector src_vec = { - false, true, false, true, false, true, false, true, false, - }; - paddle::framework::Tensor npu_tensor; - paddle::platform::NPUPlace place(0); - paddle::platform::NPUDeviceContext npu_ctx(place); - paddle::framework::TensorFromVector(src_vec, npu_ctx, &npu_tensor); - - std::vector dst; - paddle::framework::TensorToVector(npu_tensor, npu_ctx, &dst); - - for (int i = 0; i < 3 * 3; ++i) { - EXPECT_EQ(src_vec[i], dst[i]); - } +{ + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor npu_tensor; + paddle::platform::NPUPlace place(0); + paddle::platform::NPUDeviceContext npu_ctx(place); + paddle::framework::TensorFromVector(src_vec, npu_ctx, &npu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(npu_tensor, npu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); } -#endif } +#endif +} // namespace paddle TEST(TensorFromDLPack, Tensor) { { diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 33533b1d10feb..b704ac4329dc8 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -43,8 +43,9 @@ void ThreadPool::Init() { num_threads = FLAGS_dist_threadpool_size; VLOG(1) << "set dist_threadpool_size to " << num_threads; } - PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument( - "The number of threads is 0.")); + PADDLE_ENFORCE_GT( + num_threads, 0, + platform::errors::InvalidArgument("The number of threads is 0.")); threadpool_.reset(new ThreadPool(num_threads)); } } diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc index 1278a0f0643f4..0b6e12967fe1b 100644 --- a/paddle/fluid/framework/threadpool_test.cc +++ b/paddle/fluid/framework/threadpool_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" + #include + #include namespace framework = paddle::framework; diff --git a/paddle/fluid/framework/trainer.cc b/paddle/fluid/framework/trainer.cc index b033f9a99d6d9..dc48a8f8d8f2f 100644 --- a/paddle/fluid/framework/trainer.cc +++ b/paddle/fluid/framework/trainer.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/trainer.h" + #include "io/fs.h" namespace paddle { diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index 1f1122d32f5c3..48ea9143d621a 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/trainer_factory.h" #include + #include #include diff --git a/paddle/fluid/framework/trainer_test.cc b/paddle/fluid/framework/trainer_test.cc index f689679d48696..1f4a162f90616 100644 --- a/paddle/fluid/framework/trainer_test.cc +++ b/paddle/fluid/framework/trainer_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/trainer.h" + #include namespace paddle { @@ -23,5 +24,5 @@ TEST() { // create dataset // train for a while } -} -} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 0937d96ad4c20..5feedb2c3d670 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -21,6 +21,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/variant.h" #include "paddle/utils/small_vector.h" diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc index 2f03dc41ce002..43c44ff525fca 100644 --- a/paddle/fluid/framework/unused_var_check.cc +++ b/paddle/fluid/framework/unused_var_check.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/unused_var_check.h" #include + #include #include "gflags/gflags.h" diff --git a/paddle/fluid/framework/unused_var_check.h b/paddle/fluid/framework/unused_var_check.h index 95f6917fbcde7..cc4977e439c4c 100644 --- a/paddle/fluid/framework/unused_var_check.h +++ b/paddle/fluid/framework/unused_var_check.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index 0a24efd003bcf..3a3edc9b4c64e 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -318,18 +318,20 @@ void VarDesc::SetAttr(const std::string &name, const Attribute &v) { bool valid = attr_type == proto::AttrType::INT || attr_type == proto::AttrType::STRING || attr_type == proto::AttrType::INTS; - PADDLE_ENFORCE_EQ(valid, true, platform::errors::InvalidArgument( - "The value for attr (%s) must be " - "one of list or int or string.", - name)); + PADDLE_ENFORCE_EQ( + valid, true, + platform::errors::InvalidArgument("The value for attr (%s) must be " + "one of list or int or string.", + name)); this->attrs_[name] = v; } Attribute VarDesc::GetAttr(const std::string &name) const { auto it = attrs_.find(name); - PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound( - "Attribute %s is not found.", name)); + PADDLE_ENFORCE_NE( + it, attrs_.end(), + platform::errors::NotFound("Attribute %s is not found.", name)); return it->second; } diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index 5483ef01c0844..ce489a57a019e 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -245,9 +245,12 @@ TEST(InferVarType, multiple_api) { ASSERT_ANY_THROW(infer.SetDataTypes(&ctx, "test2_a_out", {})); ASSERT_EQ(0u, infer.GetShape(&ctx, "test2_a_out").size()); - infer.SetShape(&ctx, "test2_a_out", { - 1, 3, 3, - }); + infer.SetShape(&ctx, "test2_a_out", + { + 1, + 3, + 3, + }); ASSERT_EQ(3u, infer.GetShape(&ctx, "test2_a_out").size()); ASSERT_EQ(0, infer.GetLoDLevel(&ctx, "test2_a_out")); diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 401ccb03d78d6..345928666bd52 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/var_type_traits.h" + #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" @@ -25,6 +26,7 @@ #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #include + #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h" #endif @@ -41,6 +43,8 @@ #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif +#include "paddle/fluid/operators/cuda_graph_with_in_out.h" + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 9fe67e1dcdff3..463331494d908 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -87,6 +87,8 @@ namespace operators { class CudnnRNNCache; +class CUDAGraphWithInOuts; + namespace reader { class LoDTensorBlockingQueueHolder; class OrderedMultiDeviceLoDTensorBlockingQueueHolder; @@ -189,7 +191,8 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #if defined(PADDLE_WITH_CNCL) cnclCliqueId, #endif - int, float, Vocab>; + std::vector>, int, float, + Vocab>; template struct VarTypeTrait { static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index 00ae5154f83ab..4a81f66948de3 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -12,13 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/var_type_traits.h" + #include #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows_utils.h" -#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_NCCL) diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc index 92042e4725986..c01bef79cdccd 100644 --- a/paddle/fluid/framework/version.cc +++ b/paddle/fluid/framework/version.cc @@ -24,7 +24,7 @@ bool IsProgramVersionSupported(int64_t version) { * new version. The compatibility judgment cannot be made only * by the version number. Please do not use this interface, * it may be discarded because backward compatibility. - */ + */ return true; } @@ -33,7 +33,7 @@ bool IsTensorVersionSupported(uint32_t version) { * new version. The compatibility judgment cannot be made only * by the version number. Please do not use this interface, * it may be discarded because backward compatibility. - */ + */ return true; } diff --git a/paddle/fluid/framework/version_test.cc b/paddle/fluid/framework/version_test.cc index ec5a340ee6ef3..7c52209981ff9 100644 --- a/paddle/fluid/framework/version_test.cc +++ b/paddle/fluid/framework/version_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/version.h" + #include "gtest/gtest.h" namespace paddle { diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 92af1901b71ab..eaf0a09541d77 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,65 +1,214 @@ -cc_library(imperative_flag SRCS flags.cc DEPS gflags flags) -cc_library(var_helper SRCS var_helper.cc DEPS tensor phi_api) -IF(WITH_XPU) -cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils phi_api phi_utils var_helper) -ELSE() -cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils phi_api phi_utils var_helper) -ENDIF() -cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry var_helper phi_api) +cc_library( + imperative_flag + SRCS flags.cc + DEPS gflags flags) +cc_library( + var_helper + SRCS var_helper.cc + DEPS tensor phi_api) +if(WITH_XPU) + cc_library( + prepared_operator + SRCS prepared_operator.cc + DEPS xpu_op_list + proto_desc + operator + device_context + lod_tensor + selected_rows_utils + var_type_traits + op_kernel_type + data_transform + nan_inf_utils + phi_api + phi_utils + var_helper) +else() + cc_library( + prepared_operator + SRCS prepared_operator.cc + DEPS proto_desc + operator + device_context + lod_tensor + selected_rows_utils + var_type_traits + op_kernel_type + data_transform + nan_inf_utils + phi_api + phi_utils + var_helper) +endif() +cc_library( + layer + SRCS layer.cc + DEPS prepared_operator + math_function + imperative_flag + variable_helper + op_registry + var_helper + phi_api) add_subdirectory(jit) -if (WITH_GPU) -cc_library(layout_autotune SRCS layout_autotune.cc DEPS op_info phi_gpu_info) +if(WITH_GPU) + cc_library( + layout_autotune + SRCS layout_autotune.cc + DEPS op_info phi_gpu_info) else() -cc_library(layout_autotune SRCS layout_autotune.cc DEPS op_info) + cc_library( + layout_autotune + SRCS layout_autotune.cc + DEPS op_info) endif() -cc_library(amp SRCS amp_auto_cast.cc DEPS layer var_helper) -cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper layout_autotune) -cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator switch_autotune) -cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator switch_autotune) -cc_library(imperative_profiler SRCS profiler.cc DEPS flags) +cc_library( + amp + SRCS amp_auto_cast.cc + DEPS layer var_helper) +cc_library( + tracer + SRCS tracer.cc + DEPS layer + engine + program_desc_tracer + amp + denormal + garbage_collector + var_helper + layout_autotune) +cc_library( + basic_engine + SRCS basic_engine.cc + DEPS layer gradient_accumulator switch_autotune) +cc_library( + engine + SRCS basic_engine.cc partial_grad_engine.cc + DEPS layer gradient_accumulator switch_autotune) +cc_library( + imperative_profiler + SRCS profiler.cc + DEPS flags) if(NOT WIN32) - if(WITH_NCCL OR WITH_RCCL) - cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows_utils tensor) - cc_library(nccl_context SRCS nccl_context.cc DEPS collective_helper device_context imperative_all_reduce var_type_traits) - if(WITH_NCCL) - nv_library(reducer SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce) - endif() - if(WITH_RCCL) - hip_library(reducer SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce) - endif() - endif() - if(WITH_XPU_BKCL) - cc_library(bkcl_context SRCS bkcl_context.cc DEPS collective_helper device_context tensor var_type_traits) - cc_library(reducer SRCS reducer.cc DEPS layer) + if(WITH_NCCL OR WITH_RCCL) + cc_library( + imperative_all_reduce + SRCS all_reduce.cc + DEPS collective_helper device_context selected_rows_utils tensor) + cc_library( + nccl_context + SRCS nccl_context.cc + DEPS collective_helper device_context imperative_all_reduce + var_type_traits) + if(WITH_NCCL) + nv_library( + reducer + SRCS reducer.cc reducer.cu + DEPS layer imperative_all_reduce) endif() - if(WITH_ASCEND_CL) - cc_library(hccl_context SRCS hccl_context.cc DEPS collective_helper device_context tensor var_type_traits) - cc_library(reducer SRCS reducer.cc DEPS layer) + if(WITH_RCCL) + hip_library( + reducer + SRCS reducer.cc reducer.cu + DEPS layer imperative_all_reduce) endif() - if(WITH_CNCL) - cc_library(cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits) - cc_library(reducer SRCS reducer.cc DEPS layer) - endif() - if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) - cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits) - endif() - cc_library(data_loader SRCS data_loader.cc DEPS enforce) + endif() + if(WITH_XPU_BKCL) + cc_library( + bkcl_context + SRCS bkcl_context.cc + DEPS collective_helper device_context tensor var_type_traits) + cc_library( + reducer + SRCS reducer.cc + DEPS layer) + endif() + if(WITH_ASCEND_CL) + cc_library( + hccl_context + SRCS hccl_context.cc + DEPS collective_helper device_context tensor var_type_traits) + cc_library( + reducer + SRCS reducer.cc + DEPS layer) + endif() + if(WITH_CNCL) + cc_library( + cncl_context + SRCS cncl_context.cc + DEPS collective_helper device_context tensor var_type_traits) + cc_library( + reducer + SRCS reducer.cc + DEPS layer) + endif() + if(WITH_NCCL + OR WITH_RCCL + OR WITH_XPU_BKCL + OR WITH_ASCEND_CL) + cc_library( + heter_ccl_context + SRCS heter_ccl_context.cc + DEPS collective_helper device_context tensor var_type_traits) + endif() + cc_library( + data_loader + SRCS data_loader.cc + DEPS enforce) endif(NOT WIN32) if(WITH_GLOO) - cc_library(imperative_gloo_context SRCS gloo_context.cc DEPS collective_helper device_context tensor var_type_traits) - if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL OR WITH_CNCL) )) - cc_library(reducer SRCS reducer.cc DEPS layer) - endif() + cc_library( + imperative_gloo_context + SRCS gloo_context.cc + DEPS collective_helper device_context tensor var_type_traits) + if(WIN32 + OR (NOT + (WITH_NCCL + OR WITH_RCCL + OR WITH_XPU_BKCL + OR WITH_ASCEND_CL + OR WITH_CNCL) + )) + cc_library( + reducer + SRCS reducer.cc + DEPS layer) + endif() endif() if(WITH_MLU) - SET(MLU_DEPS mlu_baseop) + set(MLU_DEPS mlu_baseop) endif() if(NOT WITH_ASCEND_CL) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor ${MLU_DEPS}) + cc_library( + gradient_accumulator + SRCS gradient_accumulator.cc + DEPS blas + operator + lod_tensor + selected_rows_utils + selected_rows_functor + var_type_traits + layer + math_function + phi_tensor + ${MLU_DEPS}) else() -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner phi_tensor) + cc_library( + gradient_accumulator + SRCS gradient_accumulator.cc + DEPS blas + operator + lod_tensor + selected_rows_utils + selected_rows_functor + var_type_traits + layer + math_function + npu_op_runner + phi_tensor) endif() add_subdirectory(tests) diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index 436e22f00c303..f6484d5cdda08 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -15,6 +15,7 @@ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/imperative/all_reduce.h" + #include "paddle/fluid/framework/convert_utils.h" #ifdef PADDLE_WITH_NCCL diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 3f6863d642cc8..ff6e297ba8003 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/imperative/amp_auto_cast.h" + #include #include + #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" @@ -302,9 +304,8 @@ static inline framework::proto::VarType::Type GetPromoteType( // dtype of input(X) if (op_type == "moving_average_abs_max_scale") { for (const auto& pair : ins) { - if (pair.first == "X" && - GetDataType(pair.second.front()) == - framework::proto::VarType::FP16) { + if (pair.first == "X" && GetDataType(pair.second.front()) == + framework::proto::VarType::FP16) { dst_type = framework::proto::VarType::FP16; } } diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index 49761a8df0b6b..fcc30b2590a6c 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -19,6 +19,7 @@ #include #include #include + #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/gradient_accumulator.h" diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 11abbfe7cf6a3..9990fde95ce64 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -14,13 +14,14 @@ #if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/imperative/bkcl_context.h" + #include #include #include #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/imperative/bkcl_context.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #include "paddle/fluid/platform/device_context.h" @@ -46,10 +47,11 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, auto bkcl_dtype = platform::ToBKCLDataType(framework::TransToProtoVarType(src.dtype())); - PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), src_ptr, dst_ptr, src.numel(), - bkcl_dtype, BKCL_ADD, stream), - BKCL_SUCCESS, platform::errors::PreconditionNotMet( - "BKCL all reduce failed")); + PADDLE_ENFORCE_EQ( + bkcl_all_reduce(comm->comm(), src_ptr, dst_ptr, src.numel(), bkcl_dtype, + BKCL_ADD, stream), + BKCL_SUCCESS, + platform::errors::PreconditionNotMet("BKCL all reduce failed")); } /* Baidu Kunlun Communication Library(BKCL) is designed for multi Baidu Kunlun diff --git a/paddle/fluid/imperative/cncl_context.cc b/paddle/fluid/imperative/cncl_context.cc index 779b748c2d2d4..19f22e7402989 100644 --- a/paddle/fluid/imperative/cncl_context.cc +++ b/paddle/fluid/imperative/cncl_context.cc @@ -18,14 +18,12 @@ limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" - -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/gen_comm_id_helper.h" -#include "paddle/fluid/platform/place.h" - #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/mlu/cncl_helper.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { @@ -184,8 +182,9 @@ paddle::platform::DeviceContext *CNCLParallelContext::GetDeviceContext( } void CNCLParallelContext::WaitCompute(int ring_id) { - PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( - "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id)); PADDLE_ENFORCE_LT(ring_id, compute_events_.size(), platform::errors::OutOfRange( "ring id must < compute events size," @@ -205,8 +204,9 @@ void CNCLParallelContext::WaitCompute(int ring_id) { } void CNCLParallelContext::WaitComm(int ring_id) { - PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( - "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id)); PADDLE_ENFORCE_LT(ring_id, comm_events_.size(), platform::errors::OutOfRange( "ring id must < comm events size," diff --git a/paddle/fluid/imperative/data_loader.cc b/paddle/fluid/imperative/data_loader.cc index c43149c9b563e..66eed2981062a 100644 --- a/paddle/fluid/imperative/data_loader.cc +++ b/paddle/fluid/imperative/data_loader.cc @@ -19,6 +19,7 @@ #include #include #include + #include #include "glog/logging.h" diff --git a/paddle/fluid/imperative/data_loader.h b/paddle/fluid/imperative/data_loader.h index fdfa117eafe76..e66a3b9edc3ff 100644 --- a/paddle/fluid/imperative/data_loader.h +++ b/paddle/fluid/imperative/data_loader.h @@ -17,6 +17,7 @@ #ifndef _WIN32 #include + #include #include diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index 124c31df73349..fe426a76b3292 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/variable.h" diff --git a/paddle/fluid/imperative/flags.cc b/paddle/fluid/imperative/flags.cc index c2d668eccdaf9..df424b32fcadf 100644 --- a/paddle/fluid/imperative/flags.cc +++ b/paddle/fluid/imperative/flags.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/imperative/flags.h" + #include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_uint64(dygraph_debug, 0, diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index dd34b8b619f80..c5bcab4daa9a9 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/imperative/gloo_context.h" + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #include "paddle/fluid/framework/tensor_util.h" diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index 23e4e02945bf6..5e0973e7e9913 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/variable.h" diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 499cf4d8ad6d8..36e6f551dc63b 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -874,8 +874,9 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } PADDLE_ENFORCE_EQ(var_info.var->Var().IsType(), - true, platform::errors::PermissionDenied( - "Gradient var must be LoDTensor")); + true, + platform::errors::PermissionDenied( + "Gradient var must be LoDTensor")); if (CurCnt() == 0) { MoveOrCopyVar(dst_var->MutableVar(), var_info.var->MutableVar(), var_info.unchange_input); @@ -896,9 +897,10 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, PADDLE_ENFORCE_EQ( var_info.var->Var().IsType() || var_info.var->Var().IsType(), - true, platform::errors::PermissionDenied("The type of Gradient " - "var must be LoDTensor " - "or SelectedRows")); + true, + platform::errors::PermissionDenied("The type of Gradient " + "var must be LoDTensor " + "or SelectedRows")); if (CurCnt() == 0) { MoveOrCopyVar(dst_var->MutableVar(), var_info.var->MutableVar(), var_info.unchange_input); diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index 03f6775defc2f..382623b627623 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/layer.h" diff --git a/paddle/fluid/imperative/hccl_context.cc b/paddle/fluid/imperative/hccl_context.cc index 31d988753f23c..8fb434cbc2aee 100644 --- a/paddle/fluid/imperative/hccl_context.cc +++ b/paddle/fluid/imperative/hccl_context.cc @@ -13,18 +13,16 @@ // limitations under the License. #include "paddle/fluid/imperative/hccl_context.h" -#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" - +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/hccl_helper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/device/npu/hccl_helper.h" - namespace paddle { namespace framework { class Variable; @@ -193,8 +191,9 @@ paddle::platform::DeviceContext *HCCLParallelContext::GetDeviceContext( } void HCCLParallelContext::WaitCompute(int ring_id) { - PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( - "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id)); PADDLE_ENFORCE_LT(ring_id, compute_events_.size(), platform::errors::OutOfRange( "ring id must < compute events size," @@ -214,8 +213,9 @@ void HCCLParallelContext::WaitCompute(int ring_id) { } void HCCLParallelContext::WaitComm(int ring_id) { - PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( - "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id)); PADDLE_ENFORCE_LT(ring_id, comm_events_.size(), platform::errors::OutOfRange( "ring id must < comm events size," diff --git a/paddle/fluid/imperative/infer_var_type_context.h b/paddle/fluid/imperative/infer_var_type_context.h index 297ec840db4c0..079e180c2a70d 100644 --- a/paddle/fluid/imperative/infer_var_type_context.h +++ b/paddle/fluid/imperative/infer_var_type_context.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/imperative/type_defs.h" diff --git a/paddle/fluid/imperative/jit/CMakeLists.txt b/paddle/fluid/imperative/jit/CMakeLists.txt index 66f2a9840798c..bcc1c0746b823 100644 --- a/paddle/fluid/imperative/jit/CMakeLists.txt +++ b/paddle/fluid/imperative/jit/CMakeLists.txt @@ -1,2 +1,8 @@ -cc_library(op_desc_meta SRCS op_desc_meta.cc DEPS proto_desc layer) -cc_library(program_desc_tracer SRCS program_desc_tracer.cc DEPS op_desc_meta) +cc_library( + op_desc_meta + SRCS op_desc_meta.cc + DEPS proto_desc layer) +cc_library( + program_desc_tracer + SRCS program_desc_tracer.cc + DEPS op_desc_meta) diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.cc b/paddle/fluid/imperative/jit/program_desc_tracer.cc index 35ff262fe3d86..e0f52beb6e555 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.cc +++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/imperative/jit/program_desc_tracer.h" + #include "paddle/fluid/framework/convert_utils.h" namespace paddle { diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 76f64ab73a64b..7357db4e2001b 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -16,7 +16,6 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/framework/convert_utils.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/infer_var_type_context.h" #include "paddle/fluid/imperative/op_base.h" @@ -284,9 +283,10 @@ std::shared_ptr VarBase::NewVarBase(const platform::Place& dst_place, PADDLE_ENFORCE_EQ( Var().IsInitialized() && (Var().IsType() || Var().IsType()), - true, platform::errors::InvalidArgument( - "Variable is not initialized or Variable's type is not " - "LoDTensor or SelectedRows when getting numpy tensor")); + true, + platform::errors::InvalidArgument( + "Variable is not initialized or Variable's type is not " + "LoDTensor or SelectedRows when getting numpy tensor")); if (Var().IsType()) { auto& src_tensor = Var().Get(); diff --git a/paddle/fluid/imperative/layout_autotune.cc b/paddle/fluid/imperative/layout_autotune.cc index ed0526eaad316..e936505b2ae03 100644 --- a/paddle/fluid/imperative/layout_autotune.cc +++ b/paddle/fluid/imperative/layout_autotune.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/imperative/layout_autotune.h" + #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/imperative/layout_transformer.h" #include "paddle/phi/backends/gpu/gpu_info.h" @@ -119,8 +120,9 @@ paddle::imperative::NameVarMap AutoTuneLayout( LayoutAutoTune::Instance().SetDesiredLayout(DataLayout::NHWC); VLOG(3) << "Tune the layout from " << BOOST_GET_CONST(std::string, (*attrs)["data_format"]) - << " to " << paddle::framework::DataLayoutToString( - LayoutAutoTune::Instance().GetDesiredLayout()); + << " to " + << paddle::framework::DataLayoutToString( + LayoutAutoTune::Instance().GetDesiredLayout()); } else { LayoutAutoTune::Instance().DisableLayoutAutoTune(); return ins; diff --git a/paddle/fluid/imperative/layout_autotune.h b/paddle/fluid/imperative/layout_autotune.h index df3772b826da1..2da368910e6c3 100644 --- a/paddle/fluid/imperative/layout_autotune.h +++ b/paddle/fluid/imperative/layout_autotune.h @@ -14,8 +14,10 @@ #pragma once #include + #include #include + #include "paddle/fluid/framework/type_defs.h" #include "paddle/phi/common/layout.h" diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index e9d987cc7045f..4a0dcb1b3bbea 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -22,6 +22,7 @@ #ifdef PADDLE_WITH_NCCL #include + #include "paddle/fluid/platform/dynload/nccl.h" #endif @@ -159,8 +160,9 @@ paddle::platform::DeviceContext *NCCLParallelContext::GetDeviceContext( } void NCCLParallelContext::WaitCompute(int ring_id) { - PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( - "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id)); PADDLE_ENFORCE_LT(ring_id, compute_events_.size(), platform::errors::OutOfRange( "ring id must < compute events size," @@ -185,8 +187,9 @@ void NCCLParallelContext::WaitCompute(int ring_id) { } void NCCLParallelContext::WaitComm(int ring_id) { - PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( - "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id)); PADDLE_ENFORCE_LT(ring_id, comm_events_.size(), platform::errors::OutOfRange( "ring id must < comm events size," diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index b8a616ae67d21..ba0221a1729fa 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/saved_variable_wrapper_list.h" #include "paddle/fluid/imperative/type_defs.h" diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index f2f64d92a23fc..a4baca6f25724 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -24,6 +24,7 @@ #include #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/imperative/gradient_accumulator.h" #include "paddle/fluid/imperative/layer.h" diff --git a/paddle/fluid/imperative/partial_grad_engine.h b/paddle/fluid/imperative/partial_grad_engine.h index b5da39f8d4237..4ec6cdb3fcd5d 100644 --- a/paddle/fluid/imperative/partial_grad_engine.h +++ b/paddle/fluid/imperative/partial_grad_engine.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index cfd3813d60d44..ac99755786359 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -258,7 +258,7 @@ PreparedOp PrepareImpl( #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) && !is_xpu_unsupport #endif - ) { + ) { VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_kernel_key << " | kernel: " << phi_kernel; @@ -306,7 +306,7 @@ PreparedOp PrepareImpl( #if defined(PADDLE_WITH_XPU_KP) || (is_xpu_unsupport && !is_xpu_kp_support) #endif - ) { + ) { if (has_phi_kernel) { auto pt_cpu_kernel_key = FallBackToCpu(expected_kernel_key, pt_kernel_key, op); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index ccc8d64517f95..0c2d70dfe3c82 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -19,6 +19,7 @@ #include #include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/operator.h" @@ -28,8 +29,6 @@ #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/var_helper.h" - -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/selected_rows.h" diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc index 48af63056c5e3..097f62fe42258 100644 --- a/paddle/fluid/imperative/profiler.cc +++ b/paddle/fluid/imperative/profiler.cc @@ -18,7 +18,9 @@ #include "gperftools/profiler.h" #endif #include + #include // NOLINT + #include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_string( diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h index 2d7d319203833..f5951a52d718e 100644 --- a/paddle/fluid/imperative/py_layer_fwd.h +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -16,12 +16,12 @@ #include #include -#include "paddle/fluid/imperative/layer.h" -#include "paddle/fluid/imperative/prepared_operator.h" -#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/prepared_operator.h" +#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/operators/py_layer_op.h" namespace paddle { diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index c7fd2215eb42a..47d7b6366f700 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -18,13 +18,10 @@ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/imperative/layer.h" -#include "paddle/fluid/string/string_helper.h" - +#include "paddle/fluid/imperative/parallel_context.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" - -#include "paddle/fluid/imperative/parallel_context.h" - +#include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/dense_tensor.h" namespace paddle { namespace imperative { @@ -452,8 +449,9 @@ void Reducer::InitializeDenseGroups( "Tensor %s is not initialized.", var_name)); const auto size = lod_tensor->numel(); PADDLE_ENFORCE_GT( - size, 0, platform::errors::PreconditionNotMet( - "The number of tensor %s's elements is 0.", var_name)); + size, 0, + platform::errors::PreconditionNotMet( + "The number of tensor %s's elements is 0.", var_name)); all_length += size; p_group->length_.push_back(size); diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index 9fac4b41cbde0..852d8cf076acb 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -14,6 +14,7 @@ #pragma once #include + #include #include #include diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index 09de0106ed619..5084363b9c135 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -1,26 +1,108 @@ if(WIN32) - cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS device_context) + cc_test( + nccl_context_test + SRCS nccl_context_test.cc + DEPS device_context) else() - if (WITH_GLOO AND (WITH_NCCL OR WITH_RCCL)) - cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) - cc_test(heter_ccl_context_test SRCS heter_ccl_context_test.cc DEPS heter_ccl_context nccl_context imperative_gloo_context gloo_context gloo_wrapper gloo fs shell) - #set_tests_properties(heter_ccl_context_test PROPERTIES LABELS "RUN_TYPE=DIST") - endif() - if (WITH_XPU_BKCL) - cc_test(bkcl_context_test SRCS bkcl_context_test.cc DEPS bkcl_context) - endif() - if (WITH_CNCL) - cc_test(cncl_context_test SRCS cncl_context_test.cc DEPS cncl_context) - endif() + if(WITH_GLOO AND (WITH_NCCL OR WITH_RCCL)) + cc_test( + nccl_context_test + SRCS nccl_context_test.cc + DEPS nccl_context) + cc_test( + heter_ccl_context_test + SRCS heter_ccl_context_test.cc + DEPS heter_ccl_context + nccl_context + imperative_gloo_context + gloo_context + gloo_wrapper + gloo + fs + shell) + #set_tests_properties(heter_ccl_context_test PROPERTIES LABELS "RUN_TYPE=DIST") + endif() + if(WITH_XPU_BKCL) + cc_test( + bkcl_context_test + SRCS bkcl_context_test.cc + DEPS bkcl_context) + endif() + if(WITH_CNCL) + cc_test( + cncl_context_test + SRCS cncl_context_test.cc + DEPS cncl_context) + endif() endif(WIN32) - -cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function phi_tensor phi_api phi_api_utils) -cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy) -cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place) -cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) -cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy) -cc_test(test_eager SRCS test_eager.cc DEPS tracer layer prepared_operator mul_op) -if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_CNCL) -cc_test(test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy) +cc_test( + test_gradient_accmulator + SRCS test_gradient_accmulator.cc + DEPS memcpy + selected_rows_utils + selected_rows_functor + gradient_accumulator + math_function + phi_tensor + phi_api + phi_api_utils) +cc_test( + test_layer + SRCS test_layer.cc + DEPS layer + proto_desc + operator + op_registry + variable_helper + mul_op + memcpy) +cc_test( + test_prepare_op + SRCS test_prepare_op.cc + DEPS prepared_operator + op_info + split_op + layer + concat_and_split + activation_op + place) +cc_test( + test_tracer + SRCS test_tracer.cc + DEPS tracer + layer + proto_desc + operator + op_registry + variable_helper + mul_op + reduce_sum_op + elementwise_add_op + memcpy) +cc_test( + test_hooks + SRCS test_hooks.cc + DEPS tracer + basic_engine + layer + proto_desc + operator + op_registry + variable_helper + mul_op + elementwise_add_op + memcpy) +cc_test( + test_eager + SRCS test_eager.cc + DEPS tracer layer prepared_operator mul_op) +if(WITH_NCCL + OR WITH_RCCL + OR WITH_XPU_BKCL + OR WITH_CNCL) + cc_test( + test_group + SRCS test_group.cc + DEPS reducer concat_and_split memcpy) endif() diff --git a/paddle/fluid/imperative/tests/bkcl_context_test.cc b/paddle/fluid/imperative/tests/bkcl_context_test.cc index 580d86b1696bc..b4d299ba829d9 100644 --- a/paddle/fluid/imperative/tests/bkcl_context_test.cc +++ b/paddle/fluid/imperative/tests/bkcl_context_test.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include // NOLINT - #include "paddle/fluid/imperative/bkcl_context.h" +#include // NOLINT + #include "gtest/gtest.h" namespace imperative = paddle::imperative; diff --git a/paddle/fluid/imperative/tests/cncl_context_test.cc b/paddle/fluid/imperative/tests/cncl_context_test.cc index 1d5ee8e7fc899..1019d4eacdc9f 100644 --- a/paddle/fluid/imperative/tests/cncl_context_test.cc +++ b/paddle/fluid/imperative/tests/cncl_context_test.cc @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/imperative/cncl_context.h" + #include // NOLINT +#include "gtest/gtest.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/imperative/cncl_context.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" -#include "gtest/gtest.h" - namespace imperative = paddle::imperative; namespace platform = paddle::platform; namespace framework = paddle::framework; diff --git a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc index 91f38f82ed058..67059916d0317 100644 --- a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc +++ b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/imperative/heter_ccl_context.h" + #include #include // NOLINT +#include "gtest/gtest.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/imperative/heter_ccl_context.h" - -#include "gtest/gtest.h" namespace imperative = paddle::imperative; namespace platform = paddle::platform; diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc index 9ee083626c5b8..48479e1412b4b 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -12,15 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/imperative/nccl_context.h" + #include // NOLINT +#include "gtest/gtest.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/imperative/nccl_context.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" -#include "gtest/gtest.h" - namespace imperative = paddle::imperative; namespace platform = paddle::platform; namespace framework = paddle::framework; diff --git a/paddle/fluid/imperative/tests/test_eager.cc b/paddle/fluid/imperative/tests/test_eager.cc index 3def103ae9aa5..1d6ec7330756f 100644 --- a/paddle/fluid/imperative/tests/test_eager.cc +++ b/paddle/fluid/imperative/tests/test_eager.cc @@ -88,8 +88,9 @@ TEST(test_var_helper, eager_var_helper) { egr_tensor, framework::OpKernelType(framework::proto::VarType::FP32, platform::CPUPlace())); SetCachedValue( - egr_tensor, framework::OpKernelType(framework::proto::VarType::FP32, - platform::CPUPlace()), + egr_tensor, + framework::OpKernelType(framework::proto::VarType::FP32, + platform::CPUPlace()), egr_tensor2); ASSERT_ANY_THROW(GetPlace(egr_tensor2)); ASSERT_ANY_THROW(SetType( diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index 88b18a4c17620..d2e768d6ef114 100644 --- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ -384,7 +384,7 @@ static void TestGradientAccumulatorTestUnchangeInput( for (auto use_tensor2 : use_tensors) { /** g_accum1 && g_accum2: has not been initialized * test accumulate on this graph - */ + */ auto g_var1 = std::make_shared("g_var1"); g_var1->SetOverridedStopGradient(false); auto g_accum1 = CreateAccumulator(g_var1, sort_gradient); @@ -437,7 +437,7 @@ static void TestGradientAccumulatorTestUnchangeInput( /** g_accum3 && g_accum4: has been initialized * test accumulate on previous graph - */ + */ auto var3 = create_var(use_tensor1); auto var_wrapper3_3 = std::make_shared("tmp1_3"); auto var_wrapper4_3 = std::make_shared("tmp2_3"); diff --git a/paddle/fluid/imperative/tests/test_group.cc b/paddle/fluid/imperative/tests/test_group.cc index 5e674af1a08a8..0025103c53196 100644 --- a/paddle/fluid/imperative/tests/test_group.cc +++ b/paddle/fluid/imperative/tests/test_group.cc @@ -14,8 +14,8 @@ #include #include -#include "gtest/gtest.h" +#include "gtest/gtest.h" #include "paddle/fluid/imperative/reducer.h" namespace paddle { diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index 4cda3f32fdf3f..cfda7a0cac4e9 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -17,9 +17,11 @@ // #include + #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/imperative/prepared_operator.h" diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 350263bc5457d..2295ea4bf67c9 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/imperative/tracer.h" + #include #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/execution_context.h" diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 4e671d52457e2..b9048c4847075 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -21,6 +21,7 @@ #include #include #include + #include "ThreadPool.h" #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/imperative/amp_auto_cast.h" diff --git a/paddle/fluid/imperative/var_helper.h b/paddle/fluid/imperative/var_helper.h index 9ce456b1103b3..91788e73fa583 100644 --- a/paddle/fluid/imperative/var_helper.h +++ b/paddle/fluid/imperative/var_helper.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/variable.h" namespace egr { diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 633f481df808b..109cb5d8fe07d 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -17,19 +17,20 @@ if(WITH_TESTING) include(tests/test.cmake) # some generic cmake function for inference endif() -cc_library(paddle_inference_io - SRCS io.cc - DEPS paddle_framework ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) +cc_library( + paddle_inference_io + SRCS io.cc + DEPS paddle_framework ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) # analysis and tensorrt must be added before creating static library, # otherwise, there would be undefined reference to them in static library. add_subdirectory(analysis) add_subdirectory(utils) -if (TENSORRT_FOUND) +if(TENSORRT_FOUND) add_subdirectory(tensorrt) endif() -if (WITH_LITE) +if(WITH_LITE) add_subdirectory(lite) endif() @@ -42,20 +43,30 @@ add_subdirectory(api) # Create static inference library if needed # All static libs in inference/api -set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor - zero_copy_tensor reset_tensor_array - analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) +set(STATIC_INFERENCE_API + paddle_inference_api + analysis_predictor + zero_copy_tensor + reset_tensor_array + analysis_config + paddle_pass_builder + activation_functions + ${mkldnn_quantizer_cfg}) #windows GPU static library over the limit, so not create_static_lib, and cc_library is dummy if(WIN32 AND WITH_GPU) - cc_library(paddle_inference DEPS ${fluid_modules} phi ${STATIC_INFERENCE_API} ${utils_modules}) + cc_library(paddle_inference DEPS ${fluid_modules} phi ${STATIC_INFERENCE_API} + ${utils_modules}) else() - create_static_lib(paddle_inference ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules}) + create_static_lib(paddle_inference ${fluid_modules} ${phi_modules} + ${STATIC_INFERENCE_API} ${utils_modules}) endif() if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. - set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.sym") + set(LINK_FLAGS + "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.sym" + ) set_target_properties(paddle_inference PROPERTIES LINK_FLAGS "${LINK_FLAGS}") endif() @@ -63,7 +74,7 @@ endif() add_subdirectory(capi_exp) if(WITH_TESTING AND WITH_INFERENCE_API_TEST) - add_subdirectory(tests/api) + add_subdirectory(tests/api) endif() set(SHARED_INFERENCE_SRCS @@ -80,43 +91,53 @@ set(SHARED_INFERENCE_SRCS ${PADDLE_CUSTOM_OP_SRCS}) # shared inference library deps -set(SHARED_INFERENCE_DEPS ${fluid_modules} phi analysis_predictor ${utils_modules}) +set(SHARED_INFERENCE_DEPS ${fluid_modules} phi analysis_predictor + ${utils_modules}) -if (WITH_CRYPTO) - set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto) -endif (WITH_CRYPTO) +if(WITH_CRYPTO) + set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto) +endif(WITH_CRYPTO) -if (WITH_PSCORE) - set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service tensor_table) -endif () +if(WITH_PSCORE) + set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service + tensor_table) +endif() -if (WITH_ONNXRUNTIME) - set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} - ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc - ) -endif (WITH_ONNXRUNTIME) +if(WITH_ONNXRUNTIME) + set(SHARED_INFERENCE_SRCS + ${SHARED_INFERENCE_SRCS} + ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc) +endif(WITH_ONNXRUNTIME) # Create shared inference library -cc_library(paddle_inference_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} - DEPS ${SHARED_INFERENCE_DEPS}) +cc_library( + paddle_inference_shared SHARED + SRCS ${SHARED_INFERENCE_SRCS} + DEPS ${SHARED_INFERENCE_DEPS}) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(paddle_inference_shared ${os_dependency_modules}) if(WIN32) - target_link_libraries(paddle_inference_shared gflags) + target_link_libraries(paddle_inference_shared gflags) endif() -set_target_properties(paddle_inference_shared PROPERTIES OUTPUT_NAME paddle_inference) +set_target_properties(paddle_inference_shared PROPERTIES OUTPUT_NAME + paddle_inference) if(NOT APPLE AND NOT WIN32) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. - if (WITH_CUSTOM_DEVICE) - set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_custom_device.map") + if(WITH_CUSTOM_DEVICE) + set(LINK_FLAGS + "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_custom_device.map" + ) else() - set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.map") + set(LINK_FLAGS + "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.map") endif() - set_target_properties(paddle_inference_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + set_target_properties(paddle_inference_shared PROPERTIES LINK_FLAGS + "${LINK_FLAGS}") # check symbol hidden - FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake + file( + WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake "execute_process(COMMAND sh -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh" " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference.so\" RESULT_VARIABLE symbol_res)\n" "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n" @@ -126,5 +147,6 @@ if(NOT APPLE AND NOT WIN32) OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol" COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake" DEPENDS paddle_inference_shared) - add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol") + add_custom_target(check_symbol ALL + DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol") endif() diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 3d1a467565c84..f374c5c7cc20f 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,72 +1,112 @@ unset(analysis_deps CACHE) set(analysis_deps # analysis_deps can be extended across the project - framework_proto proto_desc graph pass paddle_inference_io executor pretty_log - ir_pass_manager - CACHE INTERNAL "") + framework_proto + proto_desc + graph + pass + paddle_inference_io + executor + pretty_log + ir_pass_manager + CACHE INTERNAL "") add_subdirectory(ir_passes) add_subdirectory(passes) -cc_library(analysis_helper SRCS helper.cc DEPS framework_proto proto_desc graph paddle_inference_io) +cc_library( + analysis_helper + SRCS helper.cc + DEPS framework_proto proto_desc graph paddle_inference_io) -cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES} analysis_helper) +cc_library( + ir_pass_manager + SRCS ir_pass_manager.cc + DEPS graph pass ${INFER_IR_PASSES} analysis_helper) -cc_library(argument INTERFACE SRCS argument.cc DEPS scope proto_desc) -cc_library(analysis_pass INTERFACE SRCS analysis_pass.cc DEPS proto_desc) +cc_library( + argument INTERFACE + SRCS argument.cc + DEPS scope proto_desc) +cc_library( + analysis_pass INTERFACE + SRCS analysis_pass.cc + DEPS proto_desc) -cc_library(analysis SRCS analyzer.cc - DEPS ${analysis_deps} analysis_helper - analysis_pass ${INFER_IR_PASSES} - ) +cc_library( + analysis + SRCS analyzer.cc + DEPS ${analysis_deps} analysis_helper analysis_pass ${INFER_IR_PASSES}) function(inference_analysis_test_build TARGET) if(WITH_TESTING) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS EXTRA_DEPS) - cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - inference_base_test_build(${TARGET} - SRCS ${analysis_test_SRCS} - DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS EXTRA_DEPS) + cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + inference_base_test_build( + ${TARGET} + SRCS + ${analysis_test_SRCS} + DEPS + analysis + pass + ${GLOB_PASS_LIB} + ${analysis_test_EXTRA_DEPS}) endif() endfunction() function(inference_analysis_test_run TARGET) if(WITH_TESTING) - set(options "") - set(oneValueArgs "") - set(multiValueArgs COMMAND ARGS) - cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - inference_base_test_run(${TARGET} - COMMAND ${analysis_test_COMMAND} - ARGS ${analysis_test_ARGS}) + set(options "") + set(oneValueArgs "") + set(multiValueArgs COMMAND ARGS) + cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + inference_base_test_run(${TARGET} COMMAND ${analysis_test_COMMAND} ARGS + ${analysis_test_ARGS}) endif() endfunction() function(inference_analysis_test TARGET) if(WITH_TESTING) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS ARGS EXTRA_DEPS) - cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - inference_base_test_build(${TARGET} - SRCS ${analysis_test_SRCS} - DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}) - inference_base_test_run(${TARGET} - COMMAND ${TARGET} - ARGS ${analysis_test_ARGS}) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS ARGS EXTRA_DEPS) + cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + inference_base_test_build( + ${TARGET} + SRCS + ${analysis_test_SRCS} + DEPS + analysis + pass + ${GLOB_PASS_LIB} + ${analysis_test_EXTRA_DEPS}) + inference_base_test_run(${TARGET} COMMAND ${TARGET} ARGS + ${analysis_test_ARGS}) endif() endfunction(inference_analysis_test) - -if (NOT APPLE AND NOT WIN32) - inference_analysis_test(test_analyzer - SRCS analyzer_tester.cc - EXTRA_DEPS reset_tensor_array paddle_inference_shared - ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) +if(NOT APPLE AND NOT WIN32) + inference_analysis_test( + test_analyzer + SRCS + analyzer_tester.cc + EXTRA_DEPS + reset_tensor_array + paddle_inference_shared + ARGS + --inference_model_dir=${WORD2VEC_MODEL_DIR}) elseif(WIN32) - inference_analysis_test(test_analyzer - SRCS analyzer_tester.cc - EXTRA_DEPS reset_tensor_array paddle_inference_api - ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) + inference_analysis_test( + test_analyzer + SRCS + analyzer_tester.cc + EXTRA_DEPS + reset_tensor_array + paddle_inference_api + ARGS + --inference_model_dir=${WORD2VEC_MODEL_DIR}) endif() diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h index 14a1c3eea3417..a95498d82d0e6 100644 --- a/paddle/fluid/inference/analysis/analysis_pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index be7d6ab868022..2b56f8e00d644 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/analyzer.h" + #include #include + #include "paddle/fluid/inference/analysis/passes/passes.h" #include "paddle/fluid/string/pretty_log.h" diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 4db54706285d4..95a985158e678 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -37,6 +37,7 @@ limitations under the License. */ #include #include + #include "gflags/gflags.h" #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/flags.h" diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 3f96fd69e4ee1..84fcd4e3c396f 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/analyzer.h" - #include #include + +#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 2336fd1980d2e..07b7b37485956 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -216,8 +216,12 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, bool); DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); - DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool); + DECL_ARGUMENT_FIELD(tensorrt_use_varseqlen, TensorRtUseOSS, bool); DECL_ARGUMENT_FIELD(tensorrt_with_interleaved, TensorRtWithInterleaved, bool); + DECL_ARGUMENT_FIELD(tensorrt_transformer_posid, TensorRtTransformerPosid, + std::string); + DECL_ARGUMENT_FIELD(tensorrt_transformer_maskid, TensorRtTransformerMaskid, + std::string); DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path, TensorRtShapeRangeInfoPath, std::string); DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape, diff --git a/paddle/fluid/inference/analysis/dot.h b/paddle/fluid/inference/analysis/dot.h index 6d883f558709b..619e3461d3ea5 100644 --- a/paddle/fluid/inference/analysis/dot.h +++ b/paddle/fluid/inference/analysis/dot.h @@ -20,6 +20,7 @@ #pragma once #include + #include #include #include diff --git a/paddle/fluid/inference/analysis/dot_tester.cc b/paddle/fluid/inference/analysis/dot_tester.cc index c785a312bf96c..0b669093a1f32 100644 --- a/paddle/fluid/inference/analysis/dot_tester.cc +++ b/paddle/fluid/inference/analysis/dot_tester.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/dot.h" - #include + #include +#include "paddle/fluid/inference/analysis/dot.h" + namespace paddle { namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 88ae61ff1fc98..f952016516184 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include @@ -72,8 +73,9 @@ struct DataTypeNamer { template const std::string &repr() const { auto x = std::type_index(typeid(T)); - PADDLE_ENFORCE_GT(dic_.count(x), 0, platform::errors::PreconditionNotMet( - "unknown type for representation")); + PADDLE_ENFORCE_GT(dic_.count(x), 0, + platform::errors::PreconditionNotMet( + "unknown type for representation")); return dic_.at(x); } diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index aafbe57e05ff2..6c74d7b738cf6 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/ir_pass_manager.h" + #include #include #include @@ -20,6 +21,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/scope.h" @@ -29,8 +31,8 @@ namespace paddle { namespace inference { namespace analysis { -using string::PrettyLogEndl; using string::PrettyLog; +using string::PrettyLogEndl; using string::Style; IRPassManager::IRPassManager(Argument *argument) { @@ -55,9 +57,13 @@ void IRPassManager::CreatePasses(Argument *argument, int pass_num = 0; for (const std::string &pass_name : passes) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); - pass->Set("use_oss", new bool(argument->tensorrt_use_oss())); + pass->Set("use_varseqlen", new bool(argument->tensorrt_use_varseqlen())); pass->Set("with_interleaved", new bool(argument->tensorrt_with_interleaved())); + pass->Set("tensorrt_transformer_posid", + new std::string(argument->tensorrt_transformer_posid())); + pass->Set("tensorrt_transformer_maskid", + new std::string(argument->tensorrt_transformer_maskid())); pass->Set("disable_logs", new bool(argument->disable_logs())); auto precision_mode = argument->tensorrt_precision_mode(); bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8; diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index 823dc8907ea53..9f9a5fc347123 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -27,6 +27,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index 7faef7d391f02..a7a561b7b37a1 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,34 +1,63 @@ -cc_library(subgraph_util SRCS subgraph_util.cc DEPS subgraph_detector) +cc_library( + subgraph_util + SRCS subgraph_util.cc + DEPS subgraph_detector) -if (WITH_GPU AND TENSORRT_FOUND) - cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_util tensorrt_op_teller infer_io_utils) +if(WITH_GPU AND TENSORRT_FOUND) + cc_library( + tensorrt_subgraph_pass + SRCS tensorrt_subgraph_pass.cc + DEPS subgraph_util tensorrt_op_teller infer_io_utils) - set(analysis_deps ${analysis_deps} - subgraph_util tensorrt_subgraph_pass - CACHE INTERNAL "") + set(analysis_deps + ${analysis_deps} subgraph_util tensorrt_subgraph_pass + CACHE INTERNAL "") - set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp) + set(pass_file + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp + ) file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") - set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") + set(INFER_IR_PASSES + ${INFER_IR_PASSES} tensorrt_subgraph_pass + CACHE INTERNAL "") endif() -if (WITH_LITE) - cc_library(lite_subgraph_pass SRCS lite_subgraph_pass.cc DEPS ${analysis_deps} subgraph_util lite_op_teller) - set(analysis_deps ${analysis_deps} subgraph_util lite_subgraph_pass CACHE INTERNAL "") - set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp) +if(WITH_LITE) + cc_library( + lite_subgraph_pass + SRCS lite_subgraph_pass.cc + DEPS ${analysis_deps} subgraph_util lite_op_teller) + set(analysis_deps + ${analysis_deps} subgraph_util lite_subgraph_pass + CACHE INTERNAL "") + set(pass_file + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp + ) file(APPEND ${pass_file} "USE_PASS(lite_subgraph_pass);\n") - set(INFER_IR_PASSES ${INFER_IR_PASSES} lite_subgraph_pass CACHE INTERNAL "") - cc_test(lite_subgraph_pass_tester SRCS lite_subgraph_pass_tester.cc DEPS lite_subgraph_pass gtest glog) + set(INFER_IR_PASSES + ${INFER_IR_PASSES} lite_subgraph_pass + CACHE INTERNAL "") + cc_test( + lite_subgraph_pass_tester + SRCS lite_subgraph_pass_tester.cc + DEPS lite_subgraph_pass gtest glog) endif() -MESSAGE("WITH_DLNNE:${WITH_DLNNE}") +message("WITH_DLNNE:${WITH_DLNNE}") if(WITH_DLNNE) - cc_library(dlnne_subgraph_pass SRCS dlnne_subgraph_pass.cc DEPS ${analysis_deps} subgraph_util) - set(analysis_deps ${analysis_deps} - subgraph_util dlnne_subgraph_pass - CACHE INTERNAL "") + cc_library( + dlnne_subgraph_pass + SRCS dlnne_subgraph_pass.cc + DEPS ${analysis_deps} subgraph_util) + set(analysis_deps + ${analysis_deps} subgraph_util dlnne_subgraph_pass + CACHE INTERNAL "") - set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp) + set(pass_file + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp + ) file(APPEND ${pass_file} "USE_PASS(dlnne_subgraph_pass);\n") - set(INFER_IR_PASSES ${INFER_IR_PASSES} dlnne_subgraph_pass CACHE INTERNAL "") + set(INFER_IR_PASSES + ${INFER_IR_PASSES} dlnne_subgraph_pass + CACHE INTERNAL "") endif() diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc index 8f789139af9bf..b2a07722829be 100644 --- a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc @@ -11,19 +11,19 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h" +#include #include #include +#include +#include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/subgraph_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h" -#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { @@ -52,18 +52,39 @@ using framework::ir::Node; void analysis::DlnneSubgraphPass::ApplyImpl(framework::ir::Graph *graph) const { static std::unordered_set teller_set{ - "mul", "matmul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", - "hard_swish", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - "elementwise_add", "elementwise_mul", "dropout", "prelu", - "conv2d_transpose", "leaky_relu", + "mul", + "matmul", + "conv2d", + "pool2d", + "relu", + "softmax", + "sigmoid", + "hard_swish", + "depthwise_conv2d", + "batch_norm", + "concat", + "tanh", + "pad", + "elementwise_add", + "elementwise_mul", + "dropout", + "prelu", + "conv2d_transpose", + "leaky_relu", // "fc", - "shuffle_channel", "swish", "split", + "shuffle_channel", + "swish", + "split", // "instance_norm", "gelu", // "layer_norm", // "scale", // "stack", - "relu6", "reshape2", "transpose2", "concat", "slice", + "relu6", + "reshape2", + "transpose2", + "concat", + "slice", }; framework::ir::FusePassBase::Init("dlnne_subgraph_pass", graph); diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index 083fc8991192e..b5ddacd440e25 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -12,7 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h" + #include +#include +#include #include #include #include @@ -21,28 +25,22 @@ #include #include -#include -#include - +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/subgraph_detector.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/lite/engine.h" #include "paddle/fluid/inference/lite/op_teller.h" #include "paddle/fluid/inference/utils/singleton.h" - -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/subgraph_detector.h" -#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h" #include "paddle/fluid/string/pretty_log.h" -#include "paddle/fluid/inference/lite/engine.h" - namespace paddle { namespace inference { namespace analysis { -using framework::ir::Node; using framework::ir::Agent; -using framework::ir::SubGraphFuser; using framework::ir::Graph; +using framework::ir::Node; +using framework::ir::SubGraphFuser; namespace lite { diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h index e79a64f0f72cf..198a86c185bc6 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h @@ -14,10 +14,12 @@ #pragma once #include + #include #include #include #include + #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass_tester.cc index 90ad7ec0b4437..8c88e2869cce3 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass_tester.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h" #include + +#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/lite/op_teller.h" @@ -29,7 +30,7 @@ void AppendLiteSubBlocks(const std::vector& subgraph_ops, framework::ProgramDesc* engine_program, framework::ProgramDesc* host_program, const int32_t host_sub_id); -} +} // namespace lite TEST(LiteSubgraphPass, basic) { framework::ProgramDesc host_program; diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index b73eb624db85b..394ce7799e8ee 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -377,12 +377,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp( Get("workspace_size"), precision_mode, calibrator.get(), Get("gpu_device_id"), min_input_shape, max_input_shape, opt_input_shape, disable_trt_plugin_fp16); - trt_engine->SetUseOSS(Get("use_oss")); + trt_engine->SetUseOSS(Get("use_varseqlen")); trt_engine->SetWithInterleaved(Get("with_interleaved")); + trt_engine->SetTransformerPosid( + Get("tensorrt_transformer_posid")); + trt_engine->SetTransformerMaskid( + Get("tensorrt_transformer_maskid")); trt_engine->SetUseDLA(Get("trt_use_dla")); trt_engine->SetDLACore(Get("trt_dla_core")); trt_engine->SetUseInspector(Get("use_inspector")); - trt_engine->SetWithErnie(graph->Has(framework::ir::kMultiheadMatmulPass)); + trt_engine->SetWithErnie( + graph->Has(framework::ir::kEmbEltwiseLayernormPass) && + graph->Has(framework::ir::kMultiheadMatmulPass)); if (use_static_engine) { trt_engine_serialized_data = GetTrtEngineSerializedData( diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index a950899a8a458..17bb8b6c62ab7 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -1,28 +1,55 @@ -cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) -cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) -cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor) -cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager) -cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) -cc_library(adjust_cudnn_workspace_size_pass SRCS adjust_cudnn_workspace_size_pass.cc DEPS analysis_pass graph_to_program_pass) -cc_library(inference_op_replace_pass SRCS inference_op_replace_pass.cc DEPS analysis_pass graph_to_program_pass) -IF(WITH_TESTING) - cc_library(ir_graph_clean_pass SRCS ir_graph_clean_pass.cc DEPS analysis_pass gtest) -ELSE() - cc_library(ir_graph_clean_pass SRCS ir_graph_clean_pass.cc DEPS analysis_pass) -ENDIF() - -cc_library(analysis_passes SRCS passes.cc DEPS +cc_library( ir_graph_build_pass + SRCS ir_graph_build_pass.cc + DEPS analysis_pass argument ir_pass_manager) +cc_library( ir_analysis_pass + SRCS ir_analysis_pass.cc + DEPS analysis_pass argument ir_pass_manager) +cc_library( + memory_optim_pass + SRCS memory_optimize_pass.cc + DEPS analysis_pass zero_copy_tensor) +cc_library( ir_params_sync_among_devices_pass + SRCS ir_params_sync_among_devices_pass.cc + DEPS analysis_pass argument ir_pass_manager) +cc_library( + ir_graph_to_program_pass + SRCS ir_graph_to_program_pass.cc + DEPS analysis_pass graph_to_program_pass) +cc_library( adjust_cudnn_workspace_size_pass - memory_optim_pass + SRCS adjust_cudnn_workspace_size_pass.cc + DEPS analysis_pass graph_to_program_pass) +cc_library( inference_op_replace_pass - ir_graph_to_program_pass - ir_graph_clean_pass -) + SRCS inference_op_replace_pass.cc + DEPS analysis_pass graph_to_program_pass) +if(WITH_TESTING) + cc_library( + ir_graph_clean_pass + SRCS ir_graph_clean_pass.cc + DEPS analysis_pass gtest) +else() + cc_library( + ir_graph_clean_pass + SRCS ir_graph_clean_pass.cc + DEPS analysis_pass) +endif() + +cc_library( + analysis_passes + SRCS passes.cc + DEPS ir_graph_build_pass + ir_analysis_pass + ir_params_sync_among_devices_pass + adjust_cudnn_workspace_size_pass + memory_optim_pass + inference_op_replace_pass + ir_graph_to_program_pass + ir_graph_clean_pass) -set(analysis_deps ${analysis_deps} - analysis_passes - subgraph_detector - CACHE INTERNAL "") +set(analysis_deps + ${analysis_deps} analysis_passes subgraph_detector + CACHE INTERNAL "") diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc index 34192965297a6..05bda4e75c9bd 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" + #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h" diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h index 2c2113c06d917..fca431b5d7779 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/inference/analysis/analysis_pass.h" namespace paddle { diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index 321716b1c8a18..fca5e2563424e 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" + #include #include + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/inference/io.h" diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h index adbde0433fad2..e7ef23e791e9d 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc index 0f3633ca6fa4b..999fb4ad8d764 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" + #include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h index 613eb04497e61..5b20667d62ab6 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/inference/analysis/analysis_pass.h" namespace paddle { diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 614eea24a0e2e..a0c7a94cd1b30 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 3fa417c2ea631..70620e8692cd8 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -61,7 +61,8 @@ void MemoryOptimizePass::CollectLifeCycle( auto reads = op_node->inputs; auto writes = op_node->outputs; - std::vector requires(reads.begin(), reads.end()); + std::vector + requires(reads.begin(), reads.end()); requires.insert(requires.end(), writes.begin(), writes.end()); // Disable reuse of feed variables. diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index 8ca5ffa2581f1..5dcd8b1059ebc 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -35,16 +35,15 @@ namespace inference { namespace analysis { /* Memory optimization. -* We will perform the following operation: -* 1. Collect all var's lifetime. -* 2. Make reuse plan: the vars can be reused if there is no overlap(on lifetime) -* between -* them. -* The final plan is a mapping table in which the key represents the original -* name of var and the value in the table represents the current name of var. -* 3. Perform reuse plan: Replace all var's name in the model according to the -* mapping table. -*/ + * We will perform the following operation: + * 1. Collect all var's lifetime. + * 2. Make reuse plan: the vars can be reused if there is no overlap(on + * lifetime) between them. The final plan is a mapping table in which the key + * represents the original name of var and the value in the table represents the + * current name of var. + * 3. Perform reuse plan: Replace all var's name in the model according to the + * mapping table. + */ class MemoryOptimizePass : public AnalysisPass { public: using space_table_t = std::unordered_map; diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc index ca0b25c29d495..19aab1a948dd2 100644 --- a/paddle/fluid/inference/analysis/passes/passes.cc +++ b/paddle/fluid/inference/analysis/passes/passes.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/passes.h" + #include "paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h" #include "paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" diff --git a/paddle/fluid/inference/analysis/passes/passes.h b/paddle/fluid/inference/analysis/passes/passes.h index 8a13091d083e5..b3b240c280c96 100644 --- a/paddle/fluid/inference/analysis/passes/passes.h +++ b/paddle/fluid/inference/analysis/passes/passes.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/inference/analysis/analysis_pass.h" namespace paddle { diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h index 56565c8f3f72a..6c7690a4779bf 100644 --- a/paddle/fluid/inference/analysis/ut_helper.h +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -14,8 +14,10 @@ limitations under the License. */ #pragma once #include + #include #include + #include "gflags/gflags.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/inference/analysis/helper.h" diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 56cc4aa755bda..e25c5e963982f 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -14,7 +14,7 @@ # if(APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") endif(APPLE) add_subdirectory(details) @@ -22,76 +22,139 @@ add_subdirectory(details) if(WITH_MKLDNN) set(mkldnn_quantizer_cfg mkldnn_quantizer_config) set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn_quantizer.cc) - cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder) - set(mkldnn_quantizer_cfg ${mkldnn_quantizer_cfg} PARENT_SCOPE) + cc_library( + ${mkldnn_quantizer_cfg} + SRCS mkldnn_quantizer_config.cc + DEPS lod_tensor paddle_pass_builder) + set(mkldnn_quantizer_cfg + ${mkldnn_quantizer_cfg} + PARENT_SCOPE) endif() -cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer utf8proc) -cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor) +cc_library( + analysis_config + SRCS analysis_config.cc + DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer + utf8proc) +cc_library( + paddle_infer_contrib + SRCS paddle_infer_contrib.cc + DEPS zero_copy_tensor) cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) -set(paddle_inference_api_deps lod_tensor scope reset_tensor_array - analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator) +set(paddle_inference_api_deps + lod_tensor + scope + reset_tensor_array + analysis_config + paddle_infer_contrib + zero_copy_tensor + trainer_desc_proto + custom_operator) if(WITH_CRYPTO) - list(APPEND paddle_inference_api_deps paddle_crypto) + list(APPEND paddle_inference_api_deps paddle_crypto) endif() -cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS ${paddle_inference_api_deps}) +cc_library( + paddle_inference_api + SRCS api.cc api_impl.cc helper.cc + DEPS ${paddle_inference_api_deps}) if(WIN32) - target_link_libraries(paddle_inference_api gflags) + target_link_libraries(paddle_inference_api gflags) endif() -set(inference_deps ${analysis_deps} paddle_inference_api analysis naive_executor ${GLOB_PASS_LIB}) +set(inference_deps ${analysis_deps} paddle_inference_api analysis + naive_executor ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) - set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) + set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) endif() -if (WITH_ONNXRUNTIME) - cc_library(analysis_predictor SRCS analysis_predictor.cc onnxruntime_predictor.cc resource_manager.cc infer_context.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} - zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils onnxruntime paddle2onnx) -else (WITH_ONNXRUNTIME) - cc_library(analysis_predictor SRCS analysis_predictor.cc resource_manager.cc infer_context.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} - zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) -endif (WITH_ONNXRUNTIME) - - -cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) +if(WITH_ONNXRUNTIME) + cc_library( + analysis_predictor + SRCS analysis_predictor.cc onnxruntime_predictor.cc resource_manager.cc + infer_context.cc ${mkldnn_quantizer_src} + DEPS ${inference_deps} + zero_copy_tensor + ir_pass_manager + op_compatible_info + infer_io_utils + onnxruntime + paddle2onnx) +else(WITH_ONNXRUNTIME) + cc_library( + analysis_predictor + SRCS analysis_predictor.cc resource_manager.cc infer_context.cc + ${mkldnn_quantizer_src} + DEPS ${inference_deps} zero_copy_tensor ir_pass_manager op_compatible_info + infer_io_utils) +endif(WITH_ONNXRUNTIME) + +cc_test( + test_paddle_inference_api + SRCS api_tester.cc + DEPS paddle_inference_api) if(WITH_TESTING) - if (NOT APPLE AND NOT WIN32) - if (WITH_GPU) - inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_inference_shared - ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) + if(NOT APPLE AND NOT WIN32) + if(WITH_GPU) + inference_base_test( + test_api_impl + SRCS + api_impl_tester.cc + DEPS + paddle_inference_shared + ARGS + --word2vec_dirname=${WORD2VEC_MODEL_DIR} + --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) endif() elseif(WIN32) - inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} - ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) + inference_base_test( + test_api_impl + SRCS + api_impl_tester.cc + DEPS + ${inference_deps} + ARGS + --word2vec_dirname=${WORD2VEC_MODEL_DIR} + --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) endif() endif() -if (NOT APPLE AND NOT WIN32) - cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_inference_shared - ARGS --dirname=${WORD2VEC_MODEL_DIR}) -elseif (WIN32) - cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} - ARGS --dirname=${WORD2VEC_MODEL_DIR}) +if(NOT APPLE AND NOT WIN32) + cc_test( + test_analysis_predictor + SRCS analysis_predictor_tester.cc + DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR}) +elseif(WIN32) + cc_test( + test_analysis_predictor + SRCS analysis_predictor_tester.cc + DEPS analysis_predictor benchmark ${inference_deps} ARGS + --dirname=${WORD2VEC_MODEL_DIR}) endif() if(WITH_TESTING AND WITH_MKLDNN) - if (NOT APPLE AND NOT WIN32) - cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR}) - elseif (WIN32) - cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS analysis_predictor benchmark ${inference_deps} - ARGS --dirname=${WORD2VEC_MODEL_DIR}) + if(NOT APPLE AND NOT WIN32) + cc_test( + test_mkldnn_quantizer + SRCS mkldnn_quantizer_tester.cc + DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR}) + elseif(WIN32) + cc_test( + test_mkldnn_quantizer + SRCS mkldnn_quantizer_tester.cc + DEPS analysis_predictor benchmark ${inference_deps} ARGS + --dirname=${WORD2VEC_MODEL_DIR}) endif() endif() if(WITH_TESTING AND TEST test_api_impl) - if(NOT APPLE) - set_tests_properties(test_api_impl PROPERTIES TIMEOUT 120) - endif() + if(NOT APPLE) + set_tests_properties(test_api_impl PROPERTIES TIMEOUT 120) + endif() endif() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 735e1b7be4c1f..c23397a082860 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -15,6 +15,7 @@ #include #include #include + #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/inference/utils/table_printer.h" @@ -256,8 +257,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(trt_dla_core_); CP_MEMBER(trt_use_static_engine_); CP_MEMBER(trt_use_calib_mode_); - CP_MEMBER(trt_use_oss_); + CP_MEMBER(trt_use_varseqlen_); CP_MEMBER(trt_with_interleaved_); + CP_MEMBER(tensorrt_transformer_posid_); + CP_MEMBER(tensorrt_transformer_maskid_); CP_MEMBER(trt_tuned_dynamic_shape_); CP_MEMBER(trt_allow_build_at_runtime_); CP_MEMBER(collect_shape_range_info_); @@ -546,7 +549,7 @@ void AnalysisConfig::Exp_DisableTensorRtOPs( trt_disabled_ops_.insert(trt_disabled_ops_.end(), ops.begin(), ops.end()); } -void AnalysisConfig::EnableTensorRtOSS() { trt_use_oss_ = true; } +void AnalysisConfig::EnableVarseqlen() { trt_use_varseqlen_ = true; } // TODO(Superjomn) refactor this, buggy. void AnalysisConfig::Update() { @@ -1034,9 +1037,13 @@ std::string AnalysisConfig::Summary() { ? shape_range_info_path_ : "false"}); - os.InsertRow({"tensorrt_use_oss", trt_use_oss_ ? "true" : "false"}); + os.InsertRow( + {"tensorrt_use_varseqlen", trt_use_varseqlen_ ? "true" : "false"}); os.InsertRow({"tensorrt_with_interleaved", trt_with_interleaved_ ? "true" : "false"}); + os.InsertRow({"tensorrt_transformer_posid", tensorrt_transformer_posid_}); + os.InsertRow( + {"tensorrt_transformer_maskid", tensorrt_transformer_maskid_}); os.InsertRow({"tensorrt_use_dla", trt_use_dla_ ? "true" : "false"}); if (trt_use_dla_) { os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)}); @@ -1099,8 +1106,9 @@ LiteNNAdapterConfig &LiteNNAdapterConfig::SetModelCacheBuffers( platform::errors::InvalidArgument( "model_cache_buffer should not be empty.")); PADDLE_ENFORCE_EQ(nnadapter_model_cache_buffers.count(model_cache_token), - false, platform::errors::InvalidArgument( - "model_cache_token has already been set.")); + false, + platform::errors::InvalidArgument( + "model_cache_token has already been set.")); nnadapter_model_cache_buffers[model_cache_token] = model_cache_buffer; return *this; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 09a5bbddba87c..5f9051ff2fdb9 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -83,9 +83,9 @@ namespace paddle { using inference::Singleton; #if PADDLE_WITH_TENSORRT -using inference::tensorrt::TRTInt8Calibrator; using inference::tensorrt::TRTCalibratorEngine; using inference::tensorrt::TRTCalibratorEngineManager; +using inference::tensorrt::TRTInt8Calibrator; #endif int AnalysisPredictor::clone_num_ = 1; @@ -853,8 +853,10 @@ void AnalysisPredictor::PrepareArgument() { } argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); - argument_.SetTensorRtUseOSS(config_.trt_use_oss_); + argument_.SetTensorRtUseOSS(config_.trt_use_varseqlen_); argument_.SetTensorRtWithInterleaved(config_.trt_with_interleaved_); + argument_.SetTensorRtTransformerPosid(config_.tensorrt_transformer_posid_); + argument_.SetTensorRtTransformerMaskid(config_.tensorrt_transformer_maskid_); argument_.SetMinInputShape(config_.min_input_shape_); argument_.SetMaxInputShape(config_.max_input_shape_); argument_.SetOptimInputShape(config_.optim_input_shape_); @@ -1025,8 +1027,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } template <> -std::unique_ptr CreatePaddlePredictor< - AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { +std::unique_ptr +CreatePaddlePredictor( + const AnalysisConfig &config) { // TODO(NHZlX): Should add the link to the doc of // paddle_infer::CreatePredictor if (config.glog_info_disabled()) { @@ -1803,6 +1806,9 @@ USE_TRT_CONVERTER(fused_preln_embedding_eltwise_layernorm) USE_TRT_CONVERTER(preln_skip_layernorm) USE_TRT_CONVERTER(roll) USE_TRT_CONVERTER(strided_slice) +USE_TRT_CONVERTER(transformer_input_convert) +USE_TRT_CONVERTER(recover_padding) +USE_TRT_CONVERTER(remove_padding) #endif namespace paddle_infer { @@ -1971,6 +1977,20 @@ void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c, #endif } +void InternalUtils::SetTransformerPosid( + paddle_infer::Config *c, const std::string &tensorrt_transformer_posid) { +#ifdef PADDLE_WITH_CUDA + c->tensorrt_transformer_posid_ = tensorrt_transformer_posid; +#endif +} + +void InternalUtils::SetTransformerMaskid( + paddle_infer::Config *c, const std::string &tensorrt_transformer_maskid) { +#ifdef PADDLE_WITH_CUDA + c->tensorrt_transformer_maskid_ = tensorrt_transformer_maskid; +#endif +} + void InternalUtils::SyncStream(paddle_infer::Predictor *p) { #ifdef PADDLE_WITH_CUDA auto *pred = dynamic_cast(p->predictor_.get()); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index e96526730fdea..1cfdaf1a55864 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -41,7 +41,7 @@ using float16 = paddle::platform::float16; namespace experimental { class InternalUtils; }; -} +} // namespace paddle_infer /// /// \file analysis_predictor.h /// @@ -55,10 +55,10 @@ class InternalUtils; namespace paddle { -using inference::analysis::Argument; -using inference::analysis::Analyzer; -using framework::proto::ProgramDesc; using framework::NaiveExecutor; +using framework::proto::ProgramDesc; +using inference::analysis::Analyzer; +using inference::analysis::Argument; /// /// \class AnalysisPredictor diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index e8a1384166aff..f16054565a7fc 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -18,7 +18,9 @@ #endif #include #include + #include // NOLINT + #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/api/helper.h" diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index e2befadf0a89b..9e4633774a2fc 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "gflags/gflags.h" #include "paddle/fluid/framework/commit.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 1c4369af646af..38960aecb703b 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -12,13 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/inference/api/api_impl.h" + #include + #include #include #include #include "paddle/fluid/framework/feed_fetch_method.h" -#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/place.h" @@ -348,8 +350,9 @@ bool NativePaddlePredictor::GetFetch(std::vector *outputs, } template <> -std::unique_ptr CreatePaddlePredictor< - NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) { +std::unique_ptr +CreatePaddlePredictor( + const NativeConfig &config) { // TODO(NHZlX): Should add the link to the doc of // paddle_infer::CreatePredictor VLOG(3) << "create NativePaddlePredictor"; diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index b91eff4573ed0..d503d2581392a 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/inference/api/api_tester.cc b/paddle/fluid/inference/api/api_tester.cc index 46724fa6b1aca..1faf46fad2be6 100644 --- a/paddle/fluid/inference/api/api_tester.cc +++ b/paddle/fluid/inference/api/api_tester.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include #include diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 547e265d2fdb5..a76ed63f10646 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -1,31 +1,33 @@ cmake_minimum_required(VERSION 3.0) project(cpp_inference_demo CXX C) -option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) -option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) -option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) -option(USE_TENSORRT "Compile demo with TensorRT." OFF) -option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) +option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) +option(WITH_STATIC_LIB + "Compile demo with static/shared library, default use static." ON) +option(USE_TENSORRT "Compile demo with TensorRT." OFF) +option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) if(NOT WITH_STATIC_LIB) add_definitions("-DPADDLE_WITH_SHARED_LIB") else() - # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode. + # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode. # Set it to empty in static library mode to avoid compilation issues. add_definitions("/DPD_INFER_DECL=") endif() macro(safe_set_static_flag) - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/MD") - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) endmacro() if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") + message( + FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") endif() if(NOT DEFINED DEMO_NAME) message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") @@ -47,7 +49,7 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") link_directories("${PADDLE_LIB}/paddle/lib") -if (WITH_ONNXRUNTIME) +if(WITH_ONNXRUNTIME) include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") @@ -55,21 +57,25 @@ if (WITH_ONNXRUNTIME) link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") endif() -if (WIN32) +if(WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) - if (MSVC_STATIC_CRT) - if (WITH_MKL) + if(MSVC_STATIC_CRT) + if(WITH_MKL) set(FLAG_OPENMP "/openmp") endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4244 /wd4251 /wd4267 /wd4305") - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") + set(CMAKE_C_FLAGS_RELEASE + "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4251 /wd4267 /wd4305") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS_DEBUG + "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS_RELEASE + "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") safe_set_static_flag() - if (WITH_STATIC_LIB) + if(WITH_STATIC_LIB) add_definitions(-DSTATIC_LIB) endif() endif() @@ -82,42 +88,55 @@ endif() if(WITH_GPU) if(NOT WIN32) - set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") + set(CUDA_LIB + "/usr/local/cuda/lib64/" + CACHE STRING "CUDA Library") else() - set(CUDA_LIB "" CACHE STRING "CUDA_LIB") + set(CUDA_LIB + "" + CACHE STRING "CUDA_LIB") if("${CUDA_LIB}" STREQUAL "") if(DEFINED ENV{CUDA_PATH}) set(CUDA_LIB "$ENV{CUDA_PATH}\\lib\\x64") else() - set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\lib\\x64") + set(CUDA_LIB + "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\lib\\x64" + ) endif() endif() message(STATUS "Current CUDA lib path: ${CUDA_LIB}") endif(NOT WIN32) endif() -if (USE_TENSORRT AND WITH_GPU) - set(TENSORRT_ROOT "" CACHE STRING "The root directory of TensorRT library") +if(USE_TENSORRT AND WITH_GPU) + set(TENSORRT_ROOT + "" + CACHE STRING "The root directory of TensorRT library") if("${TENSORRT_ROOT}" STREQUAL "") - message(FATAL_ERROR "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH ") + message( + FATAL_ERROR + "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH " + ) endif() set(TENSORRT_INCLUDE_DIR ${TENSORRT_ROOT}/include) set(TENSORRT_LIB_DIR ${TENSORRT_ROOT}/lib) file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS) - string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" + TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") if("${TENSORRT_MAJOR_VERSION}" STREQUAL "") - file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h TENSORRT_VERSION_FILE_CONTENTS) - string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") + file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h + TENSORRT_VERSION_FILE_CONTENTS) + string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" + TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") endif() if("${TENSORRT_MAJOR_VERSION}" STREQUAL "") message(SEND_ERROR "Failed to detect TensorRT version.") endif() string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1" - TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}") - message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " - "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") + TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}") + message( + STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " + "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") include_directories("${TENSORRT_INCLUDE_DIR}") link_directories("${TENSORRT_LIB_DIR}") endif() @@ -129,8 +148,9 @@ if(WITH_MKL) set(MATH_LIB ${MATH_LIB_PATH}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} ${MATH_LIB_PATH}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) else() - set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} - ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(MATH_LIB + ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") if(EXISTS ${MKLDNN_PATH}) @@ -145,65 +165,99 @@ else() set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas") include_directories("${OPENBLAS_LIB_PATH}/include/openblas") if(WIN32) - set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(MATH_LIB + ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}) else() - set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(MATH_LIB + ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) endif() endif() if(WITH_STATIC_LIB) - set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX} + ) else() if(WIN32) - set(DEPS ${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS + ${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) else() - set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} + ) endif() endif() -if (WITH_ONNXRUNTIME) +if(WITH_ONNXRUNTIME) if(WIN32) - set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx) + set(DEPS + ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib + paddle2onnx) elseif(APPLE) - set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx) + set(DEPS + ${DEPS} + ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib + paddle2onnx) else() - set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx) + set(DEPS + ${DEPS} + ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 + paddle2onnx) endif() endif() - -if (NOT WIN32) +if(NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") - set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf xxhash cryptopp utf8proc + set(DEPS + ${DEPS} + ${MATH_LIB} + ${MKLDNN_LIB} + glog + gflags + protobuf + xxhash + cryptopp + utf8proc ${EXTERNAL_LIB}) else() - set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - glog gflags_static libprotobuf xxhash cryptopp-static utf8proc_static + set(DEPS + ${DEPS} + ${MATH_LIB} + ${MKLDNN_LIB} + glog + gflags_static + libprotobuf + xxhash + cryptopp-static + utf8proc_static ${EXTERNAL_LIB}) set(DEPS ${DEPS} shlwapi.lib) endif(NOT WIN32) if(WITH_GPU) if(NOT WIN32) - if (USE_TENSORRT) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) + if(USE_TENSORRT) + set(DEPS ${DEPS} + ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS + ${DEPS} + ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) else() if(USE_TENSORRT) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_STATIC_LIBRARY_SUFFIX}) endif() endif() - set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) - set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) - set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX}) endif() endif() @@ -217,40 +271,61 @@ if(WIN32) endif() if(USE_TENSORRT) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX} - ${LIB_PATH} - COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX} - ${LIB_PATH} - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND + ${CMAKE_COMMAND} -E copy + ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX} ${LIB_PATH} + COMMAND + ${CMAKE_COMMAND} -E copy + ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX} + ${LIB_PATH}) if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_SHARED_LIBRARY_SUFFIX} - ${LIB_PATH}) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND + ${CMAKE_COMMAND} -E copy + ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_SHARED_LIBRARY_SUFFIX} + ${LIB_PATH}) endif() endif() if(WITH_MKL) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll ${LIB_PATH} - COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${LIB_PATH} - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${LIB_PATH} - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll + ${LIB_PATH} + COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll + ${LIB_PATH} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll + ${LIB_PATH}) else() - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${LIB_PATH} - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll + ${LIB_PATH}) endif() if(WITH_ONNXRUNTIME) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll - ${LIB_PATH} - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll - ${LIB_PATH} - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND + ${CMAKE_COMMAND} -E copy + ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll + ${LIB_PATH} + COMMAND + ${CMAKE_COMMAND} -E copy + ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll + ${LIB_PATH}) endif() if(NOT WITH_STATIC_LIB) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${LIB_PATH} - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${LIB_PATH}) endif() endif() diff --git a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc index ef5c08cd041eb..f9ac07a830459 100644 --- a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc @@ -17,7 +17,9 @@ limitations under the License. */ */ #include // use glog instead of CHECK to avoid importing other paddle header files. + #include + #include "gflags/gflags.h" #include "utils.h" // NOLINT diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 9edb4ecbfd228..551b66fcaf7fa 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -17,6 +17,7 @@ limitations under the License. */ */ #include // use glog instead of CHECK to avoid importing other paddle header files. + #include "gflags/gflags.h" #include "utils.h" // NOLINT diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h index b4f40194aa947..dfba4b8ebf6cd 100644 --- a/paddle/fluid/inference/api/demo_ci/utils.h +++ b/paddle/fluid/inference/api/demo_ci/utils.h @@ -14,11 +14,13 @@ #pragma once #include + #include #include #include #include #include + #include "paddle/include/paddle_inference_api.h" namespace paddle { diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 818444fbcb648..352efc1e63dbd 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -17,6 +17,7 @@ limitations under the License. */ */ #include + #include "gflags/gflags.h" #include "utils.h" // NOLINT diff --git a/paddle/fluid/inference/api/demo_ci/windows_mobilenet.cc b/paddle/fluid/inference/api/demo_ci/windows_mobilenet.cc index 8d0538f8fa52d..b1f770066e7be 100644 --- a/paddle/fluid/inference/api/demo_ci/windows_mobilenet.cc +++ b/paddle/fluid/inference/api/demo_ci/windows_mobilenet.cc @@ -13,14 +13,15 @@ // limitations under the License. #include + #include #include #include #include #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/include/paddle_inference_api.h" DEFINE_string(modeldir, "", "Directory of the inference model."); diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt index 0d7a8d57a9c5a..c1ff6ea68a2bd 100644 --- a/paddle/fluid/inference/api/details/CMakeLists.txt +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -13,13 +13,28 @@ # limitations under the License. # -cc_library(reset_tensor_array SRCS reset_tensor_array.cc DEPS lod_tensor scope) -if (WITH_ONNXRUNTIME) - cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce onnxruntime) - cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc DEPS onnxruntime) -else (WITH_ONNXRUNTIME) - cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) - cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc) -endif (WITH_ONNXRUNTIME) +cc_library( + reset_tensor_array + SRCS reset_tensor_array.cc + DEPS lod_tensor scope) +if(WITH_ONNXRUNTIME) + cc_library( + zero_copy_tensor + SRCS zero_copy_tensor.cc + DEPS scope lod_tensor enforce onnxruntime) + cc_library( + zero_copy_tensor_dummy + SRCS zero_copy_tensor_dummy.cc + DEPS onnxruntime) +else(WITH_ONNXRUNTIME) + cc_library( + zero_copy_tensor + SRCS zero_copy_tensor.cc + DEPS scope lod_tensor enforce) + cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc) +endif(WITH_ONNXRUNTIME) -cc_test(zero_copy_tensor_test SRCS zero_copy_tensor_test.cc DEPS paddle_inference_api) +cc_test( + zero_copy_tensor_test + SRCS zero_copy_tensor_test.cc + DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index bb966dc5c6c1b..661d9def40653 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -340,8 +340,9 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, #ifdef PADDLE_WITH_MKLDNN if (tensor->layout() == paddle::framework::DataLayout::kMKLDNN) paddle::framework::innerTransDataLayoutFromMKLDNN( - tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls() - .get_cur_paddle_data_layout(), + tensor->layout(), + paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout(), *tensor, &out, paddle::platform::CPUPlace(), true); else std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); @@ -852,8 +853,9 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data, #ifdef PADDLE_WITH_MKLDNN if (tensor->layout() == paddle::framework::DataLayout::kMKLDNN) paddle::framework::innerTransDataLayoutFromMKLDNN( - tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls() - .get_cur_paddle_data_layout(), + tensor->layout(), + paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout(), *tensor, &out, paddle::platform::CPUPlace(), true); else std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc index b9e0e90a40316..3454c5c8fd17b 100644 --- a/paddle/fluid/inference/api/helper.cc +++ b/paddle/fluid/inference/api/helper.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/api/helper.h" + #include "paddle/fluid/framework/custom_operator.h" #include "paddle/fluid/framework/operator.h" #include "paddle/phi/api/ext/op_meta_info.h" diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index acc52ac046815..1c58b004e6d31 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -15,6 +15,7 @@ #pragma once #include + #include #if !defined(_WIN32) #include @@ -377,8 +378,9 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, double batch_latency, int epoch = 1, const framework::proto::VarType::Type data_type = framework::proto::VarType::FP32) { - PADDLE_ENFORCE_GT(batch_size, 0, platform::errors::InvalidArgument( - "Non-positive batch size.")); + PADDLE_ENFORCE_GT( + batch_size, 0, + platform::errors::InvalidArgument("Non-positive batch size.")); double sample_latency = batch_latency / batch_size; LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid << " ======"; diff --git a/paddle/fluid/inference/api/infer_context.h b/paddle/fluid/inference/api/infer_context.h index b7a8bf637d872..c2a23a7ca2ce5 100644 --- a/paddle/fluid/inference/api/infer_context.h +++ b/paddle/fluid/inference/api/infer_context.h @@ -25,21 +25,21 @@ class InferCPUContext : public phi::CPUContext { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class InferGPUContext : public phi::GPUContext { public: - using phi::GPUContext::SetStream; - using phi::GPUContext::SetEigenDevice; using phi::GPUContext::SetBlasHandle; using phi::GPUContext::SetBlasTensorCoreHandle; using phi::GPUContext::SetBlasTF32Handle; using phi::GPUContext::SetDnnHandle; + using phi::GPUContext::SetEigenDevice; using phi::GPUContext::SetSolverHandle; using phi::GPUContext::SetSparseHandle; + using phi::GPUContext::SetStream; // using phi::GPUContext::SetDnnWorkspaceHandle; using phi::GPUContext::SetComputeCapability; + using phi::GPUContext::SetDriverVersion; + using phi::GPUContext::SetMaxGridDimSize; + using phi::GPUContext::SetMaxThreadsPerBlock; using phi::GPUContext::SetMaxThreadsPerMultiProcessor; using phi::GPUContext::SetMultiProcessors; - using phi::GPUContext::SetMaxThreadsPerBlock; - using phi::GPUContext::SetMaxGridDimSize; - using phi::GPUContext::SetDriverVersion; using phi::GPUContext::SetRuntimeVersion; }; #endif diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 4dc80a1d75390..73096973c381c 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -13,12 +13,14 @@ // limitations under the License. #include "paddle/fluid/inference/api/mkldnn_quantizer.h" + #include #include #include #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" @@ -33,10 +35,10 @@ namespace paddle { -using platform::CPUPlace; using framework::LoDTensor; using framework::Variable; using framework::ir::Graph; +using platform::CPUPlace; using ConstEigenVectorArrayMap = Eigen::Map>; using EigenMatrixDoubleArray = @@ -57,8 +59,9 @@ static void check_var(const Variable* var, const std::string& var_name) { } static void check_tensor(const LoDTensor& tensor) { - PADDLE_ENFORCE_GT(tensor.dims().size(), 0, platform::errors::InvalidArgument( - "Tensor dimension is empty.")); + PADDLE_ENFORCE_GT( + tensor.dims().size(), 0, + platform::errors::InvalidArgument("Tensor dimension is empty.")); } void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForRNNWeights( @@ -531,8 +534,9 @@ AnalysisPredictor::MkldnnQuantizer::Histogram( PADDLE_ENFORCE_GE(max_val, min_val, platform::errors::InvalidArgument( "MkldnnQuantizer: To calculate Histogram, max_val (" + - std::to_string(max_val) + ") must be greater or equal" - "to min_val (" + + std::to_string(max_val) + + ") must be greater or equal" + "to min_val (" + std::to_string(min_val) + ").")); ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), var_tensor.numel(), 1}; @@ -570,7 +574,8 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { auto* builder = predictor_.config_.pass_builder(); builder->SetPasses({ - "cpu_quantize_pass", "cpu_quantize_squash_pass", + "cpu_quantize_pass", + "cpu_quantize_squash_pass", "int8_scale_calculation_mkldnn_pass", }); if (predictor_.config_.ir_debug_) builder->TurnOnDebug(); diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.h b/paddle/fluid/inference/api/mkldnn_quantizer.h index 5e7aa39de52bc..811f2941a7d14 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.h +++ b/paddle/fluid/inference/api/mkldnn_quantizer.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/analysis_predictor.h" diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc b/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc index 2bee4763d4fe9..05077f8ba34cc 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/api/mkldnn_quantizer.h" #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" DEFINE_string(dirname, "", "dirname to tests."); diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h index d01756e4b96b1..294a83a4335ba 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.h +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -18,6 +18,9 @@ #include #include #include + +#include "onnxruntime_c_api.h" // NOLINT +#include "onnxruntime_cxx_api.h" // NOLINT #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_compatible_info.h" #include "paddle/fluid/inference/analysis/analyzer.h" @@ -27,9 +30,6 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/string/printf.h" - -#include "onnxruntime_c_api.h" // NOLINT -#include "onnxruntime_cxx_api.h" // NOLINT #include "paddle2onnx/converter.h" #ifdef PADDLE_WITH_TESTING diff --git a/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc index 4a702edacc903..ff8528c085009 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc @@ -12,16 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/api/onnxruntime_predictor.h" - #include #include + #include #include // NOLINT #include + #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" #include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index af6cf88a3224f..489c32bc59d17 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -618,14 +618,14 @@ struct PD_INFER_DECL AnalysisConfig { /// may be more high-performance. Libnvinfer_plugin.so greater than /// V7.2.1 is needed. /// - void EnableTensorRtOSS(); + void EnableVarseqlen(); /// /// \brief A boolean state telling whether to use the TensorRT OSS. /// /// \return bool Whether to use the TensorRT OSS. /// - bool tensorrt_oss_enabled() { return trt_use_oss_; } + bool tensorrt_varseqlen_enabled() { return trt_use_varseqlen_; } /// /// \brief Enable TensorRT DLA @@ -912,11 +912,18 @@ struct PD_INFER_DECL AnalysisConfig { bool thread_local_stream_{false}; bool use_gpu_fp16_{false}; std::unordered_set gpu_fp16_disabled_op_types_{ - "conv2d_fusion", "conv2d", "roll", "strided_slice", "depthwise_conv2d", - "unfold", "generate_proposals_v2", "nearest_interp_v2", + "conv2d_fusion", + "conv2d", + "roll", + "strided_slice", + "depthwise_conv2d", + "unfold", + "generate_proposals_v2", + "nearest_interp_v2", "bilinear_interp_v2" "yolo_box", - "multiclass_nms3", "matrix_nms"}; + "multiclass_nms3", + "matrix_nms"}; bool use_cudnn_{false}; @@ -954,8 +961,10 @@ struct PD_INFER_DECL AnalysisConfig { Precision tensorrt_precision_mode_{Precision::kFloat32}; bool trt_use_static_engine_{false}; bool trt_use_calib_mode_{true}; - bool trt_use_oss_{false}; + bool trt_use_varseqlen_{false}; bool trt_with_interleaved_{false}; + std::string tensorrt_transformer_posid_{""}; + std::string tensorrt_transformer_maskid_{""}; bool trt_use_dla_{false}; int trt_dla_core_{0}; std::map> min_input_shape_{}; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index dc9f7debe5f2f..78af756c24b03 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -27,6 +27,7 @@ #include #include #include + #include "crypto/cipher.h" #include "paddle_infer_declare.h" // NOLINT #include "paddle_tensor.h" // NOLINT @@ -391,12 +392,14 @@ PD_INFER_DECL std::unique_ptr CreatePaddlePredictor( const ConfigT& config); template <> -PD_INFER_DECL std::unique_ptr CreatePaddlePredictor< - NativeConfig, PaddleEngineKind::kNative>(const NativeConfig& config); +PD_INFER_DECL std::unique_ptr +CreatePaddlePredictor( + const NativeConfig& config); template <> -PD_INFER_DECL std::unique_ptr CreatePaddlePredictor< - AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config); +PD_INFER_DECL std::unique_ptr +CreatePaddlePredictor( + const AnalysisConfig& config); template <> PD_INFER_DECL std::unique_ptr @@ -435,6 +438,12 @@ class PD_INFER_DECL InternalUtils { static void UpdateConfigInterleaved(paddle_infer::Config* c, bool with_interleaved); + static void SetTransformerPosid( + paddle_infer::Config* c, const std::string& tensorrt_transformer_posid); + + static void SetTransformerMaskid( + paddle_infer::Config* c, const std::string& tensorrt_transformer_maskid); + static void SyncStream(paddle_infer::Predictor* pred); static void SyncStream(cudaStream_t stream); template diff --git a/paddle/fluid/inference/api/paddle_infer_contrib.cc b/paddle/fluid/inference/api/paddle_infer_contrib.cc index d27f20a93b3a4..e785e91a67139 100644 --- a/paddle/fluid/inference/api/paddle_infer_contrib.cc +++ b/paddle/fluid/inference/api/paddle_infer_contrib.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/api/paddle_infer_contrib.h" + #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index f9ec41f6c8358..9e5b76db4ac16 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -20,6 +20,7 @@ #include #endif #include + #include #include @@ -94,25 +95,25 @@ const std::vector kTRTSubgraphPasses({ "add_support_int8_pass", // // "fc_fuse_pass", // "simplify_with_basic_ops_pass", // - "embedding_eltwise_layernorm_fuse_pass", // + "trt_embedding_eltwise_layernorm_fuse_pass", // "preln_embedding_eltwise_layernorm_fuse_pass", // - "multihead_matmul_fuse_pass_v2", // - "multihead_matmul_fuse_pass_v3", // - "skip_layernorm_fuse_pass", // + "trt_multihead_matmul_fuse_pass_v2", // + "trt_multihead_matmul_fuse_pass_v3", // + "trt_skip_layernorm_fuse_pass", // "preln_skip_layernorm_fuse_pass", // // "set_transformer_input_convert_pass", // - "conv_bn_fuse_pass", // - "unsqueeze2_eltwise_fuse_pass", // - "trt_squeeze2_matmul_fuse_pass", // - "trt_reshape2_matmul_fuse_pass", // - "trt_flatten2_matmul_fuse_pass", // - "trt_map_matmul_v2_to_mul_pass", // - "trt_map_matmul_v2_to_matmul_pass", // - "trt_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - "conv_elementwise_add_fuse_pass", // - // "remove_padding_recover_padding_pass", // - // "delete_remove_padding_recover_padding_pass", // + "conv_bn_fuse_pass", // + "unsqueeze2_eltwise_fuse_pass", // + "trt_squeeze2_matmul_fuse_pass", // + "trt_reshape2_matmul_fuse_pass", // + "trt_flatten2_matmul_fuse_pass", // + "trt_map_matmul_v2_to_mul_pass", // + "trt_map_matmul_v2_to_matmul_pass", // + "trt_map_matmul_to_mul_pass", // + "fc_fuse_pass", // + "conv_elementwise_add_fuse_pass", // + "remove_padding_recover_padding_pass", // + "delete_remove_padding_recover_padding_pass", // // "yolo_box_fuse_pass", // "tensorrt_subgraph_pass", // "conv_bn_fuse_pass", // @@ -348,6 +349,10 @@ void CpuPassStrategy::EnableMkldnnQuantizer() { void CpuPassStrategy::EnableMkldnnBfloat16() { #ifdef PADDLE_WITH_MKLDNN if (!use_mkldnn_bfloat16_) { + passes_.push_back("fc_mkldnn_pass"); + passes_.push_back("fc_act_mkldnn_fuse_pass"); + passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass"); + passes_.push_back("cpu_bfloat16_placement_pass"); passes_.push_back("cpu_bfloat16_pass"); passes_.push_back("cpu_quantize_squash_pass"); diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index c41968dc58590..24e76598e400b 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -15,6 +15,7 @@ #include #include + #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/backends/cpu/forwards.h" diff --git a/paddle/fluid/inference/capi/CMakeLists.txt b/paddle/fluid/inference/capi/CMakeLists.txt index 32f780122bcd6..73ba41607aae8 100644 --- a/paddle/fluid/inference/capi/CMakeLists.txt +++ b/paddle/fluid/inference/capi/CMakeLists.txt @@ -15,15 +15,22 @@ set(C_API_SRCS pd_config.cc pd_predictor.cc pd_tensor.cc c_api.cc) -cc_library(paddle_inference_c SRCS ${C_API_SRCS} DEPS paddle_inference) +cc_library( + paddle_inference_c + SRCS ${C_API_SRCS} + DEPS paddle_inference) if(NOT ON_INFER) - return() + return() endif() # Create inference capi shared library -cc_library(paddle_inference_c_shared SHARED SRCS ${C_API_SRCS} DEPS paddle_inference) -set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME paddle_inference_c) +cc_library( + paddle_inference_c_shared SHARED + SRCS ${C_API_SRCS} + DEPS paddle_inference) +set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME + paddle_inference_c) if(WIN32) - target_link_libraries(paddle_inference_c_shared shlwapi.lib) + target_link_libraries(paddle_inference_c_shared shlwapi.lib) endif() diff --git a/paddle/fluid/inference/capi/c_api.cc b/paddle/fluid/inference/capi/c_api.cc index 07493c742c4fa..f2a9838f4bc7d 100644 --- a/paddle/fluid/inference/capi/c_api.cc +++ b/paddle/fluid/inference/capi/c_api.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/capi/c_api_internal.h b/paddle/fluid/inference/capi/c_api_internal.h index 7e69b7210768e..11728fb9878fc 100644 --- a/paddle/fluid/inference/capi/c_api_internal.h +++ b/paddle/fluid/inference/capi/c_api_internal.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index 9bb52ba578025..2bacc94c0d118 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc index 12d7f78e169cc..e88fbfc5a86a3 100644 --- a/paddle/fluid/inference/capi/pd_predictor.cc +++ b/paddle/fluid/inference/capi/pd_predictor.cc @@ -19,6 +19,7 @@ #include #include #include + #include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" diff --git a/paddle/fluid/inference/capi/pd_tensor.cc b/paddle/fluid/inference/capi/pd_tensor.cc index 9b1eedd7c5a81..199db92d1b0d3 100644 --- a/paddle/fluid/inference/capi/pd_tensor.cc +++ b/paddle/fluid/inference/capi/pd_tensor.cc @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt index 521d24329d464..e35e14a0c0241 100644 --- a/paddle/fluid/inference/capi_exp/CMakeLists.txt +++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt @@ -15,15 +15,22 @@ set(C_API_SRCS pd_config.cc pd_predictor.cc pd_tensor.cc pd_utils.cc) -cc_library(paddle_inference_c SRCS ${C_API_SRCS} DEPS paddle_inference) +cc_library( + paddle_inference_c + SRCS ${C_API_SRCS} + DEPS paddle_inference) if(NOT ON_INFER) - return() + return() endif() # Create inference capi shared library -cc_library(paddle_inference_c_shared SHARED SRCS ${C_API_SRCS} DEPS paddle_inference) -set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME paddle_inference_c) +cc_library( + paddle_inference_c_shared SHARED + SRCS ${C_API_SRCS} + DEPS paddle_inference) +set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME + paddle_inference_c) if(WIN32) - target_link_libraries(paddle_inference_c_shared shlwapi.lib) + target_link_libraries(paddle_inference_c_shared shlwapi.lib) endif() diff --git a/paddle/fluid/inference/capi_exp/lod_demo.cc b/paddle/fluid/inference/capi_exp/lod_demo.cc index 2b049e992e71d..c67d6f870bdd9 100644 --- a/paddle/fluid/inference/capi_exp/lod_demo.cc +++ b/paddle/fluid/inference/capi_exp/lod_demo.cc @@ -27,8 +27,10 @@ #include #include #include + #include #include + #include "paddle/fluid/inference/capi_exp/pd_inference_api.h" int main(int argc, char *argv[]) { diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc index d7b07652babbd..4e1c5a2a0ddd0 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.cc +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/capi_exp/pd_config.h" + #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/capi_exp/pd_types.h" #include "paddle/fluid/inference/capi_exp/utils_internal.h" @@ -303,13 +304,13 @@ void PD_ConfigDisableTensorRtOPs(__pd_keep PD_Config* pd_config, size_t ops_num, config->Exp_DisableTensorRtOPs(ops_list); } -void PD_ConfigEnableTensorRtOSS(__pd_keep PD_Config* pd_config) { +void PD_ConfigEnableVarseqlen(__pd_keep PD_Config* pd_config) { CHECK_AND_CONVERT_PD_CONFIG; - config->EnableTensorRtOSS(); + config->EnableVarseqlen(); } PD_Bool PD_ConfigTensorRtOssEnabled(__pd_keep PD_Config* pd_config) { CHECK_AND_CONVERT_PD_CONFIG; - return config->tensorrt_oss_enabled(); + return config->tensorrt_varseqlen_enabled(); } void PD_ConfigEnableTensorRtDla(__pd_keep PD_Config* pd_config, diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h index f6b754cad213f..667843520d686 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.h +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -432,7 +432,7 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigDisableTensorRtOPs( /// /// \param[in] pd_onfig config /// -PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtOSS( +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableVarseqlen( __pd_keep PD_Config* pd_config); /// /// \brief A boolean state telling whether to use the TensorRT OSS. diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.cc b/paddle/fluid/inference/capi_exp/pd_predictor.cc index 5ca58b0e4138b..c85dfdf522e67 100644 --- a/paddle/fluid/inference/capi_exp/pd_predictor.cc +++ b/paddle/fluid/inference/capi_exp/pd_predictor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/capi_exp/pd_predictor.h" + #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/capi_exp/pd_types.h" #include "paddle/fluid/inference/capi_exp/pd_utils.h" diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.cc b/paddle/fluid/inference/capi_exp/pd_tensor.cc index 9c661dea6f2bb..520cfa813f47e 100644 --- a/paddle/fluid/inference/capi_exp/pd_tensor.cc +++ b/paddle/fluid/inference/capi_exp/pd_tensor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/capi_exp/pd_tensor.h" + #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/capi_exp/pd_types.h" #include "paddle/fluid/inference/capi_exp/pd_utils.h" diff --git a/paddle/fluid/inference/capi_exp/pd_utils.cc b/paddle/fluid/inference/capi_exp/pd_utils.cc index efca350fbaf49..7942a860c4ee8 100644 --- a/paddle/fluid/inference/capi_exp/pd_utils.cc +++ b/paddle/fluid/inference/capi_exp/pd_utils.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/inference/capi_exp/pd_utils.h" + #include #include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/capi_exp/pd_utils.h" #include "paddle/fluid/inference/capi_exp/utils_internal.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/experimental/javaapi/CMakeLists.txt b/paddle/fluid/inference/experimental/javaapi/CMakeLists.txt index 5b66d1de91917..fc4a3c408dfe2 100644 --- a/paddle/fluid/inference/experimental/javaapi/CMakeLists.txt +++ b/paddle/fluid/inference/experimental/javaapi/CMakeLists.txt @@ -1,5 +1,6 @@ include_directories($ENV{jni_path} $ENV{jni_sub_path} $ENV{paddle_path}) -find_library(PADDLE_INFERENCE_C libpaddle_inference_c.so HINTS $ENV{paddle_inference_lib}) +find_library(PADDLE_INFERENCE_C libpaddle_inference_c.so + HINTS $ENV{paddle_inference_lib}) aux_source_directory(native JNI_SRCS) add_library(paddle_inference SHARED ${JNI_SRCS}) target_link_libraries(paddle_inference ${PADDLE_INFERENCE_C}) diff --git a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp index 593ba3cb51d8c..efea093fa245a 100644 --- a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp +++ b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp @@ -13,9 +13,10 @@ // limitations under the License. #include "com_baidu_paddle_inference_Config.h" + #include -#include "jni_convert_util.h" // NOLINT +#include "jni_convert_util.h" // NOLINT #include "pd_inference_api.h" // NOLINT JNIEXPORT void JNICALL Java_com_baidu_paddle_inference_Config_cppConfigDestroy( diff --git a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Predictor.cpp b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Predictor.cpp index 7eff03690ae8e..0912c2ad57a68 100644 --- a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Predictor.cpp +++ b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Predictor.cpp @@ -13,7 +13,9 @@ // limitations under the License. #include "com_baidu_paddle_inference_Predictor.h" + #include + #include "jni_convert_util.h" // NOLINT #include "pd_inference_api.h" // NOLINT diff --git a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Tensor.cpp b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Tensor.cpp index b9be4a73ac2ce..a90ae165ebd51 100644 --- a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Tensor.cpp +++ b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Tensor.cpp @@ -13,7 +13,9 @@ // limitations under the License. #include "com_baidu_paddle_inference_Tensor.h" + #include + #include "pd_inference_api.h" // NOLINT JNIEXPORT void JNICALL Java_com_baidu_paddle_inference_Tensor_cppTensorDestroy( diff --git a/paddle/fluid/inference/experimental/javaapi/native/jni_convert_util.h b/paddle/fluid/inference/experimental/javaapi/native/jni_convert_util.h index 0026ec2f4102c..c363559298f18 100644 --- a/paddle/fluid/inference/experimental/javaapi/native/jni_convert_util.h +++ b/paddle/fluid/inference/experimental/javaapi/native/jni_convert_util.h @@ -17,6 +17,7 @@ #include #include + #include #include @@ -54,8 +55,8 @@ inline jstring cpp_string_to_jstring(JNIEnv *env, std::string str) { reinterpret_cast(data)); jstring encoding = env->NewStringUTF("UTF-8"); - jstring res = (jstring)( - env->NewObject(strClass, strClassInitMethodID, bytes, encoding)); + jstring res = (jstring)(env->NewObject(strClass, strClassInitMethodID, bytes, + encoding)); env->DeleteLocalRef(strClass); env->DeleteLocalRef(encoding); diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go index 8f9f34c06b476..0aca2a1075fd3 100644 --- a/paddle/fluid/inference/goapi/config.go +++ b/paddle/fluid/inference/goapi/config.go @@ -500,8 +500,8 @@ func (config *Config) DisableTensorRtOPs(ops []string) { /// may be more high-performance. Libnvinfer_plugin.so greater than /// V7.2.1 is needed. /// -func (config *Config) EnableTensorRtOSS() { - C.PD_ConfigEnableTensorRtOSS(config.c) +func (config *Config) EnableVarseqlen() { + C.PD_ConfigEnableVarseqlen(config.c) } /// diff --git a/paddle/fluid/inference/goapi/config_test.go b/paddle/fluid/inference/goapi/config_test.go index 297841dcbcf6c..080f2fd0135e5 100644 --- a/paddle/fluid/inference/goapi/config_test.go +++ b/paddle/fluid/inference/goapi/config_test.go @@ -54,7 +54,7 @@ func TestNewConfig(t *testing.T) { } config.SetTRTDynamicShapeInfo(minInputShape, maxInputShape, optInputShape, false) - config.EnableTensorRtOSS() + config.EnableVarseqlen() t.Logf("TensorrtOssEnabled:%+v", config.TensorrtOssEnabled()) config.EnableTensorRtDLA(0) @@ -138,4 +138,4 @@ func TestONNXRuntime(t *testing.T) { config.SetCpuMathLibraryNumThreads(4) t.Logf("CpuMathLibraryNumThreads:%+v", config.CpuMathLibraryNumThreads()) -} \ No newline at end of file +} diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h index 317ef9d93acf3..1106ad261ec41 100644 --- a/paddle/fluid/inference/io.h +++ b/paddle/fluid/inference/io.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/inference/lite/CMakeLists.txt b/paddle/fluid/inference/lite/CMakeLists.txt index 6d981d007e73a..7aa010cb0066c 100644 --- a/paddle/fluid/inference/lite/CMakeLists.txt +++ b/paddle/fluid/inference/lite/CMakeLists.txt @@ -2,8 +2,23 @@ if(XPU_SDK_ROOT) set(XPU_DEPS xpuapi xpurt) endif() -cc_library(lite_op_teller SRCS op_teller.cc DEPS ${LITE_DEPS} framework_proto device_context boost xxhash) -cc_library(lite_engine SRCS engine.cc DEPS ${LITE_DEPS} framework_proto ${XPU_DEPS}) -cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy ${LITE_DEPS} framework_proto boost device_context ${XPU_DEPS}) -cc_test(test_lite_engine SRCS test_engine_lite.cc DEPS lite_engine protobuf framework_proto glog gtest analysis) -cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils) +cc_library( + lite_op_teller + SRCS op_teller.cc + DEPS ${LITE_DEPS} framework_proto device_context boost xxhash) +cc_library( + lite_engine + SRCS engine.cc + DEPS ${LITE_DEPS} framework_proto ${XPU_DEPS}) +cc_library( + lite_tensor_utils + SRCS tensor_utils.cc + DEPS memcpy ${LITE_DEPS} framework_proto boost device_context ${XPU_DEPS}) +cc_test( + test_lite_engine + SRCS test_engine_lite.cc + DEPS lite_engine protobuf framework_proto glog gtest analysis) +cc_test( + test_lite_tensor_utils + SRCS test_tensor_utils.cc + DEPS lite_engine lite_tensor_utils) diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index cd78cfecd8635..8f8f68b170b62 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -25,6 +25,7 @@ #endif #include "paddle/fluid/inference/lite/engine.h" + #include namespace paddle { diff --git a/paddle/fluid/inference/lite/op_teller.cc b/paddle/fluid/inference/lite/op_teller.cc index 3a162c3fde13f..3d2ed0a5c9890 100644 --- a/paddle/fluid/inference/lite/op_teller.cc +++ b/paddle/fluid/inference/lite/op_teller.cc @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/inference/lite/op_teller.h" + #include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/lite/engine.h" -#include "paddle/fluid/inference/lite/op_teller.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/lite/op_teller.h b/paddle/fluid/inference/lite/op_teller.h index b9391a98a2ee3..1a969f1293dd2 100644 --- a/paddle/fluid/inference/lite/op_teller.h +++ b/paddle/fluid/inference/lite/op_teller.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/op_desc.h" namespace paddle { diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index eeaa128290339..f70455f18ebfd 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/inference/lite/tensor_utils.h" + #include #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/inference/lite/engine.h" @@ -26,9 +28,9 @@ namespace inference { namespace lite { namespace utils { -using paddle::lite_api::TargetType; -using paddle::lite_api::PrecisionType; using paddle::lite_api::DataLayoutType; +using paddle::lite_api::PrecisionType; +using paddle::lite_api::TargetType; template void SetLoD(DstLoD* dst, const SrcLoD& src) { diff --git a/paddle/fluid/inference/lite/test_engine_lite.cc b/paddle/fluid/inference/lite/test_engine_lite.cc index 85f7d3ee363a7..dee83f70ba2a2 100644 --- a/paddle/fluid/inference/lite/test_engine_lite.cc +++ b/paddle/fluid/inference/lite/test_engine_lite.cc @@ -14,14 +14,12 @@ #include -#include "paddle/fluid/inference/utils/singleton.h" - #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" - #include "paddle/fluid/inference/lite/engine.h" +#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/operators/lite/ut_helper.h" namespace paddle { @@ -29,9 +27,9 @@ namespace inference { namespace lite { using inference::lite::AddTensorToBlockDesc; -using paddle::inference::lite::AddFetchListToBlockDesc; using inference::lite::CreateTensor; using inference::lite::serialize_params; +using paddle::inference::lite::AddFetchListToBlockDesc; void make_fake_model(std::string* model, std::string* param) { framework::ProgramDesc program; diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc index b0c7c7448a50e..09a6cda62b352 100644 --- a/paddle/fluid/inference/lite/test_tensor_utils.cc +++ b/paddle/fluid/inference/lite/test_tensor_utils.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/lite/tensor_utils.h" @@ -21,9 +22,9 @@ namespace inference { namespace lite { namespace utils { -using paddle::lite_api::TargetType; -using paddle::lite_api::PrecisionType; using paddle::lite_api::DataLayoutType; +using paddle::lite_api::PrecisionType; +using paddle::lite_api::TargetType; TEST(LiteEngineOp, GetNativePlace) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index c713e3a66ac71..abd00ef9de67e 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,11 +1,27 @@ # Compiling with WITH_PYTHON=ON and WITH_TENSORRT=ON failed on windows. Temporarily add paddle_inference_api dependency to solve the problem if(WIN32) - nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost paddle_inference_api) + nv_library( + tensorrt_engine + SRCS engine.cc trt_int8_calibrator.cc + DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost + paddle_inference_api) else() - nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost) + nv_library( + tensorrt_engine + SRCS engine.cc trt_int8_calibrator.cc + DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost) endif() -nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto device_context boost) -nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) -nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) +nv_library( + tensorrt_op_teller + SRCS op_teller.cc + DEPS framework_proto device_context boost) +nv_test( + test_tensorrt + SRCS test_tensorrt.cc + DEPS dynload_cuda device_context dynamic_loader) +nv_test( + test_tensorrt_engine + SRCS test_engine.cc + DEPS dynload_cuda tensorrt_engine) add_subdirectory(plugin) add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 1910e2f6eb906..b27a584de2bfa 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,62 +1,70 @@ # Add TRT tests -nv_library(tensorrt_converter - SRCS matmul_op.cc - conv2d_op.cc - fc_op.cc - pool2d_op.cc - elementwise_op.cc - batch_norm_op.cc - activation_op.cc - unary_op.cc - softmax_op.cc - concat_op.cc - dropout_op.cc - group_norm_op.cc - pad_op.cc - split_op.cc - prelu_op.cc - leaky_relu_op.cc - gelu_op.cc - layer_norm_op.cc - multihead_matmul_op.cc - shuffle_channel_op.cc - swish_op.cc - instance_norm_op.cc - stack_op.cc - transpose_op.cc - flatten_op.cc - flatten_contiguous_range_op.cc - emb_eltwise_layernorm.cc - skip_layernorm.cc - scale_op.cc - slice_op.cc - hard_sigmoid_op.cc - hard_swish_op.cc - clip_op.cc - gather_op.cc - anchor_generator_op.cc - yolo_box_op.cc - yolo_box_head_op.cc - arg_max_op.cc - roi_align_op.cc - affine_channel_op.cc - multiclass_nms_op.cc - multiclass_nms3_op.cc - nearest_interp_op.cc - reshape_op.cc - reduce_op.cc - gather_nd_op.cc - tile_op.cc - conv3d_op.cc - mish_op.cc - nearest_interp_v2_op.cc - pool3d_op.cc - deformable_conv_op.cc - preln_emb_eltwise_layernorm.cc - strided_slice_op.cc - preln_skip_layernorm.cc - roll_op.cc - DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) +nv_library( + tensorrt_converter + SRCS matmul_op.cc + conv2d_op.cc + fc_op.cc + pool2d_op.cc + elementwise_op.cc + batch_norm_op.cc + activation_op.cc + unary_op.cc + softmax_op.cc + concat_op.cc + dropout_op.cc + group_norm_op.cc + pad_op.cc + split_op.cc + prelu_op.cc + leaky_relu_op.cc + gelu_op.cc + layer_norm_op.cc + multihead_matmul_op.cc + shuffle_channel_op.cc + swish_op.cc + instance_norm_op.cc + stack_op.cc + transpose_op.cc + flatten_op.cc + flatten_contiguous_range_op.cc + emb_eltwise_layernorm.cc + skip_layernorm.cc + scale_op.cc + slice_op.cc + hard_sigmoid_op.cc + hard_swish_op.cc + clip_op.cc + gather_op.cc + anchor_generator_op.cc + yolo_box_op.cc + yolo_box_head_op.cc + arg_max_op.cc + roi_align_op.cc + affine_channel_op.cc + multiclass_nms_op.cc + multiclass_nms3_op.cc + nearest_interp_op.cc + reshape_op.cc + reduce_op.cc + gather_nd_op.cc + tile_op.cc + conv3d_op.cc + mish_op.cc + nearest_interp_v2_op.cc + pool3d_op.cc + deformable_conv_op.cc + preln_emb_eltwise_layernorm.cc + strided_slice_op.cc + preln_skip_layernorm.cc + roll_op.cc + transformer_input_convert_op.cc + remove_padding_op.cc + recover_padding_op.cc + DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto + op_registry) -nv_test(test_op_converter SRCS test_op_converter.cc DEPS - paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter) +nv_test( + test_op_converter + SRCS test_op_converter.cc + DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_engine + tensorrt_converter) diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index b86351e394bd1..2ef8ec16c76df 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include "glog/logging.h" diff --git a/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc b/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc index 2bbe6ea3d2fa8..df6c601500c3b 100644 --- a/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 7a494860e6fa1..ffb32bab52296 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -30,23 +30,28 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { -#if IS_TRT_VERSION_GE(6000) VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); auto word_id_name = op_desc.Input("WordId").front(); - auto pos_id_name = op_desc.Input("PosId").front(); + auto pos_id_name = engine_->tensorrt_transformer_posid(); engine_->Set("ernie_pos_name", new std::string(pos_id_name)); auto sent_id_name = op_desc.Input("SentId").front(); + auto mask_id_name = engine_->tensorrt_transformer_maskid(); auto word_emb_name = op_desc.Input("WordEmbedding").front(); auto pos_emb_name = op_desc.Input("PosEmbedding").front(); auto sent_emb_name = op_desc.Input("SentEmbedding").front(); std::vector id_names; std::vector emb_names; + bool flag_varseqlen = + engine_->use_varseqlen() && pos_id_name != "" && mask_id_name != ""; - if (engine_->use_oss()) { + if (flag_varseqlen) { + engine_->SetITensor("word_id", engine_->GetITensor(word_id_name)); + engine_->SetITensor("pos_id", engine_->GetITensor(pos_id_name)); + engine_->SetITensor("mask_id", engine_->GetITensor(mask_id_name)); id_names = std::vector{word_id_name, pos_id_name, sent_id_name}; emb_names = @@ -106,7 +111,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; bool enable_int8 = op_desc.HasAttr("enable_int8"); - if (engine_->use_oss()) { + if (flag_varseqlen) { int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); if (enable_int8) { output_fp16 = 1; @@ -121,7 +126,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { output_fp16, 1, platform::errors::InvalidArgument( "Only Precision::KHalf(fp16) is supported when infering " - "ernie(bert) model with config.EnableTensorRtOSS(). " + "ernie(bert) model with config.EnableVarseqlen(). " "But Precision::KFloat32 is setted.")); const std::vector fields{ {"bert_embeddings_layernorm_beta", bias, @@ -159,8 +164,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { plugin_inputs.emplace_back( engine_->GetITensor(pos_id_name)); // cu_seqlens, // eval_placeholder_2 - auto max_seqlen_tensor = - engine_->GetITensor(engine_->network()->getInput(3)->getName()); + auto max_seqlen_tensor = engine_->GetITensor(mask_id_name); auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor); nvinfer1::Dims shape_dim; @@ -193,8 +197,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { engine_->SetTensorDynamicRange(plugin_layer->getOutput(1), out_scale); } if (engine_->with_interleaved()) { - VLOG(4) - << "fused emb_eltwise_layernorm op: use_oss and with_interleaved"; + VLOG(4) << "fused emb_eltwise_layernorm op: use_varseqlen and " + "with_interleaved"; if (!enable_int8) { PADDLE_THROW( platform::errors::Fatal("use with_interleaved must be int8.")); @@ -229,12 +233,6 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, test_mode); } - -#else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); -#endif } }; diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index a631332dae360..bf3170dacc7df 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -250,8 +250,7 @@ class FcOpConverter : public OpConverter { } // If use tensorrt'oss, the x_dim and x_num_col_dims need change, and can // not add Shuffle layer in ernie's multihead. - if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 && - x_dim.d[3] == 1 && x_num_col_dims == 2) { + if (x_dim.nbDims == 4 && x_num_col_dims == 1) { if (enable_int8 || support_int8) { // add conv1x1 layer nvinfer1::DimsHW nv_ksize(1, 1); diff --git a/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc b/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc index e08f50833ed99..c293282b761d3 100644 --- a/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc @@ -50,10 +50,11 @@ class FlattenContiguousRangeOpConverter : public OpConverter { for (int i = 0, j = 0; i < dims; ++i) { if (start_axis <= i + 1 && i + 1 <= stop_axis) { int dim_i = input_dim.d[i]; - PADDLE_ENFORCE_GT(dim_i, 0, platform::errors::InvalidArgument( - "flatten_contiguous_range input dim " - "should be > 0, but got %d.", - dim_i)); + PADDLE_ENFORCE_GT(dim_i, 0, + platform::errors::InvalidArgument( + "flatten_contiguous_range input dim " + "should be > 0, but got %d.", + dim_i)); dim_prod *= dim_i; if (i + 1 == stop_axis) { flatten_dim.d[j++] = dim_prod; diff --git a/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc index 910a807d3626a..2a62f9009e209 100644 --- a/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc index b468518fa5a3c..02e9610ea1ec4 100644 --- a/paddle/fluid/inference/tensorrt/convert/io_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc @@ -13,15 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/io_converter.h" + #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace inference { namespace tensorrt { -using platform::is_gpu_place; using platform::is_cpu_place; +using platform::is_gpu_place; class DefaultIOConverter : public EngineIOConverter { public: @@ -49,8 +51,9 @@ class DefaultIOConverter : public EngineIOConverter { out, in.data(), size, cudaMemcpyHostToDevice, *stream_)); } else if (is_gpu_place(place)) { PADDLE_ENFORCE_EQ( - 0, cudaMemcpyAsync(out, in.data(), size, - cudaMemcpyDeviceToDevice, *stream_), + 0, + cudaMemcpyAsync(out, in.data(), size, cudaMemcpyDeviceToDevice, + *stream_), platform::errors::External( "cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error.")); } else { @@ -78,14 +81,16 @@ class DefaultIOConverter : public EngineIOConverter { "But out's memory_size = %u, max_size = %u.", size, max_size)); if (is_cpu_place(place)) { - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data(), in, size, - cudaMemcpyDeviceToHost, *stream_), + PADDLE_ENFORCE_EQ(0, + cudaMemcpyAsync(out->data(), in, size, + cudaMemcpyDeviceToHost, *stream_), platform::errors::External( "cudaMemcpyAsync(cudaMemcpyDeviceToHost) error.")); } else if (is_gpu_place(place)) { PADDLE_ENFORCE_EQ( - 0, cudaMemcpyAsync(out->data(), in, size, - cudaMemcpyDeviceToDevice, *stream_), + 0, + cudaMemcpyAsync(out->data(), in, size, + cudaMemcpyDeviceToDevice, *stream_), platform::errors::External( "cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error.")); } else { diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.h b/paddle/fluid/inference/tensorrt/convert/io_converter.h index 58c178028b8b2..3ff78a6dc7a3b 100644 --- a/paddle/fluid/inference/tensorrt/convert/io_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/utils/singleton.h" diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc index a968ea2a2c484..ae39267533928 100644 --- a/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc index b0d67a5bf90ca..d630f7e9967a7 100644 --- a/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index 4b4ad01f5674a..f06554e7ebb41 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -76,12 +76,14 @@ class MultiheadMatMulOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; auto output_name = op_desc.Output("Out")[0]; - + bool flag_varseqlen = engine_->use_varseqlen() && + engine_->tensorrt_transformer_posid() != "" && + engine_->tensorrt_transformer_maskid() != ""; if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { + if (flag_varseqlen) { if (engine_->precision() == AnalysisConfig::Precision::kFloat32) { PADDLE_THROW(platform::errors::Fatal( - "use use_oss must be int8 or half, not float32.")); + "use use_varseqlen must be int8 or half, not float32.")); } nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT, static_cast(weight_data), @@ -90,7 +92,8 @@ class MultiheadMatMulOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_t->numel())}; if (engine_->with_interleaved()) { - VLOG(4) << "fused multihead_matmul op: use_oss and with_interleaved"; + VLOG(4) << "fused multihead_matmul op: use_varseqlen and " + "with_interleaved"; if (!op_desc.HasAttr("Input_scale")) { PADDLE_THROW( platform::errors::Fatal("use with_interleaved must be int8.")); @@ -233,9 +236,6 @@ class MultiheadMatMulOpConverter : public OpConverter { BOOST_GET_CONST(float, op_desc.GetAttr("dp_probs")) / 127.0; } } - - auto mask_tensor = engine_->GetITensor("qkv_plugin_mask"); - auto creator = GetPluginRegistry()->getPluginCreator( "CustomQKVToContextPluginDynamic", "2"); assert(creator != nullptr); @@ -272,18 +272,10 @@ class MultiheadMatMulOpConverter : public OpConverter { std::vector plugin_inputs; plugin_inputs.emplace_back(fc_layer->getOutput(0)); - plugin_inputs.emplace_back(mask_tensor); - if (engine_->Has("ernie_pos_name")) { - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->Get("ernie_pos_name"))); - } else { - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network() - ->getInput(2) - ->getName())); // cu_seqlens, eval_placeholder_2 - } - auto max_seqlen_tensor = - engine_->GetITensor(engine_->network()->getInput(3)->getName()); + plugin_inputs.emplace_back(engine_->GetITensor("qkv_plugin_mask")); + plugin_inputs.emplace_back(engine_->GetITensor("pos_id")); + + auto max_seqlen_tensor = engine_->GetITensor("mask_id"); auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( engine_, Shuffle, *const_cast(max_seqlen_tensor)); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 0a99b12edc25c..077ba32ba89c1 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/scope.h" @@ -268,14 +269,16 @@ class OpConverter { } } engine->DeclareInput( - input, FluidDataType2TRT( - var->Proto()->type().lod_tensor().tensor().data_type()), + input, + FluidDataType2TRT( + var->Proto()->type().lod_tensor().tensor().data_type()), Vec2TRT_Dims(input_shape, input, true)); #endif } else { engine->DeclareInput( - input, FluidDataType2TRT( - var->Proto()->type().lod_tensor().tensor().data_type()), + input, + FluidDataType2TRT( + var->Proto()->type().lod_tensor().tensor().data_type()), Vec2TRT_Dims(var_shape, input)); } } diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc index 87fdbb71a3faf..4ee8db7c69d62 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc @@ -32,7 +32,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { #if IS_TRT_VERSION_GE(7000) VLOG(4) << "convert fluid PrelnEmbEltwiseLayerNorm op to tensorrt layer"; - if (!(engine_->use_oss() && engine_->with_interleaved())) { + if (!(engine_->use_varseqlen() && engine_->with_interleaved())) { PADDLE_THROW(platform::errors::Fatal( "PrelnErnie: If you want to use oss, must be with interleaved")); } diff --git a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc index 8053135cc452c..1e9aec29e347a 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc @@ -24,7 +24,7 @@ class PrelnSkipLayerNormOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { #if IS_TRT_VERSION_GE(7000) VLOG(4) << "convert fused preln_skip_layernorm op to tensorrt layer"; - if (!(engine_->use_oss() && engine_->with_interleaved())) { + if (!(engine_->use_varseqlen() && engine_->with_interleaved())) { PADDLE_THROW(platform::errors::Fatal( "PrelnErnie: If you want to use oss, must be with interleaved")); } @@ -60,7 +60,8 @@ class PrelnSkipLayerNormOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; - VLOG(4) << "fused preln_skip_layernorm op: use_oss and with_interleaved"; + VLOG(4) + << "fused preln_skip_layernorm op: use_varseqlen and with_interleaved"; auto creator = GetPluginRegistry()->getPluginCreator( "CustomSkipLayerNormPluginDynamic", "4"); diff --git a/paddle/fluid/inference/tensorrt/convert/recover_padding_op.cc b/paddle/fluid/inference/tensorrt/convert/recover_padding_op.cc new file mode 100644 index 0000000000000..8f996e1d0f8bc --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/recover_padding_op.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Recover padding of transformer'input. + */ +class RecoverPadding : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "Recover padding of transformer'output: VarSeqlen -> Padding."; + if (!engine_->with_dynamic_shape()) { + PADDLE_THROW(platform::errors::Fatal( + "recover_padding_op: If you want to use transformer, must " + "be with dynamic shape")); + } + + framework::OpDesc op_desc(op, nullptr); + /* + auto x_var_name = op_desc.Input(InputNames()).front(); + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + */ + auto input_name = op_desc.Input("Input").front(); + + std::cout << "input_name: " << input_name << std::endl; + + std::vector plugin_inputs; + plugin_inputs.push_back(engine_->GetITensor(input_name)); + plugin_inputs.push_back(engine_->GetITensor("pos_id")); + plugin_inputs.push_back(engine_->GetITensor("mask_id")); + int input_num = 3; + auto output_name = op_desc.Output("Out").front(); + + plugin::RecoverPaddingPlugin* plugin = new plugin::RecoverPaddingPlugin(); + nvinfer1::ILayer* layer = + engine_->AddDynamicPlugin(plugin_inputs.data(), input_num, plugin); + + RreplenishLayerAndOutput(layer, "recover_padding", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(recover_padding, RecoverPadding); diff --git a/paddle/fluid/inference/tensorrt/convert/remove_padding_op.cc b/paddle/fluid/inference/tensorrt/convert/remove_padding_op.cc new file mode 100644 index 0000000000000..49d5edbbd4e02 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/remove_padding_op.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Remove padding of transformer'input. + */ +class RemovePadding : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "Remove padding of transformer'input: Padding -> VarSeqlen"; + if (!engine_->with_dynamic_shape()) { + PADDLE_THROW(platform::errors::Fatal( + "remove_padding_op: If you want to use transformer, must " + "be with dynamic shape")); + } + + framework::OpDesc op_desc(op, nullptr); + auto input_name = op_desc.Input("Input").front(); + + std::vector plugin_inputs; + plugin_inputs.push_back(engine_->GetITensor(input_name)); + plugin_inputs.push_back(engine_->GetITensor("pos_id")); + plugin_inputs.push_back(engine_->GetITensor("word_id")); + size_t input_num = plugin_inputs.size(); + auto output_name = op_desc.Output("Out").front(); + + plugin::RemovePaddingPlugin* plugin = new plugin::RemovePaddingPlugin(); + nvinfer1::ILayer* layer = + engine_->AddDynamicPlugin(plugin_inputs.data(), input_num, plugin); + + RreplenishLayerAndOutput(layer, "remove_padding_op", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(remove_padding, RemovePadding); diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 831e117311771..6f65e27192319 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -52,10 +52,13 @@ class SkipLayerNormOpConverter : public OpConverter { bool enable_int8 = op_desc.HasAttr("enable_int8"); nvinfer1::ILayer* layer = nullptr; - - if (engine_->use_oss()) { + bool flag_varseqlen = engine_->use_varseqlen() && + engine_->tensorrt_transformer_posid() != "" && + engine_->tensorrt_transformer_maskid() != ""; + if (flag_varseqlen) { if (engine_->with_interleaved()) { - VLOG(4) << "fused skip_layernorm op: use_oss and with_interleaved"; + VLOG(4) + << "fused skip_layernorm op: use_varseqlen and with_interleaved"; if (!enable_int8) { PADDLE_THROW( platform::errors::Fatal("use with_interleaved must be int8.")); diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index dea9a1ec3d76d..fa6f488940365 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h" namespace paddle { namespace inference { @@ -74,47 +73,12 @@ class SliceOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { - if (engine_->use_oss() && engine_->with_ernie() && - input_dims.nbDims == 4) { - std::vector plugin_inputs; - if (engine_->with_interleaved()) { - auto* shuffler_slice = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); - nvinfer1::Permutation transpose_embed{2, 1, 0, 3}; - shuffler_slice->setSecondTranspose(transpose_embed); - engine_->SetTensorDynamicRange(shuffler_slice->getOutput(0), - out_scale); - shuffler_slice->setName( - ("SpecialSlice_interleaved: transpose: (Output: " + output_name + - ")") - .c_str()); - plugin_inputs.emplace_back(shuffler_slice->getOutput(0)); - } else { - plugin_inputs.emplace_back(input); - } - std::string pos_name; - if (engine_->Has("ernie_pos_name")) { - pos_name = engine_->Get("ernie_pos_name"); - } else { - // hard code for compatibility - pos_name = engine_->network()->getInput(2)->getName(); - } - plugin_inputs.emplace_back( - engine_->GetITensor(pos_name)); // cu_seqlens, eval_placeholder_2 - - // bool ban_fp16 = engine_->disable_trt_plugin_fp16(); - plugin::SpecialSlicePluginDynamic* plugin = - new plugin::SpecialSlicePluginDynamic(); - layer = engine_->AddDynamicPlugin(plugin_inputs.data(), - plugin_inputs.size(), plugin); - } else { - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - int decrease_axis = - decrease_axises.size() == 0 ? -1 : decrease_axises[0]; - plugin::SlicePluginDynamic* plugin = new plugin::SlicePluginDynamic( - starts, ends, axes, decrease_axis, with_fp16); - layer = engine_->AddDynamicPlugin(&input, 1, plugin); - } + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + int decrease_axis = decrease_axises.size() == 0 ? -1 : decrease_axises[0]; + plugin::SlicePluginDynamic* plugin = new plugin::SlicePluginDynamic( + starts, ends, axes, decrease_axis, with_fp16); + layer = engine_->AddDynamicPlugin(&input, 1, plugin); } else { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 46e6c18bfb8e3..66acee964cdbc 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 1ad82df41737c..7a034f2c166dd 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc index 92e34e48bdb29..caa9e9ee2898d 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc index 6c876964297f9..b1319312adfe0 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc index a856d14144469..0b9f4a5fd84db 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc index cf37739608763..2d77b9b32db2c 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc index 9c6ea51fe5a35..5221843db19d8 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc index 8134d389469cb..4647521dd32b0 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc index 8f91309a0a00d..a2fe32b75f3de 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/tensorrt/convert/io_converter.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc index f17e00de0eeb7..f7984dd0ab750 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc index c84c30255fa96..d2dbb7fb5920c 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc index 86cb7543d42da..35b8fe1ee6ad7 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc index f5ab6a9924931..96b14c4e40cb0 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 9bfae64fe80e3..9a4d4db3435a2 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" - #include // NOLINT #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc index ba35d7ddbb2f4..a8e36f827d8e3 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index 36f13262a73d7..b917aa865d28f 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -12,7 +12,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc index f2541ff7c0b5e..d71cf051972d1 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc index 3ebb51afdf44f..b5e640ea24412 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc index 9cd5e81141598..babe682ab4e48 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc index 3b6a4a80044eb..1d23aeedc5a8d 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc index 7a5a886affed3..94ca6f0ed4627 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc b/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc new file mode 100644 index 0000000000000..045a5d163ca51 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Convert Transformer Input(pos_id, max_seqlen). + */ +class TransformerInputConvert : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "Convert Transformer Input(pos_id, max_seqlen), use " + "transformer_input_convert_plugin"; + if (!engine_->with_dynamic_shape()) { + PADDLE_THROW(platform::errors::Fatal( + "transformer_input_convert_op: If you want to use transformer, must " + "be with dynamic shape")); + } + + framework::OpDesc op_desc(op, nullptr); + auto input_name = op_desc.Input("Input").front(); + auto* input = engine_->GetITensor(input_name); + int input_num = op_desc.Input("Input").size(); + + // tensorrt_subgraph_pass will rename tensor + // auto pos_id_name = op_desc.Output("PosId").front(); + // auto max_seqlen_name = op_desc.Output("MaxSeqlen").front(); + auto pos_id_name = "pos_id_tensor"; + auto max_seqlen_name = "max_seqlen_tensor"; + + plugin::TransformerInputConvertPlugin* plugin = + new plugin::TransformerInputConvertPlugin(); + nvinfer1::ILayer* layer = + engine_->AddDynamicPlugin(&input, input_num, plugin); + + RreplenishLayerAndOutput(layer, "transformer_input_convert", + {pos_id_name, max_seqlen_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(transformer_input_convert, TransformerInputConvert); diff --git a/paddle/fluid/inference/tensorrt/convert/unary_op.cc b/paddle/fluid/inference/tensorrt/convert/unary_op.cc index aa3d38ebe2073..72d5cb2aeb4d3 100644 --- a/paddle/fluid/inference/tensorrt/convert/unary_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/unary_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include + #include "glog/logging.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc index 17d217dff43fd..f5ab63daa88df 100644 --- a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 00a6b2ffbf923..7f308fd3a04d5 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include #include "cuda_runtime_api.h" // NOLINT diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index f781cd0cb3a8d..b28fe827156c3 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include // NOLINT @@ -151,7 +152,7 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, std::string input, return dims; } } -} // NOLINT +} // namespace class TRTInt8Calibrator; @@ -410,14 +411,19 @@ class TensorRTEngine { suffix_counter += 1; } - void SetUseOSS(bool use_oss) { use_oss_ = use_oss; } + void SetUseOSS(bool use_varseqlen) { use_varseqlen_ = use_varseqlen; } void SetUseDLA(bool use_dla) { use_dla_ = use_dla; } void SetDLACore(int dla_core) { dla_core_ = dla_core; } void SetWithErnie(bool with_ernie) { with_ernie_ = with_ernie; } void SetWithInterleaved(bool with_interleaved) { with_interleaved_ = with_interleaved; } - + void SetTransformerPosid(std::string tensorrt_transformer_posid) { + tensorrt_transformer_posid_ = tensorrt_transformer_posid; + } + void SetTransformerMaskid(std::string tensorrt_transformer_maskid) { + tensorrt_transformer_maskid_ = tensorrt_transformer_maskid; + } void ClearWeights() { for (auto& weight_pair : weight_map) { weight_pair.second.reset(nullptr); @@ -488,9 +494,15 @@ class TensorRTEngine { return ret; } - bool use_oss() { return use_oss_; } + bool use_varseqlen() { return use_varseqlen_; } bool with_ernie() { return with_ernie_; } bool with_interleaved() { return with_interleaved_; } + std::string tensorrt_transformer_posid() { + return tensorrt_transformer_posid_; + } + std::string tensorrt_transformer_maskid() { + return tensorrt_transformer_maskid_; + } bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; } bool with_dynamic_shape() { return with_dynamic_shape_; } AnalysisConfig::Precision precision() { return precision_; } @@ -612,11 +624,13 @@ class TensorRTEngine { ShapeMapType max_input_shape_; ShapeMapType optim_input_shape_; bool disable_trt_plugin_fp16_{false}; - bool use_oss_{false}; + bool use_varseqlen_{false}; bool use_dla_{false}; int dla_core_{0}; bool with_ernie_{false}; bool with_interleaved_{false}; + std::string tensorrt_transformer_posid_; + std::string tensorrt_transformer_maskid_; nvinfer1::ILogger& logger_; // max data size for the buffers. diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index b8051d8610442..e283000cdace5 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -17,9 +17,11 @@ #include #include #include + #include #include #include + #include "paddle/fluid/platform/dynload/tensorrt.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 690bc173c77cf..dc7c77bc66acf 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/op_teller.h" + #include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/data_layout.h" @@ -125,7 +127,10 @@ struct SimpleOpTypeSetTeller : public Teller { "strided_slice", "fused_preln_embedding_eltwise_layernorm", "roll", - "preln_skip_layernorm"}; + "preln_skip_layernorm", + "transformer_input_convert", + "recover_padding", + "remove_padding"}; std::unordered_set teller_set{ "mul", "matmul", @@ -194,7 +199,10 @@ struct SimpleOpTypeSetTeller : public Teller { "fused_preln_embedding_eltwise_layernorm", "preln_skip_layernorm", "roll", - "multiclass_nms3"}; + "multiclass_nms3", + "transformer_input_convert", + "recover_padding", + "remove_padding"}; }; bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h index 0a0cbeae51b02..40f1a0055c78b 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.h +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/inference/tensorrt/engine.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index ff6a1cd60f720..0377c82838bdd 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,20 +1,35 @@ -nv_library(tensorrt_plugin - SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu - prelu_op_plugin.cu gelu_op_plugin.cu - pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu - instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu - qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu - hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu - anchor_generator_op_plugin.cu - yolo_box_op_plugin.cu - yolo_box_head_op_plugin.cu - roi_align_op_plugin.cu - gather_nd_op_plugin.cu - mish_op_plugin.cu - pool3d_op_plugin.cu - deformable_conv_op_plugin.cu - matmul_op_int8_plugin.cu - DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) +nv_library( + tensorrt_plugin + SRCS trt_plugin.cc + split_op_plugin.cu + elementwise_op_plugin.cu + prelu_op_plugin.cu + gelu_op_plugin.cu + pool_op_plugin.cu + swish_op_plugin.cu + layer_norm_op_plugin.cu + instance_norm_op_plugin.cu + emb_eltwise_layernorm_plugin.cu + qkv_to_context_plugin.cu + skip_layernorm_op_plugin.cu + slice_op_plugin.cu + hard_swish_op_plugin.cu + stack_op_plugin.cu + anchor_generator_op_plugin.cu + yolo_box_op_plugin.cu + yolo_box_head_op_plugin.cu + roi_align_op_plugin.cu + gather_nd_op_plugin.cu + mish_op_plugin.cu + pool3d_op_plugin.cu + deformable_conv_op_plugin.cu + matmul_op_int8_plugin.cu + transformer_input_convert_plugin.cu + remove_padding_plugin.cu + recover_padding_plugin.cu + DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) -nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS - paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin) +nv_test( + test_split_plugin + SRCS test_split_plugin.cc + DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin) diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu index e5584f2658067..a339f880ac388 100644 --- a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu @@ -14,6 +14,7 @@ #include #include + #include #include diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu index 6128f8f0e4134..7ea664ded66f2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include #include @@ -88,9 +89,10 @@ DeformableConvPlugin::DeformableConvPlugin( dilations_.insert(dilations_.end(), dilations.cbegin(), dilations.cend()); PADDLE_ENFORCE_EQ(data_type_ == nvinfer1::DataType::kFLOAT || data_type_ == nvinfer1::DataType::kHALF, - true, platform::errors::InvalidArgument( - "The DeformableConv TRT Plugin's input type " - "should be float or half.")); + true, + platform::errors::InvalidArgument( + "The DeformableConv TRT Plugin's input type " + "should be float or half.")); PADDLE_ENFORCE_EQ( paddings_.size(), strides_.size(), platform::errors::InvalidArgument( @@ -124,9 +126,10 @@ DeformableConvPlugin::DeformableConvPlugin( output_dim_.insert(output_dim_.end(), output_dim.cbegin(), output_dim.cend()); PADDLE_ENFORCE_EQ(data_type_ == nvinfer1::DataType::kFLOAT || data_type_ == nvinfer1::DataType::kHALF, - true, platform::errors::InvalidArgument( - "The DeformableConv TRT Plugin's input type " - "should be float or half.")); + true, + platform::errors::InvalidArgument( + "The DeformableConv TRT Plugin's input type " + "should be float or half.")); PADDLE_ENFORCE_EQ( paddings_.size(), strides_.size(), platform::errors::InvalidArgument( @@ -363,13 +366,11 @@ __global__ void ModulatedDeformableIm2colGpuKernel( const float* data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const float* data_offset_ptr = - data_offset + - (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * - kernel_w * height_col * width_col; + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; const float* data_mask_ptr = - data_mask + - (b_col * deformable_group + deformable_group_index) * kernel_h * - kernel_w * height_col * width_col; + data_mask + (b_col * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { @@ -432,13 +433,11 @@ __global__ void ModulatedDeformableIm2colGpuKernel( const half* data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const half* data_offset_ptr = - data_offset + - (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * - kernel_w * height_col * width_col; + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; const half* data_mask_ptr = - data_mask + - (b_col * deformable_group + deformable_group_index) * kernel_h * - kernel_w * height_col * width_col; + data_mask + (b_col * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 1070a88cee737..5f4abee2838f7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h" namespace paddle { @@ -67,14 +68,16 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data, nvinfer1::Dims ElementWisePlugin::getOutputDimensions( int index, const nvinfer1::Dims *input_dims, int num_inputs) TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "There is only one output in TRT elementwise " - "op plugin, but got output index: %d.", - index)); - PADDLE_ENFORCE_EQ(num_inputs, 2, platform::errors::InvalidArgument( - "There are 2 inputs in TRT elementwise " - "op plugin, but got input number: %d.", - num_inputs)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "There is only one output in TRT elementwise " + "op plugin, but got output index: %d.", + index)); + PADDLE_ENFORCE_EQ( + num_inputs, 2, + platform::errors::InvalidArgument("There are 2 inputs in TRT elementwise " + "op plugin, but got input number: %d.", + num_inputs)); PADDLE_ENFORCE_NOT_NULL( input_dims, platform::errors::InvalidArgument( diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h index aa1ab5389a572..51fc1bebd90be 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu index 82f4420a2a04c..6c7530cdc1f05 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu @@ -13,9 +13,11 @@ // limitations under the License. #include + #include #include // NOLINT #include + #include "glog/logging.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" @@ -253,10 +255,11 @@ nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType( int index, const nvinfer1::DataType *input_types, int nb_inputs) const TRT_NOEXCEPT { PADDLE_ENFORCE_EQ( - index, 0, platform::errors::InvalidArgument( - "The EmbEltwiseLayernorm Plugin only has one input, so the " - "index value should be 0, but get %d.", - index)); + index, 0, + platform::errors::InvalidArgument( + "The EmbEltwiseLayernorm Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); if (with_fp16_) return nvinfer1::DataType::kHALF; else diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h index 841fb2f6fe399..f27b66b03f544 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h @@ -15,9 +15,11 @@ #pragma once #include + #include #include #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu index 08b259e0f952e..cba1bb04c3654 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu @@ -15,6 +15,7 @@ #include #include #include + #include "paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h" #include "paddle/fluid/platform/float16.h" @@ -112,15 +113,15 @@ int GeluPlugin::enqueue(int batch_size, const void* const* inputs, VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp32"; const float* input = static_cast(inputs[0]); float* output = static_cast(outputs[0]); - gelu_kernel<<>>( - kA, num, input, output); + gelu_kernel + <<>>(kA, num, input, output); } else if (type == nvinfer1::DataType::kHALF) { VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp16"; const half* input = static_cast(inputs[0]); half* output = static_cast(outputs[0]); - no_exact_gelu_kernel<<>>( - kAT, kBT, kCT, num, input, output); + no_exact_gelu_kernel + <<>>(kAT, kBT, kCT, num, input, + output); } else { PADDLE_THROW(platform::errors::InvalidArgument( "The Gelu TRT Plugin's input type should be float or half.")); @@ -170,10 +171,11 @@ bool GeluPluginDynamic::supportsFormatCombination( nvinfer1::DataType GeluPluginDynamic::getOutputDataType( int index, const nvinfer1::DataType* input_types, int nb_inputs) const TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The Gelu Plugin only has one input, so the " - "index value should be 0, but get %d.", - index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The Gelu Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); return input_types[0]; } @@ -192,15 +194,15 @@ int GeluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp32"; const float* input = static_cast(inputs[0]); float* output = static_cast(outputs[0]); - gelu_kernel<<>>( - kA, num, input, output); + gelu_kernel + <<>>(kA, num, input, output); } else if (input_type == nvinfer1::DataType::kHALF) { VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp16"; const half* input = static_cast(inputs[0]); half* output = static_cast(outputs[0]); - no_exact_gelu_kernel<<>>( - kAT, kBT, kCT, num, input, output); + no_exact_gelu_kernel + <<>>(kAT, kBT, kCT, num, input, + output); } else { PADDLE_THROW(platform::errors::InvalidArgument( "The Gelu TRT Plugin's input type should be float or half.")); diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h index 7efdd2798b264..8436ccad78a2c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h @@ -14,9 +14,11 @@ #pragma once #include + #include #include #include + #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu index 9872b1ff8d957..05ed76bd3c983 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h index 475c908c13bbf..b1e693799bd77 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h @@ -14,9 +14,11 @@ #pragma once #include + #include #include #include + #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu index 03686aefc1370..9acd688f707a3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu @@ -13,8 +13,10 @@ // limitations under the License. #include + #include #include + #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 67d44184a76d0..16e2a284d4bf2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -13,8 +13,10 @@ // limitations under the License. #include + #include #include + #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h" #include "paddle/phi/kernels/layer_norm_kernel.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h index 9e8ce30283373..42dfa2b8aa02b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/tensorrt/engine.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h index be8f1c418fc7f..9ca6ff29240d4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h @@ -14,9 +14,9 @@ limitations under the License. */ #pragma once #include - #include #include + #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/platform/dynload/cublasLt.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu index 6e268e7b0b330..f655d23e62810 100644 --- a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h" @@ -38,11 +39,12 @@ bool MishPlugin::supportsFormat( nvinfer1::Dims MishPlugin::getOutputDimensions(int index, const nvinfer1::Dims* in_dims, int nb_inputs) TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(nb_inputs, 1, platform::errors::InvalidArgument( - "We expect [number of inputs] == 1" - "in TRT Mish op plugin, but got " - "[number of inputs] = %d.", - nb_inputs)); + PADDLE_ENFORCE_EQ( + nb_inputs, 1, + platform::errors::InvalidArgument("We expect [number of inputs] == 1" + "in TRT Mish op plugin, but got " + "[number of inputs] = %d.", + nb_inputs)); PADDLE_ENFORCE_LT(index, this->getNbOutputs(), platform::errors::InvalidArgument( "We expect [index] < [number of outputs]" @@ -123,14 +125,14 @@ int MishPlugin::enqueue(int batchSize, const void* const* inputs, VLOG(1) << "TRT Plugin DataType selected. Mish-->fp32"; const float* input = static_cast(inputs[0]); float* output = static_cast(outputs[0]); - mish_kernel<<>>(threshold_, num, - input, output); + mish_kernel + <<>>(threshold_, num, input, output); } else if (type == nvinfer1::DataType::kHALF) { VLOG(1) << "TRT Plugin DataType selected. Mish-->fp16"; const half* input = static_cast(inputs[0]); half* output = static_cast(outputs[0]); - mish_kernel<<>>(threshold_, num, - input, output); + mish_kernel + <<>>(threshold_, num, input, output); } else { PADDLE_THROW(platform::errors::InvalidArgument( "The Mish TRT Plugin's input type should be float or half.")); @@ -192,10 +194,11 @@ bool MishPluginDynamic::supportsFormatCombination( nvinfer1::DataType MishPluginDynamic::getOutputDataType( int index, const nvinfer1::DataType* input_types, int nb_inputs) const TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The Mish Plugin only has one input, so the " - "index value should be 0, but get %d.", - index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The Mish Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); return input_types[0]; } @@ -214,14 +217,14 @@ int MishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, VLOG(1) << "TRT Plugin DataType selected. Mish-->fp32"; const float* input = static_cast(inputs[0]); float* output = static_cast(outputs[0]); - mish_kernel<<>>(threshold_, num, - input, output); + mish_kernel + <<>>(threshold_, num, input, output); } else if (input_type == nvinfer1::DataType::kHALF) { VLOG(1) << "TRT Plugin DataType selected. Mish-->fp16"; const half* input = static_cast(inputs[0]); half* output = static_cast(outputs[0]); - mish_kernel<<>>(threshold_, num, - input, output); + mish_kernel + <<>>(threshold_, num, input, output); } else { PADDLE_THROW(platform::errors::InvalidArgument( "The Mish TRT Plugin's input type should be float or half.")); diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h index 75390666ea097..fdef7b93f32fd 100644 --- a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h @@ -14,8 +14,10 @@ #pragma once #include + #include #include + #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu index 5596a89a083fe..40cb2b88e711c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu @@ -70,10 +70,11 @@ nvinfer1::Dims Pool3DPlugin::getOutputDimensions( "The Pool3D Plugin only has one input, so the nbInputs " "value should be 1, but get %d.", nbInputs)); - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The Pool3D Plugin only has one input, so " - "the index value should be 0, but get %d.", - index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The Pool3D Plugin only has one input, so " + "the index value should be 0, but get %d.", + index)); PADDLE_ENFORCE_EQ(inputDims[0].nbDims, 4, platform::errors::InvalidArgument( "The Pool3D Plugin only has four Dimensions, so the " diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h index 7c9a8625d70f3..d54ce067e5ef3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h @@ -14,9 +14,11 @@ #pragma once #include + #include #include #include + #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 9bfe98d759d8e..80f7e349dac4a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -240,10 +240,11 @@ bool PoolPluginDynamic::supportsFormatCombination( nvinfer1::DataType PoolPluginDynamic::getOutputDataType( int index, const nvinfer1::DataType *input_types, int nb_inputs) const TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The Pool Plugin only has one input, so the " - "index value should be 0, but get %d.", - index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The Pool Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT), true, platform::errors::InvalidArgument( "The input type should be half or float")); diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h index d1bf2cd02e84f..155d69cc45784 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h @@ -14,9 +14,11 @@ #pragma once #include + #include #include #include + #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu index 1ea2b8b5f6ec4..72c1d546e9a2e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu @@ -144,10 +144,11 @@ bool PReluPluginDynamic::supportsFormatCombination( nvinfer1::DataType PReluPluginDynamic::getOutputDataType( int index, const nvinfer1::DataType *input_types, int nb_inputs) const TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The PRelu Plugin only has one input, so the " - "index value should be 0, but get %d.", - index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The PRelu Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT), true, platform::errors::InvalidArgument( "The input type should be half or float")); diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h index e0a77de6f5491..0025e1ee5b436 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h @@ -17,9 +17,9 @@ #include #include #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" - #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index e2f1aab9b6460..d3da5d7225d33 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -13,9 +13,11 @@ // limitations under the License. #include + #include #include // NOLINT #include + #include "glog/logging.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" @@ -103,8 +105,8 @@ inline void TransposeQKV(const int batch, const int seq_len, platform::errors::InvalidArgument( "head_num (%d) * head_size (%d) should <= %d", head_num, head_size, 1024)); - TransposeQkvKernel<<>>(head_size, input, - output); + TransposeQkvKernel + <<>>(head_size, input, output); } } @@ -142,8 +144,8 @@ inline void TransposeQKV(const int batch, const int seq_len, platform::errors::InvalidArgument( "head_num (%d) * head_size (%d) should <= %d", head_num, head_size, 1024)); - TransposeQkvKernel<<>>(head_size, input, - output); + TransposeQkvKernel + <<>>(head_size, input, output); } } @@ -218,10 +220,11 @@ nvinfer1::DataType QkvToContextPluginDynamic::getOutputDataType( int index, const nvinfer1::DataType *input_types, int nb_inputs) const TRT_NOEXCEPT { PADDLE_ENFORCE_EQ( - index, 0, platform::errors::InvalidArgument( - "The EmbEltwiseLayernorm Plugin only has one input, so the " - "index value should be 0, but get %d.", - index)); + index, 0, + platform::errors::InvalidArgument( + "The EmbEltwiseLayernorm Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); return input_types[0]; } diff --git a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu new file mode 100644 index 0000000000000..515e01f40538c --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu @@ -0,0 +1,120 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +__global__ void RecoverPaddingKernel(const float* input0, const int32_t* input1, + float* output) { + int word_id = blockIdx.x * gridDim.y + blockIdx.y; + int32_t seqence_length = input1[blockIdx.x + 1] - input1[blockIdx.x]; + if (blockIdx.y < seqence_length) { + output[word_id * gridDim.z * blockDim.x + blockIdx.z * blockDim.x + + threadIdx.x] = + input0[(input1[blockIdx.x] + blockIdx.y) * gridDim.z * blockDim.x + + blockIdx.z * blockDim.x + threadIdx.x]; + } else { + output[word_id * gridDim.z * blockDim.x + blockIdx.z * blockDim.x + + threadIdx.x] = 0; + } +} + +nvinfer1::DataType RecoverPaddingPlugin::getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { + return input_types[0]; +} + +nvinfer1::DimsExprs RecoverPaddingPlugin::getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT { + nvinfer1::DimsExprs output_dims{}; + output_dims.nbDims = 3; + const auto* one = exprBuilder.constant(1); + output_dims.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kSUB, + *inputs[1].d[0], *one); + output_dims.d[1] = inputs[2].d[1]; + output_dims.d[2] = inputs[0].d[1]; + return output_dims; +} + +bool RecoverPaddingPlugin::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(nbInputs, 3, + platform::errors::InvalidArgument("Must have 3 inputs, " + "but got %d input(s). ", + nbInputs)); + PADDLE_ENFORCE_EQ(nbOutputs, getNbOutputs(), + platform::errors::InvalidArgument("Must have 1 output, " + "but got %d output(s). ", + nbOutputs)); + if (pos == 1) { // PosId, MaxSeqlen + return inOut[pos].type == nvinfer1::DataType::kINT32 && + inOut[pos].format == nvinfer1::TensorFormat::kLINEAR; + } + return inOut[pos].type == nvinfer1::DataType::kFLOAT && + inOut[pos].format == nvinfer1::TensorFormat::kLINEAR; + // return (inOut[pos].type == nvinfer1::DataType::kFLOAT && inOut[pos].format + // == nvinfer1::TensorFormat::kLINEAR)|| + // (inOut[pos].type == nvinfer1::DataType::kHALF && inOut[pos].format == + // nvinfer1::TensorFormat::kLINEAR)|| + // (inOut[pos].type == nvinfer1::DataType::kINT8 && inOut[pos].format == + // nvinfer1::TensorFormat::kCHW32); +} + +void RecoverPaddingPlugin::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT {} + +void RecoverPaddingPlugin::attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {} + +void RecoverPaddingPlugin::detachFromContext() TRT_NOEXCEPT {} + +void RecoverPaddingPlugin::terminate() TRT_NOEXCEPT {} + +int RecoverPaddingPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { + const auto input0_desc = inputDesc[0]; + const auto input1_desc = inputDesc[1]; + const auto input2_desc = inputDesc[2]; + const float* input0 = static_cast(inputs[0]); + const int32_t* input1 = + static_cast(inputs[1]); // pos_id_tensor + float* output = static_cast(outputs[0]); + const int32_t num_threads = 256; + const dim3 num_blocks( + input1_desc.dims.d[0] - 1, input2_desc.dims.d[1], + input0_desc.dims.d[1] / num_threads); // batchs, max sequnce length + // (mask_id.dims.d[1]), + // input.dims.d[1]/256 + RecoverPaddingKernel<<>>(input0, input1, + output); + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.h b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.h new file mode 100644 index 0000000000000..71b576610e25c --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.h @@ -0,0 +1,133 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class RecoverPaddingPlugin : public DynamicPluginTensorRT { + public: + RecoverPaddingPlugin() {} + + RecoverPaddingPlugin(void const* serial_data, size_t serial_length) {} + + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { + RecoverPaddingPlugin* ptr = new RecoverPaddingPlugin(); + return ptr; + } + + const char* getPluginType() const TRT_NOEXCEPT override { + return "recover_padding_plugin"; + } + + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + + int initialize() TRT_NOEXCEPT { return 0; } + void terminate() TRT_NOEXCEPT; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override { + return 0; + } + + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) + TRT_NOEXCEPT override; + + void detachFromContext() TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override { delete this; } + + protected: + size_t getSerializationSize() const TRT_NOEXCEPT override { return 0; } + + void serialize(void* buffer) const TRT_NOEXCEPT override {} +}; + +class RecoverPaddingPluginCreator : public nvinfer1::IPluginCreator { + public: + RecoverPaddingPluginCreator() {} + const char* getPluginName() const TRT_NOEXCEPT override { + return "recover_padding_plugin"; + } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* plugin_field) + TRT_NOEXCEPT override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, void const* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + RecoverPaddingPlugin* obj = + new RecoverPaddingPlugin(serial_data, serial_length); + obj->setPluginNamespace(name); + return obj; + } + + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const TRT_NOEXCEPT override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; +}; +REGISTER_TRT_PLUGIN_V2(RecoverPaddingPluginCreator); +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu new file mode 100644 index 0000000000000..84e36a4d5f638 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu @@ -0,0 +1,118 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +__global__ void RemovePaddingKernel(const float* input0, const int32_t* input1, + float* output) { + int word_id = blockIdx.x * gridDim.y + blockIdx.y; + int32_t seqence_length = input1[blockIdx.x + 1] - input1[blockIdx.x]; + if (blockIdx.y < seqence_length) { + output[(input1[blockIdx.x] + blockIdx.y) * gridDim.z * blockDim.x + + blockIdx.z * blockDim.x + threadIdx.x] = + input0[word_id * gridDim.z * blockDim.x + blockIdx.z * blockDim.x + + threadIdx.x]; + } +} + +nvinfer1::DataType RemovePaddingPlugin::getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { + return input_types[0]; +} + +nvinfer1::DimsExprs RemovePaddingPlugin::getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT { + nvinfer1::DimsExprs output_dims{}; + output_dims.nbDims = 4; + output_dims.d[0] = inputs[2].d[0]; + output_dims.d[1] = inputs[0].d[2]; + output_dims.d[2] = exprBuilder.constant(1); + output_dims.d[3] = exprBuilder.constant(1); + + return output_dims; +} + +bool RemovePaddingPlugin::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(nbInputs, 3, + platform::errors::InvalidArgument("Must have 3 inputs, " + "but got %d input(s). ", + nbInputs)); + PADDLE_ENFORCE_EQ(nbOutputs, getNbOutputs(), + platform::errors::InvalidArgument("Must have 1 output, " + "but got %d output(s). ", + nbOutputs)); + if (pos == 1 || pos == 2) { // pos_id, work_id + return inOut[pos].type == nvinfer1::DataType::kINT32 && + inOut[pos].format == nvinfer1::TensorFormat::kLINEAR; + } + return inOut[pos].type == nvinfer1::DataType::kFLOAT && + inOut[pos].format == nvinfer1::TensorFormat::kLINEAR; + // return (inOut[pos].type == nvinfer1::DataType::kFLOAT && inOut[pos].format + // == nvinfer1::TensorFormat::kLINEAR)|| + // (inOut[pos].type == nvinfer1::DataType::kHALF && inOut[pos].format == + // nvinfer1::TensorFormat::kLINEAR)|| + // (inOut[pos].type == nvinfer1::DataType::kINT8 && inOut[pos].format == + // nvinfer1::TensorFormat::kCHW32); +} + +void RemovePaddingPlugin::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT {} + +void RemovePaddingPlugin::attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {} + +void RemovePaddingPlugin::detachFromContext() TRT_NOEXCEPT {} + +void RemovePaddingPlugin::terminate() TRT_NOEXCEPT {} + +int RemovePaddingPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { + const auto input_desc = inputDesc[0]; + const float* input0 = static_cast(inputs[0]); + const int32_t* input1 = + static_cast(inputs[1]); // pos_id_tensor + float* output = static_cast(outputs[0]); + + const auto input0_desc = inputDesc[0]; + + const int32_t num_threads = 256; + const dim3 num_blocks( + input0_desc.dims.d[0], input0_desc.dims.d[1], + input0_desc.dims.d[2] / + num_threads); // batchs, max sequnce length, input.dims.d[2]/256 + + RemovePaddingKernel<<>>(input0, input1, + output); + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.h b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.h new file mode 100644 index 0000000000000..89fda3dd775c1 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.h @@ -0,0 +1,133 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class RemovePaddingPlugin : public DynamicPluginTensorRT { + public: + RemovePaddingPlugin() {} + + RemovePaddingPlugin(void const* serial_data, size_t serial_length) {} + + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { + RemovePaddingPlugin* ptr = new RemovePaddingPlugin(); + return ptr; + } + + const char* getPluginType() const TRT_NOEXCEPT override { + return "remove_padding_plugin"; + } + + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + + int initialize() TRT_NOEXCEPT { return 0; } + void terminate() TRT_NOEXCEPT; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override { + return 0; + } + + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) + TRT_NOEXCEPT override; + + void detachFromContext() TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override { delete this; } + + protected: + size_t getSerializationSize() const TRT_NOEXCEPT override { return 0; } + + void serialize(void* buffer) const TRT_NOEXCEPT override {} +}; + +class RemovePaddingPluginCreator : public nvinfer1::IPluginCreator { + public: + RemovePaddingPluginCreator() {} + const char* getPluginName() const TRT_NOEXCEPT override { + return "remove_padding_plugin"; + } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* plugin_field) + TRT_NOEXCEPT override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, void const* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + RemovePaddingPlugin* obj = + new RemovePaddingPlugin(serial_data, serial_length); + obj->setPluginNamespace(name); + return obj; + } + + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const TRT_NOEXCEPT override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; +}; +REGISTER_TRT_PLUGIN_V2(RemovePaddingPluginCreator); +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu index 7dc31fb44719a..7eded9e823e2e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -14,6 +14,7 @@ #include #include + #include #include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h" @@ -281,13 +282,12 @@ int RoiAlignPluginDynamic::enqueue_impl( width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, aligned_, static_cast(outputs[0])); } else { - GPUROIAlignOpt< - T, OutT, - false><<>>( - output_size, static_cast(inputs[0]), - static_cast(inputs[1]), spatial_scale_, channels, height, - width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, - aligned_, static_cast(outputs[0])); + GPUROIAlignOpt + <<>>( + output_size, static_cast(inputs[0]), + static_cast(inputs[1]), spatial_scale_, channels, height, + width, pooled_height_, pooled_width_, sampling_ratio_, + rois_num / batch, aligned_, static_cast(outputs[0])); } return cudaGetLastError() != cudaSuccess; diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu index fb14749f3d1db..e1527f85088ad 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu @@ -14,9 +14,11 @@ #include #include + #include #include // NOLINT #include + #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" @@ -105,8 +107,9 @@ nvinfer1::DataType SkipLayerNormPluginDynamic::getOutputDataType( index)); PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT || input_types[0] == nvinfer1::DataType::kHALF), - true, platform::errors::InvalidArgument( - "The input type should be half or float")); + true, + platform::errors::InvalidArgument( + "The input type should be half or float")); return input_types[0]; } diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu index 0a6d24f90722e..ad426204d5aa1 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu @@ -14,9 +14,11 @@ #include #include + #include #include // NOLINT #include + #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h" @@ -301,14 +303,16 @@ bool SlicePluginDynamic::supportsFormatCombination( nvinfer1::DataType SlicePluginDynamic::getOutputDataType( int index, const nvinfer1::DataType *input_types, int nb_inputs) const TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The Slice Plugin only has one input, so the " - "index value should be 0, but get %d.", - index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The Slice Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT || input_types[0] == nvinfer1::DataType::kHALF), - true, platform::errors::InvalidArgument( - "The input type should be half or float")); + true, + platform::errors::InvalidArgument( + "The input type should be half or float")); return input_types[0]; } diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu deleted file mode 100644 index 324e9c0392c93..0000000000000 --- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h" - -namespace paddle { -namespace inference { -namespace tensorrt { -namespace plugin { - -#if IS_TRT_VERSION_GE(6000) -SpecialSlicePluginDynamic::SpecialSlicePluginDynamic() {} - -SpecialSlicePluginDynamic::SpecialSlicePluginDynamic(void const* serial_data, - size_t serial_length) {} - -SpecialSlicePluginDynamic::~SpecialSlicePluginDynamic() {} - -nvinfer1::IPluginV2DynamicExt* SpecialSlicePluginDynamic::clone() const - TRT_NOEXCEPT { - return new SpecialSlicePluginDynamic(); -} - -const char* SpecialSlicePluginDynamic::getPluginType() const TRT_NOEXCEPT { - return "special_slice_plugin"; -} - -int SpecialSlicePluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -int SpecialSlicePluginDynamic::initialize() TRT_NOEXCEPT { return 0; } - -size_t SpecialSlicePluginDynamic::getSerializationSize() const TRT_NOEXCEPT { - size_t serialize_size = 0; - return serialize_size; -} - -void SpecialSlicePluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT {} - -nvinfer1::DimsExprs SpecialSlicePluginDynamic::getOutputDimensions( - int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { - nvinfer1::DimsExprs output(inputs[0]); - output.nbDims++; - for (int i = output.nbDims - 1; i > 1; i--) { - output.d[i] = inputs[0].d[i - 1]; - } - auto one = expr_builder.constant(1); - output.d[1] = one; - output.d[0] = expr_builder.operation(nvinfer1::DimensionOperation::kSUB, - *inputs[1].d[0], *one); - // remove padding 1 - output.nbDims -= 2; - - return output; -} - -void SpecialSlicePluginDynamic::configurePlugin( - const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT {} - -size_t SpecialSlicePluginDynamic::getWorkspaceSize( - const nvinfer1::PluginTensorDesc* inputs, int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const TRT_NOEXCEPT { - return 0; -} - -void SpecialSlicePluginDynamic::destroy() TRT_NOEXCEPT { delete this; } - -void SpecialSlicePluginDynamic::terminate() TRT_NOEXCEPT {} - -bool SpecialSlicePluginDynamic::supportsFormatCombination( - int pos, const nvinfer1::PluginTensorDesc* desc, int nb_inputs, - int nb_outputs) TRT_NOEXCEPT { - if (pos == 0) // slice tensor - return (desc[pos].type == nvinfer1::DataType::kHALF && - desc[pos].format == - nvinfer1::TensorFormat::kLINEAR); // || desc[pos].type == - // nvinfer1::DataType::kFLOAT); - - if (pos == 1) // cu_seqlen - return (desc[pos].type == nvinfer1::DataType::kINT32 && - desc[pos].format == nvinfer1::TensorFormat::kLINEAR); - - return (desc[pos].type == nvinfer1::DataType::kHALF && - desc[pos].format == - nvinfer1::TensorFormat::kLINEAR); // || desc[pos].type == - // nvinfer1::DataType::kFLOAT); -} - -nvinfer1::DataType SpecialSlicePluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType* input_types, - int nb_inputs) const TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be equal to 0")); - return input_types[0]; -} - -template -__global__ void SpecialSliceKernel(const T* slice_input, - const int32_t* cu_seqlens, T* output) { - const int hidden = blockDim.x * gridDim.x; - const int hidden_id = blockIdx.x * blockDim.x + threadIdx.x; - const int batch_id = blockIdx.y; - - output[batch_id * hidden + hidden_id] = - slice_input[cu_seqlens[batch_id] * hidden + hidden_id]; -} - -int SpecialSlicePluginDynamic::enqueue( - const nvinfer1::PluginTensorDesc* input_desc, - const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { - auto input_dims = input_desc[0].dims; // (sum(S), hidden, 1, 1) - auto out_dims = output_desc[0].dims; // (batch, hidden, 1, 1) - - PADDLE_ENFORCE_EQ( - input_desc[0].type, nvinfer1::DataType::kHALF, - platform::errors::InvalidArgument("Type of input should be half.")); - - const int32_t hidden = input_dims.d[1]; - PADDLE_ENFORCE_EQ(hidden % 128, 0, platform::errors::InvalidArgument( - "hidden should be multiple of 128.")); - - constexpr int num_threads = 128; - const half* slice_input = static_cast(inputs[0]); - const int32_t* cu_seqlens = static_cast(inputs[1]); - half* output = static_cast(outputs[0]); - - const int32_t num_blocks_x = hidden / num_threads; - const int32_t num_blocks_y = out_dims.d[0]; // batchs - const dim3 num_blocks(num_blocks_x, num_blocks_y); // blocks - - SpecialSliceKernel<<>>( - slice_input, cu_seqlens, output); - return cudaGetLastError() != cudaSuccess; -} - -SpecialSlicePluginDynamicCreator::SpecialSlicePluginDynamicCreator() {} - -const char* SpecialSlicePluginDynamicCreator::getPluginName() const - TRT_NOEXCEPT { - return "special_slice_plugin"; -} - -const char* SpecialSlicePluginDynamicCreator::getPluginVersion() const - TRT_NOEXCEPT { - return "1"; -} - -const nvinfer1::PluginFieldCollection* -SpecialSlicePluginDynamicCreator::getFieldNames() TRT_NOEXCEPT { - return &field_collection_; -} - -nvinfer1::IPluginV2* SpecialSlicePluginDynamicCreator::createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT { - return new SpecialSlicePluginDynamic(); -} - -nvinfer1::IPluginV2* SpecialSlicePluginDynamicCreator::deserializePlugin( - const char* name, const void* serial_data, - size_t serial_length) TRT_NOEXCEPT { - auto plugin = new SpecialSlicePluginDynamic(serial_data, serial_length); - return plugin; -} - -void SpecialSlicePluginDynamicCreator::setPluginNamespace( - const char* lib_namespace) TRT_NOEXCEPT { - plugin_namespace_ = lib_namespace; -} - -const char* SpecialSlicePluginDynamicCreator::getPluginNamespace() const - TRT_NOEXCEPT { - return plugin_namespace_.c_str(); -} - -#endif - -} // namespace plugin -} // namespace tensorrt -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h deleted file mode 100644 index c3521e4ed6371..0000000000000 --- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" - -namespace paddle { -namespace inference { -namespace tensorrt { -namespace plugin { - -#if IS_TRT_VERSION_GE(6000) -class SpecialSlicePluginDynamic : public DynamicPluginTensorRT { - public: - SpecialSlicePluginDynamic(); - SpecialSlicePluginDynamic(void const* serial_data, size_t serial_length); - ~SpecialSlicePluginDynamic(); - nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, - int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, - int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, - const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) TRT_NOEXCEPT override; - - nvinfer1::DataType getOutputDataType( - int index, const nvinfer1::DataType* inputTypes, - int nbInputs) const TRT_NOEXCEPT override; - - const char* getPluginType() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - int initialize() TRT_NOEXCEPT override; - void terminate() TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void* buffer) const TRT_NOEXCEPT override; - void destroy() TRT_NOEXCEPT override; - - private: - int axis_; - int num_stack_; -}; - -class SpecialSlicePluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - SpecialSlicePluginDynamicCreator(); - const char* getPluginName() const TRT_NOEXCEPT override; - const char* getPluginVersion() const TRT_NOEXCEPT override; - const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; - nvinfer1::IPluginV2* createPlugin(const char* name, - const nvinfer1::PluginFieldCollection* fc) - TRT_NOEXCEPT override; - nvinfer1::IPluginV2* deserializePlugin( - const char* name, const void* serial_data, - size_t serial_length) TRT_NOEXCEPT override; - void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override; - const char* getPluginNamespace() const TRT_NOEXCEPT override; - - private: - std::string plugin_namespace_; - nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; - std::vector plugin_attributes_; -}; -REGISTER_TRT_PLUGIN_V2(SpecialSlicePluginDynamicCreator); -#endif - -} // namespace plugin -} // namespace tensorrt -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index ec4fcca6d74d0..1cfc9fade7b15 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -13,7 +13,9 @@ // limitations under the License. #include + #include + #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 7a41fe1d1eef2..49f028493ee87 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -15,9 +15,11 @@ #pragma once #include + #include #include #include + #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu index 74a6c3cdf3e4e..1c6dae78b387d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu @@ -15,6 +15,7 @@ #include #include #include + #include "paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h" namespace paddle { @@ -128,8 +129,9 @@ bool StackPluginDynamic::supportsFormatCombination( nvinfer1::DataType StackPluginDynamic::getOutputDataType( int index, const nvinfer1::DataType* input_types, int nb_inputs) const TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be equal to 0")); + PADDLE_ENFORCE_EQ( + index, 0, + platform::errors::InvalidArgument("The index should be equal to 0")); return input_types[0]; } diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h index 965c53e269877..12beafdadb316 100644 --- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h @@ -14,9 +14,11 @@ #pragma once #include + #include #include #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu index 2c2fad74b9a2d..1992dd57d68fe 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu @@ -13,8 +13,10 @@ // limitations under the License. #include + #include #include + #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h" @@ -181,10 +183,11 @@ bool SwishPluginDynamic::supportsFormatCombination( nvinfer1::DataType SwishPluginDynamic::getOutputDataType( int index, const nvinfer1::DataType *input_types, int nb_inputs) const TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The Swish Plugin only has one input, so the " - "index value should be 0, but get %d.", - index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The Swish Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); return input_types[0]; } @@ -203,8 +206,8 @@ int SwishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, VLOG(1) << "TRT Plugin DataType selected. Swish-->fp32"; const float *input = static_cast(inputs[0]); float *output = static_cast(outputs[0]); - swish_kernel<<>>(num, input, output, - beta_); + swish_kernel + <<>>(num, input, output, beta_); } else if (input_type == nvinfer1::DataType::kHALF) { VLOG(1) << "TRT Plugin DataType selected. Swish-->fp16"; const half *input = static_cast(inputs[0]); diff --git a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc index 46f585e655746..9cb680da5a95d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.cu new file mode 100644 index 0000000000000..a7fff02781609 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.cu @@ -0,0 +1,110 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +__global__ void TransformerInputConvertKernel(const int64_t* input, + int32_t* output0) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ int32_t shared_data; + if (threadIdx.x == static_cast(input[tid])) { + atomicAdd(&shared_data, 1); + } + output0[0] = 0; + output0[blockIdx.x + 1] = shared_data; + __syncthreads(); + for (int i = 0; i < blockDim.x; ++i) { + output0[i + 1] += output0[i]; + } +} + +nvinfer1::DataType TransformerInputConvertPlugin::getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { + return nvinfer1::DataType::kINT32; +} + +nvinfer1::DimsExprs TransformerInputConvertPlugin::getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT { + nvinfer1::DimsExprs output_dims{}; + output_dims.nbDims = 1; + if (outputIndex == 0) { // PosId + const auto* one = exprBuilder.constant(1); + output_dims.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[0], *one); + } else { // MaxSeqlen + output_dims.d[0] = inputs[0].d[1]; + } + return output_dims; +} + +bool TransformerInputConvertPlugin::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(nbInputs, 1, + platform::errors::InvalidArgument("Must have 1 inputs, " + "but got %d input(s). ", + nbInputs)); + PADDLE_ENFORCE_EQ(nbOutputs, getNbOutputs(), + platform::errors::InvalidArgument("Must have 2 output, " + "but got %d output(s). ", + nbOutputs)); + if (pos == 0) { // input + return inOut[pos].format == nvinfer1::TensorFormat::kLINEAR; + } else { // output0, output1 + return inOut[pos].type == nvinfer1::DataType::kINT32 && + inOut[pos].format == nvinfer1::TensorFormat::kLINEAR; + } +} + +void TransformerInputConvertPlugin::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT {} + +void TransformerInputConvertPlugin::attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {} + +void TransformerInputConvertPlugin::detachFromContext() TRT_NOEXCEPT {} + +void TransformerInputConvertPlugin::terminate() TRT_NOEXCEPT {} + +int TransformerInputConvertPlugin::enqueue( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { + const auto input_desc = inputDesc[0]; + const int64_t* input = static_cast(inputs[0]); + int32_t* output0 = static_cast(outputs[0]); // PosId + // int32_t* output1 = static_cast(outputs[1]); // MaxSeqlen + + const int32_t num_blocks = input_desc.dims.d[0]; // batchs + const int32_t num_threads = input_desc.dims.d[1]; // max sequnce length + + TransformerInputConvertKernel<<>>( + input, output0); + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.h b/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.h new file mode 100644 index 0000000000000..92aa0c48a49ce --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.h @@ -0,0 +1,134 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class TransformerInputConvertPlugin : public DynamicPluginTensorRT { + public: + TransformerInputConvertPlugin() {} + + TransformerInputConvertPlugin(void const* serial_data, size_t serial_length) { + } + + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { + TransformerInputConvertPlugin* ptr = new TransformerInputConvertPlugin(); + return ptr; + } + + const char* getPluginType() const TRT_NOEXCEPT override { + return "transformer_input_convert_plugin"; + } + + int getNbOutputs() const TRT_NOEXCEPT override { return 2; } + + int initialize() TRT_NOEXCEPT { return 0; } + void terminate() TRT_NOEXCEPT; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override { + return 0; + } + + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) + TRT_NOEXCEPT override; + + void detachFromContext() TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override { delete this; } + + protected: + size_t getSerializationSize() const TRT_NOEXCEPT override { return 0; } + + void serialize(void* buffer) const TRT_NOEXCEPT override {} +}; + +class TransformerInputConvertPluginCreator : public nvinfer1::IPluginCreator { + public: + TransformerInputConvertPluginCreator() {} + const char* getPluginName() const TRT_NOEXCEPT override { + return "transformer_input_convert_plugin"; + } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* plugin_field) + TRT_NOEXCEPT override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, void const* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + TransformerInputConvertPlugin* obj = + new TransformerInputConvertPlugin(serial_data, serial_length); + obj->setPluginNamespace(name); + return obj; + } + + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const TRT_NOEXCEPT override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; +}; +REGISTER_TRT_PLUGIN_V2(TransformerInputConvertPluginCreator); +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 9210cd48d078b..a1316384cd491 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -15,6 +15,7 @@ #pragma once #include + #include #include #include diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h index 16751c764bd03..cf9c66f0eb3fc 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h index 2094dbfc9db4b..7116093ae36e6 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc index 2f5b75c102004..70f36ec34b708 100644 --- a/paddle/fluid/inference/tensorrt/test_tensorrt.cc +++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "NvInfer.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/platform/dynload/tensorrt.h" diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h index c84cb45b7ecba..35c776b9e532c 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -16,6 +16,7 @@ #include #include + #include #include #include // NOLINT diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index fc85f83661889..307af84fa367e 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,409 +1,592 @@ -if (NOT APPLE AND NOT WIN32) - set(INFERENCE_EXTRA_DEPS paddle_inference_shared) +if(NOT APPLE AND NOT WIN32) + set(INFERENCE_EXTRA_DEPS paddle_inference_shared) else() - set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_inference_io ir_pass_manager analysis_predictor benchmark) + set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_inference_io + ir_pass_manager analysis_predictor benchmark) endif() if(WITH_GPU AND TENSORRT_FOUND) - set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps}) + set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps}) endif() function(download_data install_dir data_file check_sum) - string(REGEX MATCH "[^/\\]+$" file_name ${data_file}) - if (NOT EXISTS ${install_dir}/${file_name}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_file} ${check_sum}) - endif() + string(REGEX MATCH "[^/\\]+$" file_name ${data_file}) + if(NOT EXISTS ${install_dir}/${file_name}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} + ${data_file} ${check_sum}) + endif() endfunction() function(download_data_without_verify install_dir data_file) - string(REGEX MATCH "[^/\\]+$" file_name ${data_file}) - if (NOT EXISTS ${install_dir}/${file_name}) - inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL} ${data_file}) - endif() + string(REGEX MATCH "[^/\\]+$" file_name ${data_file}) + if(NOT EXISTS ${install_dir}/${file_name}) + inference_download_and_uncompress_without_verify( + ${install_dir} ${INFERENCE_URL} ${data_file}) + endif() endfunction() function(download_int8_data install_dir data_file check_sum) - if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file} ${check_sum}) - endif() + if(NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 + ${data_file} ${check_sum}) + endif() endfunction() function(download_int8_data_without_verify install_dir data_file) - if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/int8 ${data_file}) - endif() + if(NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress_without_verify( + ${install_dir} ${INFERENCE_URL}/int8 ${data_file}) + endif() endfunction() function(download_bfloat16_data install_dir data_file check_sum) - if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/bfloat16 ${data_file} ${check_sum}) - endif() + if(NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/bfloat16 + ${data_file} ${check_sum}) + endif() endfunction() function(download_bfloat16_data_without_verify install_dir data_file) - if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/bfloat16 ${data_file}) - endif() + if(NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress_without_verify( + ${install_dir} ${INFERENCE_URL}/bfloat16 ${data_file}) + endif() endfunction() function(download_GRU_data install_dir data_file check_sum) - if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/gru ${data_file} ${check_sum}) - endif() + if(NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/gru + ${data_file} ${check_sum}) + endif() endfunction() function(download_GRU_data_without_verify install_dir data_file) - if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/gru ${data_file}) - endif() + if(NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress_without_verify( + ${install_dir} ${INFERENCE_URL}/gru ${data_file}) + endif() endfunction() function(download_quant_data install_dir data_file check_sum) - if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file} ${check_sum}) - endif() + if(NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress( + ${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file} ${check_sum}) + endif() endfunction() function(download_quant_data_without_verify install_dir data_file) - if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file}) - endif() + if(NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress_without_verify( + ${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file}) + endif() endfunction() -function(download_model_and_data install_dir model_name model_check_sum data_name data_check_sum) - download_data(${install_dir} ${model_name} ${model_check_sum}) - download_data(${install_dir} ${data_name} ${data_check_sum}) +function(download_model_and_data install_dir model_name model_check_sum + data_name data_check_sum) + download_data(${install_dir} ${model_name} ${model_check_sum}) + download_data(${install_dir} ${data_name} ${data_check_sum}) endfunction() -function(download_model_and_data_without_verify install_dir model_name data_name) - download_data_without_verify(${install_dir} ${model_name}) - download_data_without_verify(${install_dir} ${data_name}) +function(download_model_and_data_without_verify install_dir model_name + data_name) + download_data_without_verify(${install_dir} ${model_name}) + download_data_without_verify(${install_dir} ${data_name}) endfunction() function(download_result install_dir result_name check_sum) - download_data(${install_dir} ${result_name} ${check_sum}) + download_data(${install_dir} ${result_name} ${check_sum}) endfunction() function(download_result_without_verify install_dir result_name) - download_data_without_verify(${install_dir} ${result_name}) + download_data_without_verify(${install_dir} ${result_name}) endfunction() function(inference_analysis_api_test target install_dir filename) - inference_analysis_test(${target} SRCS ${filename} - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt --refer_result=${install_dir}/result.txt) + inference_analysis_test( + ${target} + SRCS + ${filename} + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${install_dir}/model + --infer_data=${install_dir}/data.txt + --refer_result=${install_dir}/result.txt) endfunction() function(inference_analysis_api_int8_test target install_dir filename) - inference_analysis_test(${target} SRCS ${filename} - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${install_dir}/model - --infer_data=${install_dir}/data.txt - --refer_result=${install_dir}/result.txt - --accuracy=0.8 - --batch_size=5 - --enable_int8=true) + inference_analysis_test( + ${target} + SRCS + ${filename} + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${install_dir}/model + --infer_data=${install_dir}/data.txt + --refer_result=${install_dir}/result.txt + --accuracy=0.8 + --batch_size=5 + --enable_int8=true) endfunction() -function(inference_multiple_models_analysis_api_test target install_dir filename) - inference_analysis_test(${target} SRCS ${filename} - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${install_dir}/mobilenet_v2_models/1 --infer_model2=${install_dir}/mobilenet_v2_models/xx --infer_model3=${install_dir}/mobilenet_v2_models/3) +function(inference_multiple_models_analysis_api_test target install_dir + filename) + inference_analysis_test( + ${target} + SRCS + ${filename} + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${install_dir}/mobilenet_v2_models/1 + --infer_model2=${install_dir}/mobilenet_v2_models/xx + --infer_model3=${install_dir}/mobilenet_v2_models/3) endfunction() function(inference_analysis_api_test_build TARGET_NAME filename) - inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}) + inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS}) endfunction() -function(inference_analysis_api_int8_test_run TARGET_NAME test_binary model_dir data_path) - inference_analysis_test_run(${TARGET_NAME} - COMMAND ${test_binary} - ARGS --infer_model=${model_dir}/model - --infer_data=${data_path} - --warmup_batch_size=${WARMUP_BATCH_SIZE} - --batch_size=50 - --enable_int8=true - --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} - --iterations=2) +function(inference_analysis_api_int8_test_run TARGET_NAME test_binary model_dir + data_path) + inference_analysis_test_run( + ${TARGET_NAME} + COMMAND + ${test_binary} + ARGS + --infer_model=${model_dir}/model + --infer_data=${data_path} + --warmup_batch_size=${WARMUP_BATCH_SIZE} + --batch_size=50 + --enable_int8=true + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=2) endfunction() -function(inference_analysis_api_int8_test_run_custom_warmup_batch_size TARGET_NAME test_binary model_dir data_path warmup_batch_size) - set(WARMUP_BATCH_SIZE ${warmup_batch_size}) - inference_analysis_api_int8_test_run(${TARGET_NAME} ${test_binary} ${model_dir} ${data_path}) +function(inference_analysis_api_int8_test_run_custom_warmup_batch_size + TARGET_NAME test_binary model_dir data_path warmup_batch_size) + set(WARMUP_BATCH_SIZE ${warmup_batch_size}) + inference_analysis_api_int8_test_run(${TARGET_NAME} ${test_binary} + ${model_dir} ${data_path}) endfunction() -function(inference_analysis_api_bfloat16_test_run TARGET_NAME test_binary model_dir data_path) - inference_analysis_test_run(${TARGET_NAME} - COMMAND ${test_binary} - ARGS --infer_model=${model_dir}/model - --infer_data=${data_path} - --batch_size=50 - --enable_bf16=true - --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} - --iterations=2) +function(inference_analysis_api_bfloat16_test_run TARGET_NAME test_binary + model_dir data_path) + inference_analysis_test_run( + ${TARGET_NAME} + COMMAND + ${test_binary} + ARGS + --infer_model=${model_dir}/model + --infer_data=${data_path} + --batch_size=50 + --enable_bf16=true + --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=2) endfunction() -function(inference_analysis_api_object_dection_int8_test_run TARGET_NAME test_binary model_dir data_path) - inference_analysis_test_run(${TARGET_NAME} - COMMAND ${test_binary} - ARGS --infer_model=${model_dir}/model - --infer_data=${data_path} - --warmup_batch_size=10 - --batch_size=300 - --enable_int8=true - --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} - --iterations=1) +function(inference_analysis_api_object_dection_int8_test_run TARGET_NAME + test_binary model_dir data_path) + inference_analysis_test_run( + ${TARGET_NAME} + COMMAND + ${test_binary} + ARGS + --infer_model=${model_dir}/model + --infer_data=${data_path} + --warmup_batch_size=10 + --batch_size=300 + --enable_int8=true + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=1) endfunction() function(inference_analysis_api_test_with_fake_data_build TARGET_NAME filename) - inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}) + inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS}) endfunction() -function(inference_analysis_api_test_with_fake_data_run TARGET_NAME test_binary model_dir disable_fc) - inference_analysis_test_run(${TARGET_NAME} - COMMAND ${test_binary} - ARGS --infer_model=${model_dir}/model - --disable_mkldnn_fc=${disable_fc}) +function(inference_analysis_api_test_with_fake_data_run TARGET_NAME test_binary + model_dir disable_fc) + inference_analysis_test_run( + ${TARGET_NAME} COMMAND ${test_binary} ARGS --infer_model=${model_dir}/model + --disable_mkldnn_fc=${disable_fc}) endfunction() -function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_model_dir int8_model_dir data_path enable_quant_int8) - inference_analysis_test_run(${TARGET_NAME} - COMMAND ${test_binary} - ARGS --fp32_model=${fp32_model_dir} - --int8_model=${int8_model_dir} - --infer_data=${data_path} - --batch_size=50 - --enable_int8=true - --enable_quant_int8=${enable_quant_int8} - --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} - --with_accuracy_layer=false - --iterations=2) +function( + inference_analysis_api_quant_test_run + TARGET_NAME + test_binary + fp32_model_dir + int8_model_dir + data_path + enable_quant_int8) + inference_analysis_test_run( + ${TARGET_NAME} + COMMAND + ${test_binary} + ARGS + --fp32_model=${fp32_model_dir} + --int8_model=${int8_model_dir} + --infer_data=${data_path} + --batch_size=50 + --enable_int8=true + --enable_quant_int8=${enable_quant_int8} + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} + --with_accuracy_layer=false + --iterations=2) endfunction() -function(inference_analysis_api_lexical_test_run TARGET_NAME test_binary infer_model data_path) - inference_analysis_test_run(${TARGET_NAME} - COMMAND ${test_binary} - ARGS --infer_model=${infer_model} - --infer_data=${data_path} - --batch_size=50 - --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} - --with_accuracy_layer=true - --use_analysis=true - --iterations=2) +function(inference_analysis_api_lexical_test_run TARGET_NAME test_binary + infer_model data_path) + inference_analysis_test_run( + ${TARGET_NAME} + COMMAND + ${test_binary} + ARGS + --infer_model=${infer_model} + --infer_data=${data_path} + --batch_size=50 + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} + --with_accuracy_layer=true + --use_analysis=true + --iterations=2) endfunction() -function(inference_analysis_api_lexical_bfloat16_test_run TARGET_NAME test_binary infer_model data_path) - inference_analysis_test_run(${TARGET_NAME} - COMMAND ${test_binary} - ARGS --infer_model=${infer_model} - --infer_data=${data_path} - --batch_size=50 - --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} - --with_accuracy_layer=true - --use_analysis=true - --enable_bf16=true - --iterations=2) +function(inference_analysis_api_lexical_bfloat16_test_run TARGET_NAME + test_binary infer_model data_path) + inference_analysis_test_run( + ${TARGET_NAME} + COMMAND + ${test_binary} + ARGS + --infer_model=${infer_model} + --infer_data=${data_path} + --batch_size=50 + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} + --with_accuracy_layer=true + --use_analysis=true + --enable_bf16=true + --iterations=2) endfunction() -function(inference_analysis_api_lexical_int8_test_run TARGET_NAME test_binary infer_model data_path fuse_multi_gru) - inference_analysis_test_run(${TARGET_NAME} - COMMAND ${test_binary} - ARGS --infer_model=${infer_model} - --infer_data=${data_path} - --batch_size=100 - --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} - --with_accuracy_layer=true - --use_analysis=true - --enable_int8=true - --quantized_accuracy=0.01 - --fuse_multi_gru=${fuse_multi_gru} - --iterations=4) +function(inference_analysis_api_lexical_int8_test_run TARGET_NAME test_binary + infer_model data_path fuse_multi_gru) + inference_analysis_test_run( + ${TARGET_NAME} + COMMAND + ${test_binary} + ARGS + --infer_model=${infer_model} + --infer_data=${data_path} + --batch_size=100 + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} + --with_accuracy_layer=true + --use_analysis=true + --enable_int8=true + --quantized_accuracy=0.01 + --fuse_multi_gru=${fuse_multi_gru} + --iterations=4) endfunction() -function(preprocess_data2bin_test_run target py_script_source data_dir output_file) - py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${py_script_source} - ARGS --data_dir=${data_dir} - --output_file=${output_file} - --local) +function(preprocess_data2bin_test_run target py_script_source data_dir + output_file) + py_test(${target} + SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${py_script_source} ARGS + --data_dir=${data_dir} --output_file=${output_file} --local) endfunction() if(NOT APPLE AND WITH_MKLML) - # RNN1 - set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") - download_model_and_data_without_verify(${RNN1_INSTALL_DIR} "rnn1/model.tar.gz" "rnn1/data.txt.tar.gz") - inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) - - # seq_pool1 - set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") - download_model_and_data_without_verify(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") - inference_analysis_api_test(test_analyzer_seq_pool1_compare_determine ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_determine_tester.cc) - inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_tester.cc) - inference_analysis_api_test(test_analyzer_seq_pool1_fuse_compare_zero_copy ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc) - inference_analysis_api_test(test_analyzer_seq_pool1_fuse_statis ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_statis_tester.cc) - inference_analysis_api_test(test_analyzer_seq_pool1_profile ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_profile_tester.cc) - if(NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") - set_tests_properties(test_analyzer_seq_pool1_compare_determine PROPERTIES TIMEOUT 120) - set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 120) - set_tests_properties(test_analyzer_seq_pool1_fuse_compare_zero_copy PROPERTIES TIMEOUT 120) - set_tests_properties(test_analyzer_seq_pool1_fuse_statis PROPERTIES TIMEOUT 120) - set_tests_properties(test_analyzer_seq_pool1_profile PROPERTIES TIMEOUT 120) - endif() + # RNN1 + set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") + download_model_and_data_without_verify( + ${RNN1_INSTALL_DIR} "rnn1/model.tar.gz" "rnn1/data.txt.tar.gz") + inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} + analyzer_rnn1_tester.cc) + + # seq_pool1 + set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") + download_model_and_data_without_verify( + ${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" + "seq_pool1_data.txt.tar.gz") + inference_analysis_api_test( + test_analyzer_seq_pool1_compare_determine ${SEQ_POOL1_INSTALL_DIR} + analyzer_seq_pool1_compare_determine_tester.cc) + inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} + analyzer_seq_pool1_compare_tester.cc) + inference_analysis_api_test( + test_analyzer_seq_pool1_fuse_compare_zero_copy ${SEQ_POOL1_INSTALL_DIR} + analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc) + inference_analysis_api_test( + test_analyzer_seq_pool1_fuse_statis ${SEQ_POOL1_INSTALL_DIR} + analyzer_seq_pool1_fuse_statis_tester.cc) + inference_analysis_api_test( + test_analyzer_seq_pool1_profile ${SEQ_POOL1_INSTALL_DIR} + analyzer_seq_pool1_profile_tester.cc) + if(NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + set_tests_properties(test_analyzer_seq_pool1_compare_determine + PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_seq_pool1_fuse_compare_zero_copy + PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_seq_pool1_fuse_statis PROPERTIES TIMEOUT + 120) + set_tests_properties(test_analyzer_seq_pool1_profile PROPERTIES TIMEOUT 120) + endif() else() - # TODO: fix this test on MACOS and OPENBLAS, the reason is that - # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS - message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_rnn1") - message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1") + # TODO: fix this test on MACOS and OPENBLAS, the reason is that + # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS + message( + WARNING + "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_rnn1" + ) + message( + WARNING + "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1" + ) endif() - # RNN2 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") -download_model_and_data_without_verify(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc) +download_model_and_data_without_verify(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" + "rnn2_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} + analyzer_rnn2_tester.cc) # TODO(luotao, Superjom) Disable DAM test, temporarily fix # https://github.com/PaddlePaddle/Paddle/issues/15032#issuecomment-455990914. # After inference framework refactor, will reopen it. # normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") -download_model_and_data_without_verify(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") +download_model_and_data_without_verify(${DAM_INSTALL_DIR} "DAM_model.tar.gz" + "DAM_data.txt.tar.gz") #inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator) # small DAM set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") -download_model_and_data_without_verify(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") -inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt) - -#save model -inference_analysis_api_test(test_analyzer_save_model ${DAM_SMALL_INSTALL_DIR} analyzer_save_model_tester.cc) +download_model_and_data_without_verify( + ${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") +inference_analysis_test( + test_analyzer_small_dam + SRCS + analyzer_dam_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${DAM_SMALL_INSTALL_DIR}/model + --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt) + +#save model +inference_analysis_api_test(test_analyzer_save_model ${DAM_SMALL_INSTALL_DIR} + analyzer_save_model_tester.cc) # chinese_ner set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") -download_model_and_data_without_verify(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_ner ${CHINESE_NER_INSTALL_DIR} analyzer_ner_tester.cc) +download_model_and_data_without_verify( + ${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" + "chinese_ner-data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_ner ${CHINESE_NER_INSTALL_DIR} + analyzer_ner_tester.cc) # lac set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac") -download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" 419ca6eb85f57a01bfe173591910aec5 "lac_data.txt.tar.gz" 9983539cd6b34fbdc411e43422776bfd) -inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc) +download_model_and_data( + ${LAC_INSTALL_DIR} "lac_model.tar.gz" 419ca6eb85f57a01bfe173591910aec5 + "lac_data.txt.tar.gz" 9983539cd6b34fbdc411e43422776bfd) +inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} + analyzer_lac_tester.cc) # Pyramid DNN set(PYRAMID_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/pyramid_dnn") -download_model_and_data_without_verify(${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" "PyramidDNN_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} analyzer_pyramid_dnn_tester.cc) +download_model_and_data_without_verify( + ${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" + "PyramidDNN_data.txt.tar.gz") +inference_analysis_api_test( + test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} + analyzer_pyramid_dnn_tester.cc) # Ernie set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") -download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_model.tar.gz" aa59192dd41ed377f9f168e3a1309fa6 "Ernie_data.txt.tar.gz" 5396e63548edad7ca561e7e26a9476d1) -download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz" 73beea65abda2edb61c1662cd3180c62) -if (WITH_GPU) - inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} analyzer_ernie_tester.cc) +download_model_and_data( + ${ERNIE_INSTALL_DIR} "Ernie_model.tar.gz" aa59192dd41ed377f9f168e3a1309fa6 + "Ernie_data.txt.tar.gz" 5396e63548edad7ca561e7e26a9476d1) +download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz" + 73beea65abda2edb61c1662cd3180c62) +if(WITH_GPU) + inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} + analyzer_ernie_tester.cc) endif() -inference_analysis_api_int8_test(test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR} analyzer_ernie_int8_tester.cc) +inference_analysis_api_int8_test(test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR} + analyzer_ernie_int8_tester.cc) # Ernie large set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie_Large") -download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_large_model.tar.gz" af7715245ed32cc77374625d4c80f7ef "Ernie_large_data.txt.tar.gz" edb2113eec93783cad56ed76d47ba57f) -download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz" 1facda98eef1085dc9d435ebf3f23a73) -inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true) -if(NOT WIN32 AND NOT APPLE AND TEST test_analyzer_ernie_large) - set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY") +download_model_and_data( + ${ERNIE_INSTALL_DIR} "Ernie_large_model.tar.gz" + af7715245ed32cc77374625d4c80f7ef "Ernie_large_data.txt.tar.gz" + edb2113eec93783cad56ed76d47ba57f) +download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz" + 1facda98eef1085dc9d435ebf3f23a73) +inference_analysis_test( + test_analyzer_ernie_large + SRCS + analyzer_ernie_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${ERNIE_INSTALL_DIR}/model + --infer_data=${ERNIE_INSTALL_DIR}/data.txt + --refer_result=${ERNIE_INSTALL_DIR}/result.txt + --ernie_large=true) +if(NOT WIN32 + AND NOT APPLE + AND TEST test_analyzer_ernie_large) + set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS + "RUN_TYPE=NIGHTLY") endif() -if (WIN32 AND TEST test_analyzer_ernie_large) - set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 200) +if(WIN32 AND TEST test_analyzer_ernie_large) + set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 200) endif() # text_classification -set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") -download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" 3f0f440313ca50e26184e65ffd5809ab "text_classification_data.txt.tar.gz" 36ae620020cc3377f45ed330dd36238f) -inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc) +set(TEXT_CLASSIFICATION_INSTALL_DIR + "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") +download_model_and_data( + ${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" + 3f0f440313ca50e26184e65ffd5809ab "text_classification_data.txt.tar.gz" + 36ae620020cc3377f45ed330dd36238f) +inference_analysis_api_test( + test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} + analyzer_text_classification_tester.cc) # seq_conv1 set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1") -download_model_and_data_without_verify(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc) +download_model_and_data_without_verify( + ${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} + analyzer_seq_conv1_tester.cc) # transformer, the dataset only works on batch_size=8 now set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer") -download_model_and_data_without_verify(${TRANSFORMER_INSTALL_DIR} "temp/transformer_model.tar.gz" "temp/transformer_data.txt.tar.gz") -inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_compare_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 - --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}) -inference_analysis_test(test_analyzer_transformer_fuse SRCS analyzer_transformer_fuse_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 - --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}) -inference_analysis_test(test_analyzer_transformer_profile SRCS analyzer_transformer_profile_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 - --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}) +download_model_and_data_without_verify( + ${TRANSFORMER_INSTALL_DIR} "temp/transformer_model.tar.gz" + "temp/transformer_data.txt.tar.gz") +inference_analysis_test( + test_analyzer_transformer + SRCS + analyzer_transformer_compare_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TRANSFORMER_INSTALL_DIR}/model + --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt + --batch_size=8 + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}) +inference_analysis_test( + test_analyzer_transformer_fuse + SRCS + analyzer_transformer_fuse_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TRANSFORMER_INSTALL_DIR}/model + --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt + --batch_size=8 + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}) +inference_analysis_test( + test_analyzer_transformer_profile + SRCS + analyzer_transformer_profile_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TRANSFORMER_INSTALL_DIR}/model + --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt + --batch_size=8 + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}) # VIT-OCR set(VIT_OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/vit") -if (NOT EXISTS ${VIT_OCR_INSTALL_DIR}/vit_ocr.tgz) - inference_download_and_uncompress_without_verify(${VIT_OCR_INSTALL_DIR} ${INFERENCE_URL} "ocr/vit_ocr.tgz") +if(NOT EXISTS ${VIT_OCR_INSTALL_DIR}/vit_ocr.tgz) + inference_download_and_uncompress_without_verify( + ${VIT_OCR_INSTALL_DIR} ${INFERENCE_URL} "ocr/vit_ocr.tgz") endif() -inference_analysis_test(test_analyzer_vit_ocr SRCS analyzer_vit_ocr_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${VIT_OCR_INSTALL_DIR}/vit_ocr/model --infer_data=${VIT_OCR_INSTALL_DIR}/vit_ocr/datavit.txt) +inference_analysis_test( + test_analyzer_vit_ocr + SRCS + analyzer_vit_ocr_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${VIT_OCR_INSTALL_DIR}/vit_ocr/model + --infer_data=${VIT_OCR_INSTALL_DIR}/vit_ocr/datavit.txt) # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") -if (NOT EXISTS ${OCR_INSTALL_DIR}/ocr.tar.gz) - inference_download_and_uncompress_without_verify(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/ocr.tar.gz") +if(NOT EXISTS ${OCR_INSTALL_DIR}/ocr.tar.gz) + inference_download_and_uncompress_without_verify( + ${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" + "inference-vis-demos/ocr.tar.gz") endif() -inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) +inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} + analyzer_vis_tester.cc) # densebox set(DENSEBOX_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/densebox") download_data_without_verify(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz") -inference_analysis_test(test_analyzer_detect_functional_mkldnn SRCS analyzer_detect_functional_mkldnn_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt - --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt) +inference_analysis_test( + test_analyzer_detect_functional_mkldnn + SRCS + analyzer_detect_functional_mkldnn_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${DENSEBOX_INSTALL_DIR}/model + --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt + --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt) # mobilenet with transpose op set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") -if (NOT EXISTS ${MOBILENET_INSTALL_DIR}/mobilenet.tar.gz) - inference_download_and_uncompress_without_verify(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/mobilenet.tar.gz") +if(NOT EXISTS ${MOBILENET_INSTALL_DIR}/mobilenet.tar.gz) + inference_download_and_uncompress_without_verify( + ${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" + "inference-vis-demos/mobilenet.tar.gz") endif() -inference_analysis_api_test(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) +inference_analysis_api_test(test_analyzer_mobilenet_transpose + ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) ### Image classification tests with fake data set(IMG_CLASS_TEST_APP "test_analyzer_image_classification") set(IMG_CLASS_TEST_APP_SRC "analyzer_image_classification_tester.cc") # build test binary to be used in subsequent tests -inference_analysis_api_test_with_fake_data_build(${IMG_CLASS_TEST_APP} ${IMG_CLASS_TEST_APP_SRC}) +inference_analysis_api_test_with_fake_data_build(${IMG_CLASS_TEST_APP} + ${IMG_CLASS_TEST_APP_SRC}) # googlenet set(GOOGLENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/googlenet") download_data_without_verify(${GOOGLENET_MODEL_DIR} "googlenet.tar.gz") -inference_analysis_api_test_with_fake_data_run(test_analyzer_googlenet ${IMG_CLASS_TEST_APP} - ${GOOGLENET_MODEL_DIR} false) +inference_analysis_api_test_with_fake_data_run( + test_analyzer_googlenet ${IMG_CLASS_TEST_APP} ${GOOGLENET_MODEL_DIR} false) # resnet50 set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") download_data_without_verify(${RESNET50_MODEL_DIR} "resnet50_model.tar.gz") -inference_analysis_api_test_with_fake_data_run(test_analyzer_resnet50 ${IMG_CLASS_TEST_APP} - ${RESNET50_MODEL_DIR} true) -if (WIN32) - set_tests_properties(test_analyzer_resnet50 PROPERTIES TIMEOUT 200) +inference_analysis_api_test_with_fake_data_run( + test_analyzer_resnet50 ${IMG_CLASS_TEST_APP} ${RESNET50_MODEL_DIR} true) +if(WIN32) + set_tests_properties(test_analyzer_resnet50 PROPERTIES TIMEOUT 200) endif() - # mobilenet with depthwise_conv op -set(MOBILENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv") +set(MOBILENET_MODEL_DIR + "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv") download_data_without_verify(${MOBILENET_MODEL_DIR} "mobilenet_model.tar.gz") -inference_analysis_api_test_with_fake_data_run(test_analyzer_mobilenet_depthwise_conv ${IMG_CLASS_TEST_APP} - ${MOBILENET_MODEL_DIR} false) +inference_analysis_api_test_with_fake_data_run( + test_analyzer_mobilenet_depthwise_conv ${IMG_CLASS_TEST_APP} + ${MOBILENET_MODEL_DIR} false) if(WITH_MKLDNN) @@ -418,97 +601,135 @@ if(WITH_MKLDNN) set(IMAGENET_DATA_ARCHIVE "imagenet_val_100_tail.tar.gz") set(IMAGENET_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/imagenet") set(IMAGENET_DATA_PATH "${IMAGENET_DATA_DIR}/data.bin") - download_int8_data_without_verify(${IMAGENET_DATA_DIR} ${IMAGENET_DATA_ARCHIVE}) + download_int8_data_without_verify(${IMAGENET_DATA_DIR} + ${IMAGENET_DATA_ARCHIVE}) # build test binary to be used in subsequent tests set(INT8_IMG_CLASS_TEST_APP "test_analyzer_int8_image_classification") - set(INT8_IMG_CLASS_TEST_APP_SRC "analyzer_int8_image_classification_tester.cc") - inference_analysis_api_test_build(${INT8_IMG_CLASS_TEST_APP} ${INT8_IMG_CLASS_TEST_APP_SRC}) + set(INT8_IMG_CLASS_TEST_APP_SRC + "analyzer_int8_image_classification_tester.cc") + inference_analysis_api_test_build(${INT8_IMG_CLASS_TEST_APP} + ${INT8_IMG_CLASS_TEST_APP_SRC}) # resnet50 int8 set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50") - download_int8_data_without_verify(${INT8_RESNET50_MODEL_DIR} "resnet50_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_resnet50 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET50_MODEL_DIR} ${IMAGENET_DATA_PATH}) + download_int8_data_without_verify(${INT8_RESNET50_MODEL_DIR} + "resnet50_int8_model.tar.gz") + inference_analysis_api_int8_test_run( + test_analyzer_int8_resnet50 ${INT8_IMG_CLASS_TEST_APP} + ${INT8_RESNET50_MODEL_DIR} ${IMAGENET_DATA_PATH}) # mobilenetv1 int8 set(INT8_MOBILENETV1_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv1") - download_int8_data_without_verify(${INT8_MOBILENETV1_MODEL_DIR} "mobilenetv1_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv1 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV1_MODEL_DIR} ${IMAGENET_DATA_PATH}) - + download_int8_data_without_verify(${INT8_MOBILENETV1_MODEL_DIR} + "mobilenetv1_int8_model.tar.gz") + inference_analysis_api_int8_test_run( + test_analyzer_int8_mobilenetv1 ${INT8_IMG_CLASS_TEST_APP} + ${INT8_MOBILENETV1_MODEL_DIR} ${IMAGENET_DATA_PATH}) + # mobilenetv2 int8 set(INT8_MOBILENETV2_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv2") - download_int8_data_without_verify(${INT8_MOBILENETV2_MODEL_DIR} "mobilenet_v2_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv2 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH}) - + download_int8_data_without_verify(${INT8_MOBILENETV2_MODEL_DIR} + "mobilenet_v2_int8_model.tar.gz") + inference_analysis_api_int8_test_run( + test_analyzer_int8_mobilenetv2 ${INT8_IMG_CLASS_TEST_APP} + ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH}) + # resnet101 int8 # TODO(grygielski) Enable after MKL-DNN 1.0 merge set(INT8_RESNET101_MODEL_DIR "${INT8_DATA_DIR}/resnet101") - download_int8_data_without_verify(${INT8_RESNET101_MODEL_DIR} "Res101_int8_model.tar.gz" ) -# inference_analysis_api_int8_test_run(test_analyzer_int8_resnet101 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET101_MODEL_DIR} ${IMAGENET_DATA_PATH}) - + download_int8_data_without_verify(${INT8_RESNET101_MODEL_DIR} + "Res101_int8_model.tar.gz") + # inference_analysis_api_int8_test_run(test_analyzer_int8_resnet101 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET101_MODEL_DIR} ${IMAGENET_DATA_PATH}) + # vgg16 int8 # TODO(grygielski) Enable after MKL-DNN 1.0 merge set(INT8_VGG16_MODEL_DIR "${INT8_DATA_DIR}/vgg16") - download_int8_data_without_verify(${INT8_VGG16_MODEL_DIR} "VGG16_int8_model.tar.gz" ) -# inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH}) - + download_int8_data_without_verify(${INT8_VGG16_MODEL_DIR} + "VGG16_int8_model.tar.gz") + # inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH}) + # vgg19 int8 # TODO(grygielski) Enable after MKL-DNN 1.0 merge set(INT8_VGG19_MODEL_DIR "${INT8_DATA_DIR}/vgg19") - download_int8_data_without_verify(${INT8_VGG19_MODEL_DIR} "VGG19_int8_model.tar.gz" ) -# inference_analysis_api_int8_test_run(test_analyzer_int8_vgg19 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG19_MODEL_DIR} ${IMAGENET_DATA_PATH}) + download_int8_data_without_verify(${INT8_VGG19_MODEL_DIR} + "VGG19_int8_model.tar.gz") + # inference_analysis_api_int8_test_run(test_analyzer_int8_vgg19 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG19_MODEL_DIR} ${IMAGENET_DATA_PATH}) # googlenet int8 set(INT8_GOOGLENET_MODEL_DIR "${INT8_DATA_DIR}/googlenet") - download_int8_data_without_verify(${INT8_GOOGLENET_MODEL_DIR} "GoogleNet_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run_custom_warmup_batch_size(test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH} 10) - - # mobilenetv3_large_x1_0 int8 - set(INT8_MOBILENETV3_LARGE_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv3_large") - set(INT8_MOBILENETV3_FILE_NAME "MobileNetV3_large_x1_0_infer.tar") - if (NOT EXISTS ${INT8_MOBILENETV3_LARGE_MODEL_DIR}/${INT8_MOBILENETV3_FILE_NAME}) - inference_download_and_uncompress_without_verify(${INT8_MOBILENETV3_LARGE_MODEL_DIR} "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/" ${INT8_MOBILENETV3_FILE_NAME}) - endif() - inference_analysis_test_run(test_analyzer_int8_mobilenetv3_large - COMMAND ${INT8_IMG_CLASS_TEST_APP} - ARGS --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer - --infer_data=${IMAGENET_DATA_PATH} - --warmup_batch_size=50 - --batch_size=1 - --enable_int8=true - --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} - --iterations=100 - --with_accuracy_layer=false) + download_int8_data_without_verify(${INT8_GOOGLENET_MODEL_DIR} + "GoogleNet_int8_model.tar.gz") + inference_analysis_api_int8_test_run_custom_warmup_batch_size( + test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} + ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH} 10) + + # mobilenetv3_large_x1_0 int8 + set(INT8_MOBILENETV3_LARGE_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv3_large") + set(INT8_MOBILENETV3_FILE_NAME "MobileNetV3_large_x1_0_infer.tar") + if(NOT EXISTS + ${INT8_MOBILENETV3_LARGE_MODEL_DIR}/${INT8_MOBILENETV3_FILE_NAME}) + inference_download_and_uncompress_without_verify( + ${INT8_MOBILENETV3_LARGE_MODEL_DIR} + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/" + ${INT8_MOBILENETV3_FILE_NAME}) + endif() + inference_analysis_test_run( + test_analyzer_int8_mobilenetv3_large + COMMAND + ${INT8_IMG_CLASS_TEST_APP} + ARGS + --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer + --infer_data=${IMAGENET_DATA_PATH} + --warmup_batch_size=50 + --batch_size=1 + --enable_int8=true + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=100 + --with_accuracy_layer=false) ### BFLOAT16 tests # build test binary to be used in subsequent tests set(BF16_IMG_CLASS_TEST_APP "test_analyzer_bfloat16_image_classification") - set(BF16_IMG_CLASS_TEST_APP_SRC "analyzer_bfloat16_image_classification_tester.cc") - inference_analysis_api_test_build(${BF16_IMG_CLASS_TEST_APP} ${BF16_IMG_CLASS_TEST_APP_SRC}) + set(BF16_IMG_CLASS_TEST_APP_SRC + "analyzer_bfloat16_image_classification_tester.cc") + inference_analysis_api_test_build(${BF16_IMG_CLASS_TEST_APP} + ${BF16_IMG_CLASS_TEST_APP_SRC}) # resnet50 bfloat16 - inference_analysis_api_bfloat16_test_run(test_analyzer_bfloat16_resnet50 ${BF16_IMG_CLASS_TEST_APP} ${INT8_RESNET50_MODEL_DIR} ${IMAGENET_DATA_PATH}) - + inference_analysis_api_bfloat16_test_run( + test_analyzer_bfloat16_resnet50 ${BF16_IMG_CLASS_TEST_APP} + ${INT8_RESNET50_MODEL_DIR} ${IMAGENET_DATA_PATH}) + # googlenet bfloat16 - inference_analysis_api_bfloat16_test_run(test_analyzer_bfloat16_googlenet ${BF16_IMG_CLASS_TEST_APP} ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH}) + inference_analysis_api_bfloat16_test_run( + test_analyzer_bfloat16_googlenet ${BF16_IMG_CLASS_TEST_APP} + ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH}) # mobilenetv1 bfloat16 - inference_analysis_api_bfloat16_test_run(test_analyzer_bfloat16_mobilenetv1 ${BF16_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV1_MODEL_DIR} ${IMAGENET_DATA_PATH}) + inference_analysis_api_bfloat16_test_run( + test_analyzer_bfloat16_mobilenetv1 ${BF16_IMG_CLASS_TEST_APP} + ${INT8_MOBILENETV1_MODEL_DIR} ${IMAGENET_DATA_PATH}) # mobilenetv2 bfloat16 - inference_analysis_api_bfloat16_test_run(test_analyzer_bfloat16_mobilenetv2 ${BF16_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH}) - - # mobilenetv3_large - inference_analysis_test_run(test_analyzer_bfloat16_mobilenetv3_large - COMMAND ${BF16_IMG_CLASS_TEST_APP} - ARGS --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer - --infer_data=${IMAGENET_DATA_PATH} - --batch_size=1 - --enable_bf16=true - --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} - --iterations=100 - --with_accuracy_layer=false) + inference_analysis_api_bfloat16_test_run( + test_analyzer_bfloat16_mobilenetv2 ${BF16_IMG_CLASS_TEST_APP} + ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH}) + + # mobilenetv3_large + inference_analysis_test_run( + test_analyzer_bfloat16_mobilenetv3_large + COMMAND + ${BF16_IMG_CLASS_TEST_APP} + ARGS + --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer + --infer_data=${IMAGENET_DATA_PATH} + --batch_size=1 + --enable_bf16=true + --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=100 + --with_accuracy_layer=false) ### Object detection models set(PASCALVOC_DATA_PATH "${INT8_DATA_DIR}/pascalvoc_val_head_300.bin") @@ -516,21 +737,25 @@ if(WITH_MKLDNN) set(INT8_OBJ_DETECT_TEST_APP_SRC "analyzer_int8_object_detection_tester.cc") # download dataset if necessary - download_int8_data_without_verify(${INT8_DATA_DIR} "pascalvoc_val_head_300.tar.gz") - + download_int8_data_without_verify(${INT8_DATA_DIR} + "pascalvoc_val_head_300.tar.gz") # build test binary to be used in subsequent tests - inference_analysis_api_test_build(${INT8_OBJ_DETECT_TEST_APP} ${INT8_OBJ_DETECT_TEST_APP_SRC}) + inference_analysis_api_test_build(${INT8_OBJ_DETECT_TEST_APP} + ${INT8_OBJ_DETECT_TEST_APP_SRC}) # mobilenet-ssd int8 set(INT8_MOBILENET_SSD_MODEL_DIR "${INT8_DATA_DIR}/mobilenet-ssd") - download_int8_data_without_verify(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" ) - inference_analysis_api_object_dection_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH}) + download_int8_data_without_verify(${INT8_MOBILENET_SSD_MODEL_DIR} + "mobilenet_ssd_int8_model.tar.gz") + inference_analysis_api_object_dection_int8_test_run( + test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} + ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH}) ### Lexcial analysis GRU model set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru") - download_GRU_data_without_verify("${GRU_PATH}" "GRU_eval_data.tar.gz") - download_GRU_data_without_verify("${GRU_PATH}" "GRU_eval_model_v2.tar.gz") + download_gru_data_without_verify("${GRU_PATH}" "GRU_eval_data.tar.gz") + download_gru_data_without_verify("${GRU_PATH}" "GRU_eval_model_v2.tar.gz") set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin") set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model_v2") set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis") @@ -539,266 +764,497 @@ if(WITH_MKLDNN) # build test binary to be used in subsequent tests inference_analysis_api_test_build(${LEXICAL_TEST_APP} ${LEXICAL_TEST_APP_SRC}) # run lexcial analysis test - inference_analysis_api_lexical_test_run(test_analyzer_lexical_gru ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH}) + inference_analysis_api_lexical_test_run( + test_analyzer_lexical_gru ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} + ${GRU_DATA_PATH}) # run bfloat16 lexical analysis test - inference_analysis_api_lexical_bfloat16_test_run(test_analyzer_lexical_gru_bfloat16 ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH}) + inference_analysis_api_lexical_bfloat16_test_run( + test_analyzer_lexical_gru_bfloat16 ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} + ${GRU_DATA_PATH}) # run post-training quantization lexical analysis test - inference_analysis_api_lexical_int8_test_run(test_analyzer_lexical_gru_int8 ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH} false) - # run post-training quantization lexical analysis test with multi_gru fuse - inference_analysis_api_lexical_int8_test_run(test_analyzer_lexical_gru_int8_multi_gru ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH} true) + inference_analysis_api_lexical_int8_test_run( + test_analyzer_lexical_gru_int8 ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} + ${GRU_DATA_PATH} false) + # run post-training quantization lexical analysis test with multi_gru fuse + inference_analysis_api_lexical_int8_test_run( + test_analyzer_lexical_gru_int8_multi_gru ${LEXICAL_TEST_APP} + ${GRU_MODEL_PATH} ${GRU_DATA_PATH} true) ### optimized FP32 vs. Quant INT8 tests - + set(QUANT_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant") set(QUANT_IMG_CLASS_TEST_APP "test_analyzer_quant_image_classification") - set(QUANT_IMG_CLASS_TEST_APP_SRC "analyzer_quant_image_classification_tester.cc") + set(QUANT_IMG_CLASS_TEST_APP_SRC + "analyzer_quant_image_classification_tester.cc") # build test binary to be used in subsequent tests - inference_analysis_api_test_build(${QUANT_IMG_CLASS_TEST_APP} ${QUANT_IMG_CLASS_TEST_APP_SRC}) + inference_analysis_api_test_build(${QUANT_IMG_CLASS_TEST_APP} + ${QUANT_IMG_CLASS_TEST_APP_SRC}) # MobileNetV1 FP32 vs. Quant INT8 # The FP32 model should already be downloaded for slim Quant unit tests on Linux set(QUANT2_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2") - set(QUANT2_INT8_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2_int8") + set(QUANT2_INT8_MobileNetV1_MODEL_DIR + "${QUANT_DATA_DIR}/MobileNetV1_quant2_int8") if(NOT LINUX) - download_quant_data_without_verify(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz") + download_quant_data_without_verify(${QUANT2_MobileNetV1_MODEL_DIR} + "MobileNet_qat_perf.tar.gz") endif(NOT LINUX) - download_quant_data_without_verify(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz") - inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH} false) + download_quant_data_without_verify(${QUANT2_INT8_MobileNetV1_MODEL_DIR} + "MobileNet_qat_perf_int8.tar.gz") + inference_analysis_api_quant_test_run( + test_analyzer_quant_performance_benchmark + ${QUANT_IMG_CLASS_TEST_APP} + ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float + ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 + ${IMAGENET_DATA_PATH} + false) # Quant2 MobileNetV1 - inference_analysis_api_quant_test_run(test_analyzer_quant2_mobilenetv1_mkldnn ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${IMAGENET_DATA_PATH} true) + inference_analysis_api_quant_test_run( + test_analyzer_quant2_mobilenetv1_mkldnn + ${QUANT_IMG_CLASS_TEST_APP} + ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float + ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float + ${IMAGENET_DATA_PATH} + true) # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes, # with weight scales in `fake_channel_wise_dequantize_max_abs` operators - set(QUANT2_RESNET50_CHANNELWISE_MODEL_DIR "${QUANT_DATA_DIR}/ResNet50_quant2_channelwise") - set(QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE "ResNet50_qat_channelwise.tar.gz") + set(QUANT2_RESNET50_CHANNELWISE_MODEL_DIR + "${QUANT_DATA_DIR}/ResNet50_quant2_channelwise") + set(QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE + "ResNet50_qat_channelwise.tar.gz") if(NOT LINUX) - download_quant_data_without_verify(${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR} ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE}) + download_quant_data_without_verify( + ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR} + ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE}) endif(NOT LINUX) - set(QUANT2_RESNET50_MODEL ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise) - inference_analysis_api_quant_test_run(test_analyzer_quant2_resnet50_channelwise_mkldnn ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_RESNET50_MODEL} ${QUANT2_RESNET50_MODEL} ${IMAGENET_DATA_PATH} true) + set(QUANT2_RESNET50_MODEL + ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise) + inference_analysis_api_quant_test_run( + test_analyzer_quant2_resnet50_channelwise_mkldnn + ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_RESNET50_MODEL} + ${QUANT2_RESNET50_MODEL} ${IMAGENET_DATA_PATH} true) ### Other tests - + # MKLDNN quantizer config set(MKLDNN_QUANTIZER_CONFIG_TEST_APP "test_mkldnn_quantizer_config") set(MKLDNN_QUANTIZER_CONFIG_TEST_APP_SRC "mkldnn_quantizer_config_tester.cc") - inference_analysis_api_test_build(${MKLDNN_QUANTIZER_CONFIG_TEST_APP} ${MKLDNN_QUANTIZER_CONFIG_TEST_APP_SRC}) - inference_analysis_test_run(test_mkldnn_quantizer_config COMMAND ${MKLDNN_QUANTIZER_CONFIG_TEST_APP}) + inference_analysis_api_test_build(${MKLDNN_QUANTIZER_CONFIG_TEST_APP} + ${MKLDNN_QUANTIZER_CONFIG_TEST_APP_SRC}) + inference_analysis_test_run(test_mkldnn_quantizer_config COMMAND + ${MKLDNN_QUANTIZER_CONFIG_TEST_APP}) # preprocess data2bin imagenet - download_int8_data_without_verify(${INT8_DATA_DIR} "imagenet_small.tar.gz") - set(IMAGENET_SMALL_DATA_DIR "${INT8_DATA_DIR}/imagenet_small") - set(IMAGENET_SMALL_OUTPUT_FILE "imagenet_small.bin") - preprocess_data2bin_test_run(preprocess_local_imagenet "full_ILSVRC2012_val_preprocess.py" ${IMAGENET_SMALL_DATA_DIR} ${IMAGENET_SMALL_OUTPUT_FILE}) - + download_int8_data_without_verify(${INT8_DATA_DIR} "imagenet_small.tar.gz") + set(IMAGENET_SMALL_DATA_DIR "${INT8_DATA_DIR}/imagenet_small") + set(IMAGENET_SMALL_OUTPUT_FILE "imagenet_small.bin") + preprocess_data2bin_test_run( + preprocess_local_imagenet "full_ILSVRC2012_val_preprocess.py" + ${IMAGENET_SMALL_DATA_DIR} ${IMAGENET_SMALL_OUTPUT_FILE}) + # preprocess data2bin pascalvoc download_int8_data_without_verify(${INT8_DATA_DIR} "pascalvoc_small.tar.gz") set(PASCALVOC_SMALL_DATA_DIR "${INT8_DATA_DIR}/pascalvoc_small") set(PASCALVOC_SMALL_OUTPUT_FILE "pascalvoc_small.bin") - preprocess_data2bin_test_run(preprocess_local_pascalvoc "full_pascalvoc_test_preprocess.py" ${PASCALVOC_SMALL_DATA_DIR} ${PASCALVOC_SMALL_OUTPUT_FILE}) + preprocess_data2bin_test_run( + preprocess_local_pascalvoc "full_pascalvoc_test_preprocess.py" + ${PASCALVOC_SMALL_DATA_DIR} ${PASCALVOC_SMALL_OUTPUT_FILE}) endif() # bert, max_len=20, embedding_dim=128 set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128") -download_model_and_data_without_verify(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") -if (WITH_GPU) - inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) +download_model_and_data_without_verify( + ${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") +if(WITH_GPU) + inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} + analyzer_bert_tester.cc) endif() # multiple models prediction set(MMP_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/multi_model_prediction") -download_data_without_verify(${MMP_INSTALL_DIR} PaddleInference/mobilenet_v2_models.tar.gz) -inference_multiple_models_analysis_api_test(test_analyzer_multi_model_prediction ${MMP_INSTALL_DIR} analyzer_mmp_tester.cc) +download_data_without_verify(${MMP_INSTALL_DIR} + PaddleInference/mobilenet_v2_models.tar.gz) +inference_multiple_models_analysis_api_test( + test_analyzer_multi_model_prediction ${MMP_INSTALL_DIR} + analyzer_mmp_tester.cc) if(WITH_GPU AND TENSORRT_FOUND) - set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models") - if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models.tar.gz) - inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz" 3dcccdc38b549b6b1b4089723757bd98) - endif() - set(TEST_SPLIT_CONVERTER_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_split_op_converter_test") - if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}/split_converter.tgz) - inference_download_and_uncompress_without_verify(${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test "split_converter.tgz") - endif() - inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - inference_analysis_test(trt_resnet50_test SRCS trt_resnet50_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - inference_analysis_test(trt_resnext_test SRCS trt_resnext_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - inference_analysis_test(trt_fc_prelu_test SRCS trt_fc_prelu_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - inference_analysis_test(trt_cascade_rcnn_test SRCS trt_cascade_rcnn_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - inference_analysis_test(trt_split_converter_test SRCS trt_split_converter_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TEST_SPLIT_CONVERTER_MODEL}/) - inference_analysis_test(test_analyzer_capi_exp_gpu SRCS analyzer_capi_exp_gpu_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - inference_analysis_test(test_analyzer_capi_exp_xpu SRCS analyzer_capi_exp_xpu_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - - set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") - if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz) - inference_download_and_uncompress_without_verify(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz") - endif() - inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR}) - - set(TRT_MODEL_QUANT_YOLOV3_DIR "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware") - if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware.tgz) - inference_download_and_uncompress_without_verify(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz") - endif() - inference_analysis_test(trt_quant_int8_yolov3_r50_test SRCS trt_quant_int8_yolov3_r50_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_QUANT_YOLOV3_DIR}) - - set(TEST_TRT_DYNAMIC_MODEL2 "${TRT_MODEL_INSTALL_DIR}/complex_model_dynamic") - if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2}/complex_model_dynamic2.tar.gz) - inference_download_and_uncompress_without_verify(${TEST_TRT_DYNAMIC_MODEL2} ${INFERENCE_URL}/tensorrt_test "complex_model_dynamic2.tar.gz") - endif() - - set(TEST_TRT_DYNAMIC_MODEL "${TRT_MODEL_INSTALL_DIR}/conv_bn_swish_split_gelu") - if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL}/conv_bn_swish_split_gelu.tar.gz) - inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL} ${INFERENCE_URL}/tensorrt_test "conv_bn_swish_split_gelu.tar.gz" 2a5e8791e47b221b4f782151d76da9c6) - endif() - inference_analysis_test(trt_dynamic_shape_test SRCS trt_dynamic_shape_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}) - - set(TEST_TRT_ERNIE_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test") - if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4.tar.gz) - inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4.tar.gz" 5fa371efa75706becbaad79195d2ca68) - endif() - - inference_analysis_test(test_trt_dynamic_shape_ernie SRCS trt_dynamic_shape_ernie_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4) - - set(TEST_TRT_TRANSFORMER_PRUNE_MODEL "${TRT_MODEL_INSTALL_DIR}/transformer_prune") - if (NOT EXISTS ${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune.tar.gz) - inference_download_and_uncompress(${TEST_TRT_TRANSFORMER_PRUNE_MODEL} ${INFERENCE_URL}/tensorrt_test "transformer_prune.tar.gz" 77b56dc73ff0cf44ddb1ce9ca0b0f471) - endif() - - inference_analysis_test(test_trt_dynamic_shape_transformer_prune SRCS trt_dynamic_shape_transformer_prune_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) - - if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized.tgz) - inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz" 833d73fc6a7f7e1ee4a1fd6419209e55) - endif() - - inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_serialize_deserialize_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) - - if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized.tgz) - inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_fp16_unserialized.tgz" c5ff2d0cad79953ffbf2b8b9e2fae6e4) - endif() - - inference_analysis_test(test_trt_dynamic_shape_ernie_fp16_ser_deser SRCS trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized) + set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models") + if(NOT EXISTS ${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models.tar.gz) + inference_download_and_uncompress( + ${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test + "trt_inference_test_models.tar.gz" 3dcccdc38b549b6b1b4089723757bd98) + endif() + set(TEST_SPLIT_CONVERTER_MODEL + "${TRT_MODEL_INSTALL_DIR}/trt_split_op_converter_test") + if(NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}/split_converter.tgz) + inference_download_and_uncompress_without_verify( + ${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test + "split_converter.tgz") + endif() + inference_analysis_test( + trt_mobilenet_test + SRCS + trt_mobilenet_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + inference_analysis_test( + trt_resnet50_test + SRCS + trt_resnet50_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + inference_analysis_test( + trt_resnext_test + SRCS + trt_resnext_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + inference_analysis_test( + trt_fc_prelu_test + SRCS + trt_fc_prelu_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + inference_analysis_test( + trt_cascade_rcnn_test + SRCS + trt_cascade_rcnn_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + inference_analysis_test( + trt_split_converter_test + SRCS + trt_split_converter_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TEST_SPLIT_CONVERTER_MODEL}/) + inference_analysis_test( + test_analyzer_capi_exp_gpu + SRCS + analyzer_capi_exp_gpu_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + paddle_inference_c + ARGS + --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + inference_analysis_test( + test_analyzer_capi_exp_xpu + SRCS + analyzer_capi_exp_xpu_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + paddle_inference_c + ARGS + --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + + set(TRT_MODEL_QUANT_RESNET_DIR + "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") + if(NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz) + inference_download_and_uncompress_without_verify( + ${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test + "small_quant_model.tgz") + endif() + inference_analysis_test( + trt_quant_int8_test + SRCS + trt_quant_int8_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TRT_MODEL_QUANT_RESNET_DIR}) + + set(TRT_MODEL_QUANT_YOLOV3_DIR + "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware") + if(NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware.tgz) + inference_download_and_uncompress_without_verify( + ${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test + "yolov3_r50_quant_aware.tgz") + endif() + inference_analysis_test( + trt_quant_int8_yolov3_r50_test + SRCS + trt_quant_int8_yolov3_r50_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TRT_MODEL_QUANT_YOLOV3_DIR}) + + set(TEST_TRT_DYNAMIC_MODEL2 "${TRT_MODEL_INSTALL_DIR}/complex_model_dynamic") + if(NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2}/complex_model_dynamic2.tar.gz) + inference_download_and_uncompress_without_verify( + ${TEST_TRT_DYNAMIC_MODEL2} ${INFERENCE_URL}/tensorrt_test + "complex_model_dynamic2.tar.gz") + endif() + + set(TEST_TRT_DYNAMIC_MODEL + "${TRT_MODEL_INSTALL_DIR}/conv_bn_swish_split_gelu") + if(NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL}/conv_bn_swish_split_gelu.tar.gz) + inference_download_and_uncompress( + ${TEST_TRT_DYNAMIC_MODEL} ${INFERENCE_URL}/tensorrt_test + "conv_bn_swish_split_gelu.tar.gz" 2a5e8791e47b221b4f782151d76da9c6) + endif() + inference_analysis_test( + trt_dynamic_shape_test + SRCS + trt_dynamic_shape_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TRT_MODEL_INSTALL_DIR}) + + set(TEST_TRT_ERNIE_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test") + if(NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4.tar.gz) + inference_download_and_uncompress( + ${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test + "ernie_model_4.tar.gz" 5fa371efa75706becbaad79195d2ca68) + endif() + + inference_analysis_test( + test_trt_dynamic_shape_ernie + SRCS + trt_dynamic_shape_ernie_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4) + + set(TEST_TRT_TRANSFORMER_PRUNE_MODEL + "${TRT_MODEL_INSTALL_DIR}/transformer_prune") + if(NOT EXISTS ${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune.tar.gz) + inference_download_and_uncompress( + ${TEST_TRT_TRANSFORMER_PRUNE_MODEL} ${INFERENCE_URL}/tensorrt_test + "transformer_prune.tar.gz" 77b56dc73ff0cf44ddb1ce9ca0b0f471) + endif() + + inference_analysis_test( + test_trt_dynamic_shape_transformer_prune + SRCS + trt_dynamic_shape_transformer_prune_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) + + if(NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized.tgz) + inference_download_and_uncompress( + ${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test + "ernie_model_4_unserialized.tgz" 833d73fc6a7f7e1ee4a1fd6419209e55) + endif() + + inference_analysis_test( + test_trt_dynamic_shape_ernie_ser_deser + SRCS + trt_dynamic_shape_ernie_serialize_deserialize_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) + + if(NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized.tgz) + inference_download_and_uncompress( + ${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test + "ernie_model_4_fp16_unserialized.tgz" c5ff2d0cad79953ffbf2b8b9e2fae6e4) + endif() + + inference_analysis_test( + test_trt_dynamic_shape_ernie_fp16_ser_deser + SRCS + trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized) endif() set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite") download_data_without_verify(${LITE_MODEL_INSTALL_DIR} "mul_model_fp32.tgz") -inference_analysis_test(lite_mul_model_test SRCS lite_mul_model_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${LITE_MODEL_INSTALL_DIR}) -inference_analysis_test(lite_resnet50_test SRCS lite_resnet50_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${RESNET50_MODEL_DIR}) - -inference_analysis_test(test_analyzer_capi_exp SRCS analyzer_capi_exp_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${RESNET50_MODEL_DIR}/model) - -inference_analysis_test(test_analyzer_capi_exp_pd_config SRCS analyzer_capi_exp_pd_config_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) - -inference_analysis_test(test_analyzer_capi_exp_pd_tensor SRCS analyzer_capi_exp_pd_tensor_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) - -if (NOT APPLE AND NOT WIN32) - inference_analysis_test(test_analyzer_capi_exp_pd_threads SRCS analyzer_capi_exp_pd_threads_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) +inference_analysis_test( + lite_mul_model_test + SRCS + lite_mul_model_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${LITE_MODEL_INSTALL_DIR}) +inference_analysis_test( + lite_resnet50_test + SRCS + lite_resnet50_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${RESNET50_MODEL_DIR}) + +inference_analysis_test( + test_analyzer_capi_exp + SRCS + analyzer_capi_exp_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + paddle_inference_c + ARGS + --infer_model=${RESNET50_MODEL_DIR}/model) + +inference_analysis_test( + test_analyzer_capi_exp_pd_config + SRCS + analyzer_capi_exp_pd_config_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + paddle_inference_c + ARGS + --infer_model=${MOBILENET_INSTALL_DIR}/model) + +inference_analysis_test( + test_analyzer_capi_exp_pd_tensor + SRCS + analyzer_capi_exp_pd_tensor_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + paddle_inference_c + ARGS + --infer_model=${MOBILENET_INSTALL_DIR}/model) + +if(NOT APPLE AND NOT WIN32) + inference_analysis_test( + test_analyzer_capi_exp_pd_threads + SRCS + analyzer_capi_exp_pd_threads_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + paddle_inference_c + ARGS + --infer_model=${MOBILENET_INSTALL_DIR}/model) endif() -inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zerocopy_tensor_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${OCR_INSTALL_DIR}/model) - -if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) - inference_analysis_test(test_analyzer_dist_model SRCS analyzer_dist_model_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${OCR_INSTALL_DIR}/model) +inference_analysis_test( + test_analyzer_zerocopytensor_tensor + SRCS + analyzer_zerocopy_tensor_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${OCR_INSTALL_DIR}/model) + +if(WITH_DISTRIBUTE + AND WITH_PSCORE + AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) + inference_analysis_test( + test_analyzer_dist_model + SRCS + analyzer_dist_model_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${OCR_INSTALL_DIR}/model) endif() -inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_tensor_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt) - +inference_analysis_test( + test_analyzer_paddletensor_tensor + SRCS + analyzer_paddle_tensor_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${OCR_INSTALL_DIR}/model + --infer_data=${OCR_INSTALL_DIR}/data.txt + --refer_result=${OCR_INSTALL_DIR}/result.txt) + if(WITH_MKLDNN) - inference_analysis_test(test_analyzer_capi_exp_int SRCS analyzer_capi_exp_int_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model) + inference_analysis_test( + test_analyzer_capi_exp_int + SRCS + analyzer_capi_exp_int_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + paddle_inference_c + ARGS + --infer_model=${INT8_DATA_DIR}/resnet50/model) endif() -inference_analysis_test(test_analyzer_capi_exp_ner SRCS analyzer_capi_exp_ner_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) +inference_analysis_test( + test_analyzer_capi_exp_ner + SRCS + analyzer_capi_exp_ner_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + paddle_inference_c + ARGS + --infer_model=${CHINESE_NER_INSTALL_DIR}/model) if(WITH_GPU) - inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${RESNET50_MODEL_DIR}) - - inference_analysis_test(paddle_infer_api_copy_tensor_tester SRCS paddle_infer_api_copy_tensor_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${RESNET50_MODEL_DIR}) - set_tests_properties(paddle_infer_api_copy_tensor_tester PROPERTIES TIMEOUT 30) + inference_analysis_test( + paddle_infer_api_test + SRCS + paddle_infer_api_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${RESNET50_MODEL_DIR}) + + inference_analysis_test( + paddle_infer_api_copy_tensor_tester + SRCS + paddle_infer_api_copy_tensor_tester.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${RESNET50_MODEL_DIR}) + set_tests_properties(paddle_infer_api_copy_tensor_tester PROPERTIES TIMEOUT + 30) endif() -cc_test(paddle_infer_api_errors_test SRCS paddle_infer_api_errors_tester.cc DEPS paddle_inference_api) +cc_test( + paddle_infer_api_errors_test + SRCS paddle_infer_api_errors_tester.cc + DEPS paddle_inference_api) if("$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") - return() + return() endif() if(WITH_GPU AND TENSORRT_FOUND) - set_tests_properties(trt_resnext_test PROPERTIES TIMEOUT 300) - set_tests_properties(trt_quant_int8_yolov3_r50_test PROPERTIES TIMEOUT 300) - set_tests_properties(trt_resnet50_test PROPERTIES TIMEOUT 300) - set_tests_properties(trt_cascade_rcnn_test PROPERTIES TIMEOUT 300) - set_tests_properties(test_trt_dynamic_shape_ernie_ser_deser PROPERTIES TIMEOUT 300) - set_tests_properties(test_trt_dynamic_shape_ernie_fp16_ser_deser PROPERTIES TIMEOUT 300) - set_tests_properties(test_trt_dynamic_shape_ernie PROPERTIES TIMEOUT 300) + set_tests_properties(trt_resnext_test PROPERTIES TIMEOUT 300) + set_tests_properties(trt_quant_int8_yolov3_r50_test PROPERTIES TIMEOUT 300) + set_tests_properties(trt_resnet50_test PROPERTIES TIMEOUT 300) + set_tests_properties(trt_cascade_rcnn_test PROPERTIES TIMEOUT 300) + set_tests_properties(test_trt_dynamic_shape_ernie_ser_deser PROPERTIES TIMEOUT + 300) + set_tests_properties(test_trt_dynamic_shape_ernie_fp16_ser_deser + PROPERTIES TIMEOUT 300) + set_tests_properties(test_trt_dynamic_shape_ernie PROPERTIES TIMEOUT 300) endif() if(WITH_MKLDNN) - set_tests_properties(test_analyzer_int8_resnet50 PROPERTIES TIMEOUT 120) - set_tests_properties(test_analyzer_int8_mobilenet_ssd PROPERTIES TIMEOUT 120) - set_tests_properties(test_analyzer_quant_performance_benchmark PROPERTIES TIMEOUT 120) - set_tests_properties(test_analyzer_int8_mobilenetv2 PROPERTIES TIMEOUT 120) - set_tests_properties(test_analyzer_int8_mobilenetv1 PROPERTIES TIMEOUT 120) - set_tests_properties(test_analyzer_int8_mobilenetv3_large PROPERTIES TIMEOUT 120) - set_tests_properties(test_analyzer_quant2_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120) - set_tests_properties(test_analyzer_quant2_resnet50_channelwise_mkldnn PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_int8_resnet50 PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_int8_mobilenet_ssd PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_quant_performance_benchmark + PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_int8_mobilenetv2 PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_int8_mobilenetv1 PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_int8_mobilenetv3_large PROPERTIES TIMEOUT + 120) + set_tests_properties(test_analyzer_quant2_mobilenetv1_mkldnn + PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_quant2_resnet50_channelwise_mkldnn + PROPERTIES TIMEOUT 120) endif() set_tests_properties(lite_resnet50_test PROPERTIES TIMEOUT 120) @@ -809,45 +1265,74 @@ set_tests_properties(test_analyzer_ernie_int8 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_googlenet PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_small_dam PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_transformer PROPERTIES TIMEOUT 120) -set_tests_properties(test_analyzer_mobilenet_depthwise_conv PROPERTIES TIMEOUT 120) -if (WITH_GPU) - set_tests_properties(test_analyzer_bert PROPERTIES TIMEOUT 120) - set_tests_properties(test_analyzer_ernie PROPERTIES TIMEOUT 120) +set_tests_properties(test_analyzer_mobilenet_depthwise_conv PROPERTIES TIMEOUT + 120) +if(WITH_GPU) + set_tests_properties(test_analyzer_bert PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_ernie PROPERTIES TIMEOUT 120) endif() if(WITH_GPU AND TENSORRT_FOUND) - set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 120) - if(WITH_MKLDNN) - set_tests_properties(test_analyzer_bfloat16_resnet50 PROPERTIES TIMEOUT 120) - endif() + set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 120) + if(WITH_MKLDNN) + set_tests_properties(test_analyzer_bfloat16_resnet50 PROPERTIES TIMEOUT 120) + endif() endif() if(ON_INFER OR WITH_GPU) - set_tests_properties(test_analyzer_transformer_profile PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_transformer_profile PROPERTIES TIMEOUT 120) endif() -if (WITH_IPU) - #word2vec sample - set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec/word2vec.inference.model") - inference_analysis_test(ipu_word2vec_sample SRCS ipu_word2vec_sample.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${WORD2VEC_INSTALL_DIR}) - - # ERNIE - set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") - inference_analysis_api_test(ipu_ernie_test ${ERNIE_INSTALL_DIR} ipu_ernie_test.cc - ARGS --warmup=true --repeat=10) - inference_analysis_api_test(ipu_ernie_fp16_test ${ERNIE_INSTALL_DIR} ipu_ernie_fp16_test.cc - ARGS --warmup=true --repeat=10) - - # Resnet50 - set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") - inference_analysis_test(ipu_resnet50_test SRCS ipu_resnet50_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10) - inference_analysis_test(ipu_resnet50_fp16_test SRCS ipu_resnet50_fp16_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10) - - # Only support Resnet50 and Ernie currently - inference_analysis_api_test(ipu_multi_model_profile SRCS ipu_multi_model_profile.cc - ARGS --model_name="Resnet50" --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10) +if(WITH_IPU) + #word2vec sample + set(WORD2VEC_INSTALL_DIR + "${INFERENCE_DEMO_INSTALL_DIR}/word2vec/word2vec.inference.model") + inference_analysis_test( + ipu_word2vec_sample + SRCS + ipu_word2vec_sample.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${WORD2VEC_INSTALL_DIR}) + + # ERNIE + set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") + inference_analysis_api_test(ipu_ernie_test ${ERNIE_INSTALL_DIR} + ipu_ernie_test.cc ARGS --warmup=true --repeat=10) + inference_analysis_api_test( + ipu_ernie_fp16_test ${ERNIE_INSTALL_DIR} ipu_ernie_fp16_test.cc ARGS + --warmup=true --repeat=10) + + # Resnet50 + set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") + inference_analysis_test( + ipu_resnet50_test + SRCS + ipu_resnet50_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${RESNET50_MODEL_DIR} + --warmup=true + --repeat=10) + inference_analysis_test( + ipu_resnet50_fp16_test + SRCS + ipu_resnet50_fp16_test.cc + EXTRA_DEPS + ${INFERENCE_EXTRA_DEPS} + ARGS + --infer_model=${RESNET50_MODEL_DIR} + --warmup=true + --repeat=10) + + # Only support Resnet50 and Ernie currently + inference_analysis_api_test( + ipu_multi_model_profile + SRCS + ipu_multi_model_profile.cc + ARGS + --model_name="Resnet50" + --infer_model=${RESNET50_MODEL_DIR} + --warmup=true + --repeat=10) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc index dcda34c64da5d..ae838955adc02 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc @@ -15,8 +15,10 @@ limitations under the License. */ #include #include #include + #include #include + #include "paddle/fluid/inference/capi_exp/pd_inference_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" @@ -65,7 +67,7 @@ TEST(PD_Config, gpu_interface) { &min_shape_ptr, &max_shape_ptr, &opt_shape_ptr, FALSE); PD_ConfigDisableTensorRtOPs(config, 1, &ops_name); - PD_ConfigEnableTensorRtOSS(config); + PD_ConfigEnableVarseqlen(config); bool oss_enabled = PD_ConfigTensorRtOssEnabled(config); EXPECT_TRUE(oss_enabled); diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc index d3a15cb285772..dfcf5fda4763e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc @@ -15,8 +15,10 @@ limitations under the License. */ #include #include #include + #include #include + #include "paddle/fluid/inference/capi_exp/pd_inference_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc index 4369cd78dfa37..db5406b8ef6af 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc @@ -15,8 +15,10 @@ #include #include #include + #include #include + #include "paddle/fluid/inference/capi_exp/pd_inference_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc index a341ffd7a081c..8b094e8a6cb9b 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc @@ -15,8 +15,10 @@ limitations under the License. */ #include #include #include + #include #include + #include "paddle/fluid/inference/capi_exp/pd_inference_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc index f4017fc5a7f34..33685e6a96060 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc @@ -15,11 +15,13 @@ limitations under the License. */ #include #include #include + #include #include #include #include #include + #include "paddle/fluid/inference/capi_exp/pd_inference_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc index 8951c446b1f83..f59b337d6afe5 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc @@ -15,11 +15,13 @@ limitations under the License. */ #include #include #include + #include #include #include #include #include + #include "paddle/fluid/inference/capi_exp/pd_inference_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc index a84c19de25516..347f0e6e2532a 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc @@ -15,8 +15,10 @@ limitations under the License. */ #include #include #include + #include #include + #include "paddle/fluid/inference/capi_exp/pd_inference_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc index c60e0a25f28c0..524d39854debe 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc @@ -15,8 +15,10 @@ limitations under the License. */ #include #include #include + #include #include + #include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_int_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_int_tester.cc index c0c8ff083de57..cf8582ee778e9 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_int_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_int_tester.cc @@ -15,8 +15,10 @@ limitations under the License. */ #include #include #include + #include #include + #include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_ner_tester.cc index bf0576f9f93b1..b74f51af980db 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_ner_tester.cc @@ -15,8 +15,10 @@ #include #include #include + #include #include + #include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc index a9c24c4503f9f..d0cd55e918e65 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc @@ -15,11 +15,13 @@ limitations under the License. */ #include #include #include + #include #include #include #include #include + #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" @@ -69,8 +71,9 @@ void PD_run() { PD_DeletePaddleTensor(input); int size; const int* out_shape = PD_GetPaddleTensorShape(out_data, &size); - PADDLE_ENFORCE_EQ(size, 2, paddle::platform::errors::InvalidArgument( - "The Output shape's size is NOT match.")); + PADDLE_ENFORCE_EQ(size, 2, + paddle::platform::errors::InvalidArgument( + "The Output shape's size is NOT match.")); std::vector ref_outshape_size({9, 6}); for (int i = 0; i < 2; ++i) { PADDLE_ENFORCE_EQ(out_shape[i], ref_outshape_size[i], diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc index 0b2be0076fdb1..4ff3e27f420be 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc @@ -15,8 +15,10 @@ limitations under the License. */ #include #include #include + #include #include + #include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc index 33a67d8140575..e6a6a8c1037a0 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc @@ -15,8 +15,10 @@ limitations under the License. */ #include #include #include + #include #include + #include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 820bbf0701778..e3bdb98ec522b 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc index 384bef8a4b439..c21785f7ce7a3 100644 --- a/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include + #include "paddle/fluid/inference/tests/api/tester_helper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc index 5333f0052d742..166bdc621c198 100644 --- a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include + #include "paddle/fluid/inference/tests/api/tester_helper.h" DEFINE_string(infer_shape, "", "data shape file"); diff --git a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc index af0a51e4ddbb4..cf3380d0406d0 100644 --- a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/inference/tests/api/tester_helper.h" DEFINE_bool(disable_mkldnn_fc, false, "Disable usage of MKL-DNN's FC op"); diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc index d11b5f0c218f2..c6d266ceb21eb 100644 --- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc index 57ab1b00908b1..18990dba3148e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index bd3a1d737afb1..2b69a15e26a8a 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -148,8 +148,9 @@ TEST(Analyzer_LAC, profile) { "The size of output should be equal to 1.")); size_t size = GetSize(output[0]); size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t); - PADDLE_ENFORCE_GE(size, batch1_size, paddle::platform::errors::Fatal( - "The size of batch is invaild.")); + PADDLE_ENFORCE_GE( + size, batch1_size, + paddle::platform::errors::Fatal("The size of batch is invaild.")); int64_t *pdata = static_cast(output[0].data.data()); for (size_t i = 0; i < batch1_size; ++i) { EXPECT_EQ(pdata[i], lac_ref_data[i]); diff --git a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc index 141e60513eb95..7e754ad93bc3d 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" @@ -211,18 +212,15 @@ std::vector Lexical_Test( } } // nums_infer, nums_label, nums_correct - auto precision = - acc_sum[0] - ? static_cast(acc_sum[2]) / static_cast(acc_sum[0]) - : 0; - auto recall = - acc_sum[1] - ? static_cast(acc_sum[2]) / static_cast(acc_sum[1]) - : 0; - auto f1_score = - acc_sum[2] - ? static_cast(2 * precision * recall) / (precision + recall) - : 0; + auto precision = acc_sum[0] ? static_cast(acc_sum[2]) / + static_cast(acc_sum[0]) + : 0; + auto recall = acc_sum[1] ? static_cast(acc_sum[2]) / + static_cast(acc_sum[1]) + : 0; + auto f1_score = acc_sum[2] ? static_cast(2 * precision * recall) / + (precision + recall) + : 0; LOG(INFO) << "Precision: " << std::fixed << std::setw(6) << std::setprecision(5) << precision; diff --git a/paddle/fluid/inference/tests/api/analyzer_mmp_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mmp_tester.cc index 4a5ec95934a9a..43fed05db133c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mmp_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mmp_tester.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" -#include - // Here add missing commands DEFINE_string(infer_model2, "", "model path"); DEFINE_string(infer_model3, "", "model path"); @@ -96,8 +96,9 @@ void compare(bool use_mkldnn = false) { xx_output.begin(), xx_output.end(), xx2_output.begin(), [](const float& l, const float& r) { return fabs(l - r) < 1e-4; }); - PADDLE_ENFORCE_EQ(result, true, paddle::platform::errors::Fatal( - "Results of model run independently " + PADDLE_ENFORCE_EQ( + result, true, + paddle::platform::errors::Fatal("Results of model run independently " "differs from results of the same model " "run as a sequence of models")); } diff --git a/paddle/fluid/inference/tests/api/analyzer_paddle_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_paddle_tensor_tester.cc index 2eb75c4dc5369..2c02b87ba2be4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_paddle_tensor_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_paddle_tensor_tester.cc @@ -16,9 +16,8 @@ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/inference/utils/singleton.h" - #include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc index 4bb59f3c8df42..1618ba575a26e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index 978aaf1c6a32d..883d946dff54e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -47,8 +47,9 @@ struct DataRecord { num_lines++; std::vector data; split(line, '\t', &data); - PADDLE_ENFORCE_GT(data.size(), 4, paddle::platform::errors::Fatal( - "The size of data is invaild.")); + PADDLE_ENFORCE_GT( + data.size(), 4, + paddle::platform::errors::Fatal("The size of data is invaild.")); // load title1 data std::vector title1_data; split_to_int64(data[0], ' ', &title1_data); diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc index 8f0778b83e52e..1ef5e81e18a38 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc index 099ff1f31a759..5a78d36276cb9 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc index 1fbcbf1a3f427..30cea4f69bdd0 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc index d33b11c389a09..15f4b3a3a5bf0 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc index 0ccd95f2a176d..063d29abee9a2 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h index 5d7f7c290f6a2..ef00c0209738e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h b/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h index e43456ed8322e..a384c75e0bb45 100644 --- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h @@ -15,6 +15,7 @@ #include #include #include + #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index faa15fc4f0a17..0a43d166e93cf 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include + #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc index 029f2f0421d15..08f26bae37bea 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/analyzer_zerocopy_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_zerocopy_tensor_tester.cc index e1ee1b196e4d3..d8ba615c8ed77 100644 --- a/paddle/fluid/inference/tests/api/analyzer_zerocopy_tensor_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_zerocopy_tensor_tester.cc @@ -16,9 +16,8 @@ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/inference/utils/singleton.h" - #include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index b952b62f13ed6..6ef3eb95dd222 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/inference/api/paddle_inference_api.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc b/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc index 1d69069da0716..38cf475d3da6f 100644 --- a/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc +++ b/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include #include "gflags/gflags.h" diff --git a/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc b/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc index 5fde8e6a5e1e6..cbfe8229d31a1 100644 --- a/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc @@ -11,6 +11,7 @@ limitations under the License. */ #include #include + #include #include "gflags/gflags.h" diff --git a/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc b/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc index d38c5c3416351..a0e36e9779da8 100644 --- a/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc +++ b/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc @@ -31,8 +31,8 @@ limitations under the License. */ DEFINE_string(infer_model, "", "Directory of the inference model."); using paddle_infer::Config; -using paddle_infer::Predictor; using paddle_infer::CreatePredictor; +using paddle_infer::Predictor; void inference(std::string model_path, bool use_ipu, std::vector *out_data) { diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc index 9211ea246a5c5..1adbf0ec7a552 100644 --- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc +++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc @@ -14,11 +14,12 @@ limitations under the License. */ #include #include + #include #include // NOLINT #include // NOLINT -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc index 59bbaa2b78fb0..169d0b9987d79 100644 --- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include #include "gflags/gflags.h" diff --git a/paddle/fluid/inference/tests/api/mkldnn_quantizer_config_tester.cc b/paddle/fluid/inference/tests/api/mkldnn_quantizer_config_tester.cc index 4a2527a217f8b..d972945db7d8c 100644 --- a/paddle/fluid/inference/tests/api/mkldnn_quantizer_config_tester.cc +++ b/paddle/fluid/inference/tests/api/mkldnn_quantizer_config_tester.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" @@ -90,9 +91,10 @@ TEST(Mkldnn_quantizer_config, configuration) { PADDLE_ENFORCE_EQ( cfg.mkldnn_quantizer_config()->scale_algo("conv2d", "Input"), - conv2d_scale_algo, platform::errors::InvalidArgument( - "Scale algorithm got from config differs with the " - "one set previously.")); + conv2d_scale_algo, + platform::errors::InvalidArgument( + "Scale algorithm got from config differs with the " + "one set previously.")); PADDLE_ENFORCE_EQ( cfg.mkldnn_quantizer_config()->scale_algo("unknown", "unknown"), diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc index 2be69781c4e60..38bcb7645abb5 100644 --- a/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc +++ b/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc @@ -14,8 +14,10 @@ limitations under the License. */ #include #include + #include #include + #include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/fluid/inference/api/paddle_infer_contrib.h" diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_errors_tester.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_errors_tester.cc index c5a0746c4d760..ab82c82b1e3b3 100644 --- a/paddle/fluid/inference/tests/api/paddle_infer_api_errors_tester.cc +++ b/paddle/fluid/inference/tests/api/paddle_infer_api_errors_tester.cc @@ -15,7 +15,6 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/inference/api/paddle_infer_contrib.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc index 88ebd85c79a13..8cbc410eb5ff3 100644 --- a/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc +++ b/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc @@ -15,10 +15,11 @@ limitations under the License. */ #include #include #include + #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle_infer { diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index f2df018f4978a..d7784a909afd4 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -1081,7 +1081,7 @@ static bool CompareTensor(const framework::LoDTensor &a, } void ConvertFP32toFP16(paddle::PaddleTensor &tensor // NOLINT - ) { +) { int num = 1; for (auto dim : tensor.shape) { num *= dim; @@ -1101,7 +1101,7 @@ void ConvertFP32toFP16(paddle::PaddleTensor &tensor // NOLINT } void ConvertFP16toFP32(paddle::PaddleTensor &tensor // NOLINT - ) { +) { int num = 1; for (auto dim : tensor.shape) { num *= dim; diff --git a/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc b/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc index a1f31c3108ba5..ab059496ad8a7 100644 --- a/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc +++ b/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc index 7e9f71c8b3c0c..b0c4c13dbbc63 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc @@ -22,8 +22,8 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc index 209dd90c48070..f269432d4da1e 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc @@ -22,8 +22,8 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h index 5ae14576dfeb0..3ca62afba1d05 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h @@ -24,8 +24,8 @@ limitations under the License. */ #include #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc index 1058a5b5ec6b8..977c6856f8c08 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" @@ -210,7 +210,11 @@ std::shared_ptr InitPredictor() { config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, opt_input_shape); // erinie varlen must be used with oss - config.EnableTensorRtOSS(); + config.EnableVarseqlen(); + paddle_infer::experimental::InternalUtils::SetTransformerPosid(&config, + input_name2); + paddle_infer::experimental::InternalUtils::SetTransformerMaskid(&config, + input_name3); return paddle_infer::CreatePredictor(config); } @@ -222,13 +226,78 @@ void run(paddle_infer::Predictor* predictor, std::vector* out_data) { int32_t i1[run_seq_len] = { // sentence 1 - 1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321, 4095, 1902, 4, - 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2, 75, 201, 340, 9, 14, 44, - 486, 218, 1140, 279, 12043, 2, + 1, + 3558, + 4, + 75, + 491, + 89, + 340, + 313, + 93, + 4, + 255, + 10, + 75, + 321, + 4095, + 1902, + 4, + 134, + 49, + 75, + 311, + 14, + 44, + 178, + 543, + 15, + 12043, + 2, + 75, + 201, + 340, + 9, + 14, + 44, + 486, + 218, + 1140, + 279, + 12043, + 2, // sentence 2 - 101, 2054, 2234, 2046, 2486, 2044, 1996, 2047, 4552, 2001, 9536, 1029, - 102, 2004, 1997, 2008, 2154, 1010, 1996, 2047, 4552, 9536, 2075, 1996, - 2117, 3072, 2234, 2046, 2486, 1012, 102, + 101, + 2054, + 2234, + 2046, + 2486, + 2044, + 1996, + 2047, + 4552, + 2001, + 9536, + 1029, + 102, + 2004, + 1997, + 2008, + 2154, + 1010, + 1996, + 2047, + 4552, + 9536, + 2075, + 1996, + 2117, + 3072, + 2234, + 2046, + 2486, + 1012, + 102, }; int32_t i2[run_seq_len] = { // sentence 1 diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc index ccdf237ffa54d..4b22bba2bcc97 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc index 2d7aa72a036fd..a238e62fc7cc0 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc b/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc index c0be194493112..93d4a88383c33 100644 --- a/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc +++ b/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_instance_norm_converter_test.cc b/paddle/fluid/inference/tests/api/trt_instance_norm_converter_test.cc index ceb8b99774e48..243be1d33193c 100644 --- a/paddle/fluid/inference/tests/api/trt_instance_norm_converter_test.cc +++ b/paddle/fluid/inference/tests/api/trt_instance_norm_converter_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc index a87bf7b085bd8..bcf8a23b9b922 100644 --- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc +++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc b/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc index ca25967b59a6a..3a884abe88889 100644 --- a/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc +++ b/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc @@ -14,9 +14,10 @@ limitations under the License. */ #include #include + #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_quant_int8_yolov3_r50_test.cc b/paddle/fluid/inference/tests/api/trt_quant_int8_yolov3_r50_test.cc index 1fa24dddead88..d9e1e3f8c9e8a 100644 --- a/paddle/fluid/inference/tests/api/trt_quant_int8_yolov3_r50_test.cc +++ b/paddle/fluid/inference/tests/api/trt_quant_int8_yolov3_r50_test.cc @@ -11,9 +11,10 @@ limitations under the License. */ #include #include + #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_resnet50_test.cc b/paddle/fluid/inference/tests/api/trt_resnet50_test.cc index 2975967e0c0de..cdc6586f1272b 100644 --- a/paddle/fluid/inference/tests/api/trt_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/trt_resnet50_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_resnext_test.cc b/paddle/fluid/inference/tests/api/trt_resnext_test.cc index b525a1b706858..374074957c870 100644 --- a/paddle/fluid/inference/tests/api/trt_resnext_test.cc +++ b/paddle/fluid/inference/tests/api/trt_resnext_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_split_converter_test.cc b/paddle/fluid/inference/tests/api/trt_split_converter_test.cc index c00b36b520bcd..0726db28343bc 100644 --- a/paddle/fluid/inference/tests/api/trt_split_converter_test.cc +++ b/paddle/fluid/inference/tests/api/trt_split_converter_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_test_helper.h b/paddle/fluid/inference/tests/api/trt_test_helper.h index aaa285b2fc2c9..cadf996e071d8 100644 --- a/paddle/fluid/inference/tests/api/trt_test_helper.h +++ b/paddle/fluid/inference/tests/api/trt_test_helper.h @@ -13,13 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once #include + #include #include #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index ad7ef0c04ce67..5aef30bf335c3 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -1,32 +1,34 @@ cmake_minimum_required(VERSION 3.0) project(cpp_inference_demo CXX C) -option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) -option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) -option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF) -option(USE_TENSORRT "Compile demo with TensorRT." OFF) -option(WITH_GTEST "Compile demo with GTEST" OFF) -option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) +option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) +option(WITH_STATIC_LIB + "Compile demo with static/shared library, default use static." OFF) +option(USE_TENSORRT "Compile demo with TensorRT." OFF) +option(WITH_GTEST "Compile demo with GTEST" OFF) +option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) if(NOT WITH_STATIC_LIB) add_definitions("-DPADDLE_WITH_SHARED_LIB") else() - # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode. + # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode. # Set it to empty in static library mode to avoid compilation issues. add_definitions("/DPD_INFER_DECL=") endif() macro(safe_set_static_flag) - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/MD") - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) endmacro() if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") + message( + FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") endif() if(NOT DEFINED DEMO_NAME) message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") @@ -46,7 +48,7 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") link_directories("${PADDLE_LIB}/paddle/lib") -if (WITH_ONNXRUNTIME) +if(WITH_ONNXRUNTIME) include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") @@ -54,21 +56,25 @@ if (WITH_ONNXRUNTIME) link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") endif() -if (WIN32) +if(WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) - if (MSVC_STATIC_CRT) - if (WITH_MKL) + if(MSVC_STATIC_CRT) + if(WITH_MKL) set(FLAG_OPENMP "/openmp") endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4244 /wd4530") - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") + set(CMAKE_C_FLAGS_RELEASE + "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4530") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS_DEBUG + "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS_RELEASE + "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") safe_set_static_flag() - if (WITH_STATIC_LIB) + if(WITH_STATIC_LIB) add_definitions(-DSTATIC_LIB) endif() endif() @@ -81,60 +87,75 @@ endif() if(WITH_GPU) if(NOT WIN32) - set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") + set(CUDA_LIB + "/usr/local/cuda/lib64/" + CACHE STRING "CUDA Library") else() - set(CUDA_LIB "" CACHE STRING "CUDA_LIB") + set(CUDA_LIB + "" + CACHE STRING "CUDA_LIB") if("${CUDA_LIB}" STREQUAL "") if(DEFINED ENV{CUDA_PATH}) set(CUDA_LIB "$ENV{CUDA_PATH}\\lib\\x64") else() - set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\lib\\x64") + set(CUDA_LIB + "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\lib\\x64" + ) endif() endif() message(STATUS "Current CUDA lib path: ${CUDA_LIB}") endif(NOT WIN32) endif() -if (USE_TENSORRT AND WITH_GPU) - set(TENSORRT_ROOT "" CACHE STRING "The root directory of TensorRT library") +if(USE_TENSORRT AND WITH_GPU) + set(TENSORRT_ROOT + "" + CACHE STRING "The root directory of TensorRT library") if("${TENSORRT_ROOT}" STREQUAL "") - message(FATAL_ERROR "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH ") + message( + FATAL_ERROR + "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH " + ) endif() set(TENSORRT_INCLUDE_DIR ${TENSORRT_ROOT}/include) set(TENSORRT_LIB_DIR ${TENSORRT_ROOT}/lib) file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS) - string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" TENSORRT_MINOR_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" TENSORRT_PATCH_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" TENSORRT_BUILD_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" + TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" + TENSORRT_MINOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" + TENSORRT_PATCH_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" + TENSORRT_BUILD_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") if("${TENSORRT_MAJOR_VERSION}" STREQUAL "") - file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h TENSORRT_VERSION_FILE_CONTENTS) - string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" TENSORRT_MINOR_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" TENSORRT_PATCH_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" TENSORRT_BUILD_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") + file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h + TENSORRT_VERSION_FILE_CONTENTS) + string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" + TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" + TENSORRT_MINOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" + TENSORRT_PATCH_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" + TENSORRT_BUILD_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") endif() if("${TENSORRT_MAJOR_VERSION}" STREQUAL "") message(SEND_ERROR "Failed to detect TensorRT version.") endif() string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1" - TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}") + TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}") string(REGEX REPLACE "define NV_TENSORRT_MINOR +([0-9]+)" "\\1" - TENSORRT_MINOR_VERSION "${TENSORRT_MINOR_VERSION}") + TENSORRT_MINOR_VERSION "${TENSORRT_MINOR_VERSION}") string(REGEX REPLACE "define NV_TENSORRT_PATCH +([0-9]+)" "\\1" - TENSORRT_PATCH_VERSION "${TENSORRT_PATCH_VERSION}") + TENSORRT_PATCH_VERSION "${TENSORRT_PATCH_VERSION}") string(REGEX REPLACE "define NV_TENSORRT_BUILD +([0-9]+)" "\\1" - TENSORRT_BUILD_VERSION "${TENSORRT_BUILD_VERSION}") - message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " - "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION} ") + TENSORRT_BUILD_VERSION "${TENSORRT_BUILD_VERSION}") + message( + STATUS + "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " + "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION} " + ) include_directories("${TENSORRT_INCLUDE_DIR}") link_directories("${TENSORRT_LIB_DIR}") add_compile_definitions(NV_TENSORRT_MAJOR=${TENSORRT_MAJOR_VERSION}) @@ -150,8 +171,9 @@ if(WITH_MKL) set(MATH_LIB ${MATH_LIB_PATH}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} ${MATH_LIB_PATH}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) else() - set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} - ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(MATH_LIB + ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") if(EXISTS ${MKLDNN_PATH}) @@ -166,63 +188,97 @@ else() set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas") include_directories("${OPENBLAS_LIB_PATH}/include/openblas") if(WIN32) - set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(MATH_LIB + ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}) else() - set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(MATH_LIB + ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) endif() endif() if(WITH_STATIC_LIB) - set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX} + ) else() if(WIN32) - set(DEPS ${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS + ${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) else() - set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} + ) endif() endif() -if (WITH_ONNXRUNTIME) +if(WITH_ONNXRUNTIME) if(WIN32) - set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx) + set(DEPS + ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib + paddle2onnx) elseif(APPLE) - set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx) + set(DEPS + ${DEPS} + ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib + paddle2onnx) else() - set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx) + set(DEPS + ${DEPS} + ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 + paddle2onnx) endif() endif() -if (NOT WIN32) +if(NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") - set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf xxhash cryptopp + set(DEPS + ${DEPS} + ${MATH_LIB} + ${MKLDNN_LIB} + glog + gflags + protobuf + xxhash + cryptopp ${EXTERNAL_LIB}) else() - set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - glog gflags_static libprotobuf xxhash cryptopp-static ${EXTERNAL_LIB}) + set(DEPS + ${DEPS} + ${MATH_LIB} + ${MKLDNN_LIB} + glog + gflags_static + libprotobuf + xxhash + cryptopp-static + ${EXTERNAL_LIB}) set(DEPS ${DEPS} shlwapi.lib) endif(NOT WIN32) if(WITH_GPU) if(NOT WIN32) - if (USE_TENSORRT) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) + if(USE_TENSORRT) + set(DEPS ${DEPS} + ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS + ${DEPS} + ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) else() if(USE_TENSORRT) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) if(${TENSORRT_MAJOR_VERSION} EQUAL 7) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_STATIC_LIBRARY_SUFFIX}) endif() endif() - set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) - set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) - set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX}) endif() endif() @@ -237,11 +293,14 @@ if(WITH_GTEST) include(GNUInstallDirs) include_directories(${GTEST_INSTALL_DIR}/include) add_dependencies(${DEMO_NAME} thirdparty_gtest) - IF(WIN32) + if(WIN32) target_link_libraries(${DEMO_NAME} ${GTEST_LIBRARIES}) - ELSE() - target_link_libraries(${DEMO_NAME} ${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest${CMAKE_STATIC_LIBRARY_SUFFIX}) - ENDIF(WIN32) + else() + target_link_libraries( + ${DEMO_NAME} + ${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest${CMAKE_STATIC_LIBRARY_SUFFIX} + ) + endif(WIN32) endif() if(WIN32) if("${CMAKE_GENERATOR}" MATCHES "Ninja") @@ -251,41 +310,62 @@ if(WIN32) endif() if(USE_TENSORRT) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX} - ${LIB_PATH} - COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX} - ${LIB_PATH} - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND + ${CMAKE_COMMAND} -E copy + ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX} ${LIB_PATH} + COMMAND + ${CMAKE_COMMAND} -E copy + ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX} + ${LIB_PATH}) if(${TENSORRT_MAJOR_VERSION} EQUAL 7) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_SHARED_LIBRARY_SUFFIX} - ${LIB_PATH}) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND + ${CMAKE_COMMAND} -E copy + ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_SHARED_LIBRARY_SUFFIX} + ${LIB_PATH}) endif() endif() if(WITH_MKL) message("LIB_PATH IS ${LIB_PATH}") - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll ${LIB_PATH} - COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${LIB_PATH} - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${LIB_PATH} - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll + ${LIB_PATH} + COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll + ${LIB_PATH} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll + ${LIB_PATH}) else() - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${LIB_PATH} - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll + ${LIB_PATH}) endif() if(WITH_ONNXRUNTIME) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll - ${LIB_PATH} - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll - ${LIB_PATH} - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND + ${CMAKE_COMMAND} -E copy + ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll + ${LIB_PATH} + COMMAND + ${CMAKE_COMMAND} -E copy + ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll + ${LIB_PATH}) endif() if(NOT WITH_STATIC_LIB) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${LIB_PATH} - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${LIB_PATH}) endif() endif() diff --git a/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake b/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake index b38984314ec85..49b0a04197d12 100644 --- a/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake +++ b/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake @@ -1,43 +1,50 @@ find_package(Git REQUIRED) message("${CMAKE_BUILD_TYPE}") -SET(GTEST_PREFIX_DIR ${CMAKE_CURRENT_BINARY_DIR}/gtest) -SET(GTEST_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/gtest/src/extern_gtest) -SET(GTEST_INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/install/gtest) -SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE) -set(GTEST_REPOSITORY https://github.com/google/googletest.git) -set(GTEST_TAG release-1.8.1) -INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR}) -IF(WIN32) - # if use CMAKE_INSTALL_LIBDIR, the path of lib actually is install/gtest/lib/gtest.lib but GTEST_LIBRARIES - # is install/gtest/gtest.lib - set(GTEST_LIBRARIES - "${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE) - set(GTEST_MAIN_LIBRARIES - "${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE) -ELSE() - set(GTEST_LIBRARIES - "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE) - set(GTEST_MAIN_LIBRARIES - "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE) -ENDIF(WIN32) +set(GTEST_PREFIX_DIR ${CMAKE_CURRENT_BINARY_DIR}/gtest) +set(GTEST_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/gtest/src/extern_gtest) +set(GTEST_INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/install/gtest) +set(GTEST_INCLUDE_DIR + "${GTEST_INSTALL_DIR}/include" + CACHE PATH "gtest include directory." FORCE) +set(GTEST_REPOSITORY https://github.com/google/googletest.git) +set(GTEST_TAG release-1.8.1) +include_directories(${GTEST_INCLUDE_DIR}) +if(WIN32) + # if use CMAKE_INSTALL_LIBDIR, the path of lib actually is install/gtest/lib/gtest.lib but GTEST_LIBRARIES + # is install/gtest/gtest.lib + set(GTEST_LIBRARIES + "${GTEST_INSTALL_DIR}/lib/gtest.lib" + CACHE FILEPATH "gtest libraries." FORCE) + set(GTEST_MAIN_LIBRARIES + "${GTEST_INSTALL_DIR}/lib/gtest_main.lib" + CACHE FILEPATH "gtest main libraries." FORCE) +else() + set(GTEST_LIBRARIES + "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a" + CACHE FILEPATH "gtest libraries." FORCE) + set(GTEST_MAIN_LIBRARIES + "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" + CACHE FILEPATH "gtest main libraries." FORCE) +endif(WIN32) ExternalProject_Add( - extern_gtest - PREFIX gtest - GIT_REPOSITORY ${GTEST_REPOSITORY} - GIT_TAG ${GTEST_TAG} - DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release - BUILD_BYPRODUCTS ${GTEST_LIBRARIES} - BUILD_BYPRODUCTS ${GTEST_MAIN_LIBRARIES} -) + extern_gtest + PREFIX gtest + GIT_REPOSITORY ${GTEST_REPOSITORY} + GIT_TAG ${GTEST_TAG} + DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=Release + BUILD_BYPRODUCTS ${GTEST_LIBRARIES} + BUILD_BYPRODUCTS ${GTEST_MAIN_LIBRARIES}) -ADD_LIBRARY(thirdparty_gtest STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET thirdparty_gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES}) -ADD_DEPENDENCIES(thirdparty_gtest extern_gtest) +add_library(thirdparty_gtest STATIC IMPORTED GLOBAL) +set_property(TARGET thirdparty_gtest PROPERTY IMPORTED_LOCATION + ${GTEST_LIBRARIES}) +add_dependencies(thirdparty_gtest extern_gtest) -ADD_LIBRARY(thirdparty_gtest_main STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET thirdparty_gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES}) -ADD_DEPENDENCIES(thirdparty_gtest_main extern_gtest) +add_library(thirdparty_gtest_main STATIC IMPORTED GLOBAL) +set_property(TARGET thirdparty_gtest_main PROPERTY IMPORTED_LOCATION + ${GTEST_MAIN_LIBRARIES}) +add_dependencies(thirdparty_gtest_main extern_gtest) diff --git a/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc b/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc index 4e924e3197965..53edc554ebaf8 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc @@ -68,7 +68,7 @@ std::shared_ptr InitPredictor() { config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, opt_input_shape); // erinie varlen must be used with oss - config.EnableTensorRtOSS(); + config.EnableVarseqlen(); return CreatePredictor(config); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_suite.h b/paddle/fluid/inference/tests/infer_ut/test_suite.h index a5c8c52402180..8737afa809933 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_suite.h +++ b/paddle/fluid/inference/tests/infer_ut/test_suite.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once #include + #include #include #include @@ -26,7 +27,6 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/include/paddle_inference_api.h" namespace paddle { @@ -64,7 +64,7 @@ void SingleThreadPrediction(paddle_infer::Predictor *predictor, int repeat_times = 2) { // prepare input tensor auto input_names = predictor->GetInputNames(); - for (const auto & [ key, value ] : *input_data_map) { + for (const auto &[key, value] : *input_data_map) { switch (value.type) { case paddle::PaddleDType::INT64: { std::vector input_value = @@ -150,7 +150,7 @@ void SingleThreadPrediction(paddle_infer::Predictor *predictor, void CompareRecord(std::map *truth_output_data, std::map *infer_output_data, float epislon = 1e-5) { - for (const auto & [ key, value ] : *infer_output_data) { + for (const auto &[key, value] : *infer_output_data) { auto truth_record = (*truth_output_data)[key]; VLOG(1) << "output name: " << key; size_t numel = value.data.size() / sizeof(float); @@ -190,7 +190,7 @@ double SingleThreadProfile(paddle_infer::Predictor *predictor, int repeat_times = 2) { // prepare input tensor auto input_names = predictor->GetInputNames(); - for (const auto & [ key, value ] : *input_data_map) { + for (const auto &[key, value] : *input_data_map) { switch (value.type) { case paddle::PaddleDType::INT64: { std::vector input_value = diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 6b6c0cd22f03b..d4b3ebdaa0b7f 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -1,26 +1,33 @@ include(ExternalProject) -set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url") -set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING - "A path setting inference demo download directories.") -set(CPU_NUM_THREADS_ON_CI 4 CACHE STRING "Run multi-threads on CI to reduce CI time.") -set(WARMUP_BATCH_SIZE 100 CACHE STRING "Default warmup_batch_size.") +set(INFERENCE_URL + "http://paddle-inference-dist.bj.bcebos.com" + CACHE STRING "inference download url") +set(INFERENCE_DEMO_INSTALL_DIR + "${THIRD_PARTY_PATH}/inference_demo" + CACHE STRING "A path setting inference demo download directories.") +set(CPU_NUM_THREADS_ON_CI + 4 + CACHE STRING "Run multi-threads on CI to reduce CI time.") +set(WARMUP_BATCH_SIZE + 100 + CACHE STRING "Default warmup_batch_size.") function(inference_download INSTALL_DIR URL FILENAME) message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME}) ExternalProject_Add( - extern_inference_download_${FILENAME_EX} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${INSTALL_DIR} - URL ${URL}/${FILENAME} - DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} - DOWNLOAD_DIR ${INSTALL_DIR} - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND "" - ) + extern_inference_download_${FILENAME_EX} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${INSTALL_DIR} + URL ${URL}/${FILENAME} + DOWNLOAD_COMMAND wget --no-check-certificate -q -O + ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} + DOWNLOAD_DIR ${INSTALL_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND "") endfunction() function(inference_download_and_uncompress INSTALL_DIR URL FILENAME CHECK_SUM) @@ -30,93 +37,101 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME CHECK_SUM) set(EXTERNAL_PROJECT_NAME "extern_download_${FILENAME_EX}") set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}") ExternalProject_Add( - ${EXTERNAL_PROJECT_NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${INSTALL_DIR} - URL ${URL}/${FILENAME} - URL_HASH MD5=${CHECK_SUM} - DOWNLOAD_DIR ${INSTALL_DIR} - DOWNLOAD_NO_EXTRACT 1 - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR} - ${CMAKE_COMMAND} -E tar xzf ${DOWNLOAD_NAME} - UPDATE_COMMAND "" - INSTALL_COMMAND "" - ) + ${EXTERNAL_PROJECT_NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${INSTALL_DIR} + URL ${URL}/${FILENAME} + URL_HASH MD5=${CHECK_SUM} + DOWNLOAD_DIR ${INSTALL_DIR} + DOWNLOAD_NO_EXTRACT 1 + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR} ${CMAKE_COMMAND} -E + tar xzf ${DOWNLOAD_NAME} + UPDATE_COMMAND "" + INSTALL_COMMAND "") endfunction() -function(inference_download_and_uncompress_without_verify INSTALL_DIR URL FILENAME) +function(inference_download_and_uncompress_without_verify INSTALL_DIR URL + FILENAME) message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") string(REGEX REPLACE "[-%./\\]" "_" FILENAME_EX ${FILENAME}) string(REGEX MATCH "[^/\\]+$" DOWNLOAD_NAME ${FILENAME}) set(EXTERNAL_PROJECT_NAME "extern_download_${FILENAME_EX}") set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}") ExternalProject_Add( - ${EXTERNAL_PROJECT_NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${INSTALL_DIR} - URL ${URL}/${FILENAME} - DOWNLOAD_DIR ${INSTALL_DIR} - DOWNLOAD_NO_EXTRACT 1 - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR} - ${CMAKE_COMMAND} -E tar xzf ${DOWNLOAD_NAME} - UPDATE_COMMAND "" - INSTALL_COMMAND "" - ) + ${EXTERNAL_PROJECT_NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${INSTALL_DIR} + URL ${URL}/${FILENAME} + DOWNLOAD_DIR ${INSTALL_DIR} + DOWNLOAD_NO_EXTRACT 1 + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR} ${CMAKE_COMMAND} -E + tar xzf ${DOWNLOAD_NAME} + UPDATE_COMMAND "" + INSTALL_COMMAND "") endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}/word2vec.inference.model.tar.gz) - inference_download_and_uncompress_without_verify(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") + inference_download_and_uncompress_without_verify( + ${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") -set(IMG_CLS_RESNET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/image_classification_resnet") -if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model.tgz) - inference_download_and_uncompress_without_verify(${IMG_CLS_RESNET_INSTALL_DIR} ${INFERENCE_URL} "image_classification_resnet.inference.model.tgz") +set(IMG_CLS_RESNET_INSTALL_DIR + "${INFERENCE_DEMO_INSTALL_DIR}/image_classification_resnet") +if(NOT EXISTS + ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model.tgz +) + inference_download_and_uncompress_without_verify( + ${IMG_CLS_RESNET_INSTALL_DIR} ${INFERENCE_URL} + "image_classification_resnet.inference.model.tgz") endif() -set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model") +set(IMG_CLS_RESNET_MODEL_DIR + "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model") if(WITH_ONNXRUNTIME) set(MOBILENETV2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/MobileNetV2") if(NOT EXISTS ${MOBILENETV2_INSTALL_DIR}/MobileNetV2.inference.model.tar.gz) - inference_download_and_uncompress_without_verify(${MOBILENETV2_INSTALL_DIR} ${INFERENCE_URL} "MobileNetV2.inference.model.tar.gz") + inference_download_and_uncompress_without_verify( + ${MOBILENETV2_INSTALL_DIR} ${INFERENCE_URL} + "MobileNetV2.inference.model.tar.gz") endif() set(MOBILENETV2_MODEL_DIR "${MOBILENETV2_INSTALL_DIR}/MobileNetV2") endif() -function (inference_base_test_build TARGET) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cc_test_build(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS}) +function(inference_base_test_build TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + cc_test_build(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS}) endfunction() -function (inference_base_test_run TARGET) - set(options "") - set(oneValueArgs "") - set(multiValueArgs COMMAND ARGS) - cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if(WITH_GPU) - set(mem_opt "--fraction_of_gpu_memory_to_use=0.5") - endif() - cc_test_run(${TARGET} COMMAND ${base_test_COMMAND} ARGS ${mem_opt} ${base_test_ARGS}) +function(inference_base_test_run TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs COMMAND ARGS) + cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + if(WITH_GPU) + set(mem_opt "--fraction_of_gpu_memory_to_use=0.5") + endif() + cc_test_run(${TARGET} COMMAND ${base_test_COMMAND} ARGS ${mem_opt} + ${base_test_ARGS}) endfunction() -function (inference_base_test TARGET) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS ARGS DEPS) - cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - inference_base_test_build(${TARGET} - SRCS ${base_test_SRCS} - DEPS ${base_test_DEPS}) - inference_base_test_run(${TARGET} - COMMAND ${TARGET} - ARGS ${base_test_ARGS}) +function(inference_base_test TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS ARGS DEPS) + cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + inference_base_test_build(${TARGET} SRCS ${base_test_SRCS} DEPS + ${base_test_DEPS}) + inference_base_test_run(${TARGET} COMMAND ${TARGET} ARGS ${base_test_ARGS}) endfunction() - diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index 9a495194a8ac1..a32a61842a5ec 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -1,8 +1,23 @@ -cc_library(benchmark SRCS benchmark.cc DEPS enforce) -cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) -cc_library(infer_io_utils SRCS io_utils.cc DEPS paddle_inference_api lod_tensor shape_range_info_proto) -cc_test(infer_io_utils_tester SRCS io_utils_tester.cc DEPS infer_io_utils) +cc_library( + benchmark + SRCS benchmark.cc + DEPS enforce) +cc_test( + test_benchmark + SRCS benchmark_tester.cc + DEPS benchmark) +cc_library( + infer_io_utils + SRCS io_utils.cc + DEPS paddle_inference_api lod_tensor shape_range_info_proto) +cc_test( + infer_io_utils_tester + SRCS io_utils_tester.cc + DEPS infer_io_utils) cc_library(table_printer SRCS table_printer.cc) -cc_test(test_table_printer SRCS table_printer_tester.cc DEPS table_printer) +cc_test( + test_table_printer + SRCS table_printer_tester.cc + DEPS table_printer) proto_library(shape_range_info_proto SRCS shape_range_info.proto) diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc index 0c48c2db9b691..8f7614cb10a44 100644 --- a/paddle/fluid/inference/utils/benchmark_tester.cc +++ b/paddle/fluid/inference/utils/benchmark_tester.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/utils/benchmark.h" #include #include +#include "paddle/fluid/inference/utils/benchmark.h" + using namespace paddle::inference; // NOLINT TEST(Benchmark, basic) { Benchmark benchmark; diff --git a/paddle/fluid/inference/utils/io_utils.cc b/paddle/fluid/inference/utils/io_utils.cc index 87331e1978f95..425c67d2fd240 100644 --- a/paddle/fluid/inference/utils/io_utils.cc +++ b/paddle/fluid/inference/utils/io_utils.cc @@ -158,8 +158,9 @@ void SerializePDTensorsToFile(const std::string &path, void DeserializePDTensorsToFile(const std::string &path, std::vector *tensors) { bool is_present = analysis::FileExists(path); - PADDLE_ENFORCE_EQ(is_present, true, platform::errors::InvalidArgument( - "Cannot open %s to read", path)); + PADDLE_ENFORCE_EQ( + is_present, true, + platform::errors::InvalidArgument("Cannot open %s to read", path)); std::ifstream fin(path, std::ios::binary); DeserializePDTensorsToStream(fin, tensors); fin.close(); diff --git a/paddle/fluid/inference/utils/io_utils_tester.cc b/paddle/fluid/inference/utils/io_utils_tester.cc index ffd97232652fd..e8ebb72acc322 100644 --- a/paddle/fluid/inference/utils/io_utils_tester.cc +++ b/paddle/fluid/inference/utils/io_utils_tester.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/utils/io_utils.h" #include #include + #include + #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/utils/io_utils.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h index 6828924c300fd..5fccd3458a1d0 100644 --- a/paddle/fluid/inference/utils/singleton.h +++ b/paddle/fluid/inference/utils/singleton.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/inference/utils/table_printer_tester.cc b/paddle/fluid/inference/utils/table_printer_tester.cc index f56d2527d730c..fc482807b2854 100644 --- a/paddle/fluid/inference/utils/table_printer_tester.cc +++ b/paddle/fluid/inference/utils/table_printer_tester.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/utils/table_printer.h" #include #include +#include "paddle/fluid/inference/utils/table_printer.h" + namespace paddle { namespace inference {} // namespace inference } // namespace paddle @@ -43,7 +44,7 @@ TEST(table_printer, output) { table.InsertRow({"trt_precision", "fp32"}); table.InsertRow({"enable_dynamic_shape", "true"}); table.InsertRow({"DisableTensorRtOPs", "{}"}); - table.InsertRow({"EnableTensorRtOSS", "ON"}); + table.InsertRow({"EnableVarseqlen", "ON"}); table.InsertRow({"tensorrt_dla_enabled", "ON"}); table.InsetDivider(); diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 53e7993945586..1f72482eef777 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -1,57 +1,89 @@ add_subdirectory(detail) add_subdirectory(allocation) -if (WITH_MKLDNN) - set(MKLDNN_CTX_DEPS mkldnn) -else () - set(MKLDNN_CTX_DEPS) +if(WITH_MKLDNN) + set(MKLDNN_CTX_DEPS mkldnn) +else() + set(MKLDNN_CTX_DEPS) endif() -cc_library(malloc SRCS malloc.cc DEPS - place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS}) -cc_library(memcpy SRCS memcpy.cc DEPS place device_context) -cc_library(stats SRCS stats.cc DEPS enforce) +cc_library( + malloc + SRCS malloc.cc + DEPS place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS}) +cc_library( + memcpy + SRCS memcpy.cc + DEPS place device_context) +cc_library( + stats + SRCS stats.cc + DEPS enforce) cc_library(memory DEPS malloc memcpy stats) -cc_test(memory_stats_test SRCS memory_stats_test.cc DEPS memory) -cc_test(stats_test SRCS stats_test.cc DEPS stats) - -if (WITH_GPU) - nv_test(malloc_test - SRCS malloc_test.cu - DEPS device_context malloc) - nv_test(stream_safe_cuda_alloc_test - SRCS stream_safe_cuda_alloc_test.cu - DEPS malloc cuda_graph_with_memory_pool) - nv_test(cuda_managed_memory_test - SRCS cuda_managed_memory_test.cu - DEPS malloc gpu_info place) - - if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test) - set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES - ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true;FLAGS_allocator_strategy=auto_growth") - endif() +cc_test( + memory_stats_test + SRCS memory_stats_test.cc + DEPS memory) +cc_test( + stats_test + SRCS stats_test.cc + DEPS stats) + +if(WITH_GPU) + nv_test( + malloc_test + SRCS malloc_test.cu + DEPS device_context malloc) + nv_test( + stream_safe_cuda_alloc_test + SRCS stream_safe_cuda_alloc_test.cu + DEPS malloc cuda_graph_with_memory_pool) + nv_test( + cuda_managed_memory_test + SRCS cuda_managed_memory_test.cu + DEPS malloc gpu_info place) + + if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test) + set_tests_properties( + stream_safe_cuda_alloc_test + PROPERTIES + ENVIRONMENT + "FLAGS_use_stream_safe_cuda_allocator=true;FLAGS_allocator_strategy=auto_growth" + ) + endif() endif() -if (WITH_ROCM) - hip_test(malloc_test - SRCS malloc_test.cu - DEPS device_context malloc) - hip_test(cuda_managed_memory_test - SRCS cuda_managed_memory_test.cu - DEPS malloc gpu_info place) +if(WITH_ROCM) + hip_test( + malloc_test + SRCS malloc_test.cu + DEPS device_context malloc) + hip_test( + cuda_managed_memory_test + SRCS cuda_managed_memory_test.cu + DEPS malloc gpu_info place) endif() if(WITH_TESTING AND TEST cuda_managed_memory_test) -set_tests_properties(cuda_managed_memory_test PROPERTIES - ENVIRONMENT "FLAGS_use_cuda_managed_memory=true;FLAGS_allocator_strategy=auto_growth" - TIMEOUT 50) + set_tests_properties( + cuda_managed_memory_test + PROPERTIES + ENVIRONMENT + "FLAGS_use_cuda_managed_memory=true;FLAGS_allocator_strategy=auto_growth" + TIMEOUT 50) endif() -if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") - nv_test(get_base_ptr_test SRCS get_base_ptr_test.cu DEPS malloc gpu_info) - set_tests_properties(get_base_ptr_test PROPERTIES - ENVIRONMENT "FLAGS_allocator_strategy=auto_growth; +if(WITH_GPU + AND WITH_TESTING + AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + nv_test( + get_base_ptr_test + SRCS get_base_ptr_test.cu + DEPS malloc gpu_info) + set_tests_properties( + get_base_ptr_test + PROPERTIES ENVIRONMENT "FLAGS_allocator_strategy=auto_growth; FLAGS_use_stream_safe_cuda_allocator=true;") endif() diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 5af13f76b36bd..109afd06f4df1 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -1,137 +1,264 @@ -cc_library(allocator SRCS allocator.cc DEPS place stats) -cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator) -cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) -cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) -cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) -cc_library(naive_best_fit_allocator SRCS naive_best_fit_allocator.cc DEPS allocator buddy_allocator profiler) -cc_test(naive_best_fit_allocator_test SRCS naive_best_fit_allocator_test.cc DEPS naive_best_fit_allocator) -cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator) - -if (WITH_MKLDNN) +cc_library( + allocator + SRCS allocator.cc + DEPS place stats) +cc_library( + cpu_allocator + SRCS cpu_allocator.cc + DEPS allocator) +cc_library( + locked_allocator + SRCS locked_allocator.cc + DEPS allocator) +cc_library( + buffered_allocator + SRCS buffered_allocator.cc + DEPS allocator) +cc_library( + best_fit_allocator + SRCS best_fit_allocator.cc + DEPS allocator) +cc_library( + naive_best_fit_allocator + SRCS naive_best_fit_allocator.cc + DEPS allocator buddy_allocator profiler) +cc_test( + naive_best_fit_allocator_test + SRCS naive_best_fit_allocator_test.cc + DEPS naive_best_fit_allocator) +cc_test( + buffered_allocator_test + SRCS buffered_allocator_test.cc + DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator) + +if(WITH_MKLDNN) set(MKLDNN_CTX_DEPS mkldnn) -else () +else() set(MKLDNN_CTX_DEPS) endif() -if (WITH_GPU) - nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats) - nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) - nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) - nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph) - nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) +if(WITH_GPU) + nv_library( + cuda_allocator + SRCS cuda_allocator.cc + DEPS allocator cuda_device_guard stats) + nv_library( + cuda_managed_allocator + SRCS cuda_managed_allocator.cc + DEPS allocator cuda_device_guard gpu_info) + nv_library( + pinned_allocator + SRCS pinned_allocator.cc + DEPS allocator) + nv_library( + stream_safe_cuda_allocator + SRCS stream_safe_cuda_allocator.cc + DEPS allocator cuda_graph) + nv_library( + thread_local_allocator + SRCS thread_local_allocator.cc + DEPS allocator) - cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) + cc_test( + thread_local_allocator_test + SRCS thread_local_allocator_test.cc + DEPS thread_local_allocator) if(CUDA_VERSION GREATER_EQUAL 10.2) - nv_library(cuda_virtual_mem_allocator SRCS cuda_virtual_mem_allocator.cc DEPS dynload_cuda) + nv_library( + cuda_virtual_mem_allocator + SRCS cuda_virtual_mem_allocator.cc + DEPS dynload_cuda) endif() endif() -if (WITH_ROCM) - hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats) - hip_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) - hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) - hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) - hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) - - cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) +if(WITH_ROCM) + hip_library( + cuda_allocator + SRCS cuda_allocator.cc + DEPS allocator cuda_device_guard stats) + hip_library( + cuda_managed_allocator + SRCS cuda_managed_allocator.cc + DEPS allocator cuda_device_guard gpu_info) + hip_library( + pinned_allocator + SRCS pinned_allocator.cc + DEPS allocator) + hip_library( + stream_safe_cuda_allocator + SRCS stream_safe_cuda_allocator.cc + DEPS allocator) + hip_library( + thread_local_allocator + SRCS thread_local_allocator.cc + DEPS allocator) + + cc_test( + thread_local_allocator_test + SRCS thread_local_allocator_test.cc + DEPS thread_local_allocator) endif() -if (WITH_ASCEND_CL) - cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info) - cc_library(npu_pinned_allocator SRCS npu_pinned_allocator.cc DEPS allocator npu_info) +if(WITH_ASCEND_CL) + cc_library( + npu_allocator + SRCS npu_allocator.cc + DEPS allocator npu_info) + cc_library( + npu_pinned_allocator + SRCS npu_pinned_allocator.cc + DEPS allocator npu_info) endif() -cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) +cc_library( + retry_allocator + SRCS retry_allocator.cc + DEPS allocator) -if (WITH_GPU OR WITH_ROCM) - set(AllocatorFacadeDeps gpu_info cuda_allocator cuda_managed_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator device_context) - if(CUDA_VERSION GREATER_EQUAL 10.2) - list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator) - endif() +if(WITH_GPU OR WITH_ROCM) + set(AllocatorFacadeDeps + gpu_info + cuda_allocator + cuda_managed_allocator + pinned_allocator + cuda_device_guard + thread_local_allocator + stream_safe_cuda_allocator + device_context) + if(CUDA_VERSION GREATER_EQUAL 10.2) + list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator) + endif() elseif(WITH_XPU) - set(AllocatorFacadeDeps xpu_info) + set(AllocatorFacadeDeps xpu_info) elseif(WITH_IPU) - set(AllocatorFacadeDeps ipu_info) + set(AllocatorFacadeDeps ipu_info) elseif(WITH_ASCEND) - set(AllocatorFacadeDeps ascend_npu_info) -else () - set(AllocatorFacadeDeps) + set(AllocatorFacadeDeps ascend_npu_info) +else() + set(AllocatorFacadeDeps) endif() -if (WITH_CUSTOM_DEVICE) - cc_library(custom_allocator SRCS custom_allocator.cc DEPS allocator device_manager) +if(WITH_CUSTOM_DEVICE) + cc_library( + custom_allocator + SRCS custom_allocator.cc + DEPS allocator device_manager) set(AllocatorFacadeDeps ${AllocatorFacadeDeps} custom_allocator) endif() -if (WITH_GPU) - nv_test(best_fit_allocator_test - SRCS best_fit_allocator_test.cc - best_fit_allocator_test.cu - DEPS best_fit_allocator - locked_allocator - cpu_allocator - cuda_allocator - device_context - memcpy) -elseif (WITH_ROCM) - hip_test(best_fit_allocator_test - SRCS best_fit_allocator_test.cc - best_fit_allocator_test.cu - DEPS best_fit_allocator - locked_allocator - cpu_allocator - cuda_allocator - device_context - memcpy) +if(WITH_GPU) + nv_test( + best_fit_allocator_test + SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu + DEPS best_fit_allocator locked_allocator cpu_allocator cuda_allocator + device_context memcpy) +elseif(WITH_ROCM) + hip_test( + best_fit_allocator_test + SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu + DEPS best_fit_allocator locked_allocator cpu_allocator cuda_allocator + device_context memcpy) else() - cc_test(best_fit_allocator_test - SRCS best_fit_allocator_test.cc - DEPS best_fit_allocator - locked_allocator - cpu_allocator) + cc_test( + best_fit_allocator_test + SRCS best_fit_allocator_test.cc + DEPS best_fit_allocator locked_allocator cpu_allocator) endif() -list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator virtual_memory_auto_growth_best_fit_allocator best_fit_allocator) +list( + APPEND + AllocatorFacadeDeps + cpu_allocator + locked_allocator + aligned_allocator + retry_allocator + buffered_allocator + naive_best_fit_allocator + auto_growth_best_fit_allocator + virtual_memory_auto_growth_best_fit_allocator + best_fit_allocator) -if (WITH_ASCEND_CL) - list(APPEND AllocatorFacadeDeps npu_pinned_allocator) +if(WITH_ASCEND_CL) + list(APPEND AllocatorFacadeDeps npu_pinned_allocator) endif() +cc_library( + aligned_allocator + SRCS aligned_allocator.cc + DEPS allocator) +cc_test( + test_aligned_allocator + SRCS test_aligned_allocator.cc + DEPS aligned_allocator) +cc_library( + allocator_strategy + SRCS allocator_strategy.cc + DEPS gflags ${AllocatorFacadeDeps}) +cc_library( + allocator_facade + SRCS allocator_facade.cc + DEPS allocator_strategy stats) -cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) -cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) -cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy stats) - -if (WITH_GPU) +if(WITH_GPU) target_link_libraries(allocator_facade cuda_graph) endif() -cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator) -if (WITH_TESTING) - if ((WITH_GPU OR WITH_ROCM) AND TARGET retry_allocator_test) +cc_test( + retry_allocator_test + SRCS retry_allocator_test.cc + DEPS retry_allocator locked_allocator cpu_allocator) +if(WITH_TESTING) + if((WITH_GPU OR WITH_ROCM) AND TARGET retry_allocator_test) target_link_libraries(retry_allocator_test cuda_allocator) endif() - if (TEST retry_allocator_test) - set_tests_properties(retry_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") + if(TEST retry_allocator_test) + set_tests_properties(retry_allocator_test PROPERTIES LABELS + "RUN_TYPE=EXCLUSIVE") endif() endif() -cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade) +cc_test( + allocator_facade_abs_flags_test + SRCS allocator_facade_abs_flags_test.cc + DEPS allocator_facade) -cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade) +cc_test( + allocator_facade_frac_flags_test + SRCS allocator_facade_frac_flags_test.cc + DEPS allocator_facade) -cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator flags) -cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator) -cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator) +cc_library( + auto_growth_best_fit_allocator + SRCS auto_growth_best_fit_allocator.cc + DEPS allocator aligned_allocator flags) +cc_test( + auto_growth_best_fit_allocator_facade_test + SRCS auto_growth_best_fit_allocator_facade_test.cc + DEPS cpu_allocator auto_growth_best_fit_allocator) +cc_test( + auto_growth_best_fit_allocator_test + SRCS auto_growth_best_fit_allocator_test.cc + DEPS auto_growth_best_fit_allocator) -cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator) +cc_library( + virtual_memory_auto_growth_best_fit_allocator + SRCS virtual_memory_auto_growth_best_fit_allocator.cc + DEPS allocator aligned_allocator) if(NOT WIN32) - cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) - cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator) - if (WITH_GPU) - cc_library(cuda_ipc_allocator SRCS cuda_ipc_allocator.cc DEPS allocator) + cc_library( + mmap_allocator + SRCS mmap_allocator.cc + DEPS allocator) + cc_test( + mmap_allocator_test + SRCS mmap_allocator_test.cc + DEPS mmap_allocator allocator) + if(WITH_GPU) + cc_library( + cuda_ipc_allocator + SRCS cuda_ipc_allocator.cc + DEPS allocator) endif() endif(NOT WIN32) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 46e1a500e4870..d72af70657a29 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -28,6 +28,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include + #include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/cuda_managed_allocator.h" #include "paddle/fluid/memory/allocation/pinned_allocator.h" @@ -123,6 +124,8 @@ class CUDAGraphAllocator : underlying_allocator_(allocator) {} public: + ~CUDAGraphAllocator() { VLOG(10) << "CUDAGraphAllocator destructed"; } + static std::shared_ptr Create( const std::shared_ptr& allocator) { return std::shared_ptr(new CUDAGraphAllocator(allocator)); @@ -973,7 +976,7 @@ AllocatorFacade& AllocatorFacade::Instance() { AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const { #ifdef PADDLE_WITH_CUDA if (UNLIKELY(IsCUDAGraphCapturing())) { - auto id = platform::CUDAGraph::CapturingID(); + auto id = platform::CUDAGraph::CapturingPoolID(); auto iter = cuda_graph_map_.find(id); PADDLE_ENFORCE_NE( iter, cuda_graph_map_.end(), @@ -1116,7 +1119,7 @@ void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place, } #ifdef PADDLE_WITH_CUDA -void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { +void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(int64_t id) { PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth, platform::errors::InvalidArgument( "CUDA Graph is only supported when the " @@ -1124,23 +1127,32 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { "FLAGS_allocator_strategy=\"%s\"", FLAGS_allocator_strategy)); auto& allocator = cuda_graph_map_[id]; - PADDLE_ENFORCE_EQ( - allocator.get(), nullptr, - platform::errors::InvalidArgument( - "The memory pool of the CUDA Graph with ID %d have been prepared.", - id)); - allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); - - VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; + auto& ref_cnt = cuda_graph_ref_cnt_[id]; + if (allocator.get() == nullptr) { + allocator.reset( + new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + VLOG(10) << "Create memory pool for CUDA Graph with memory ID " << id; + } else { + VLOG(10) << "Use created memory pool for CUDA Graph with memory ID " << id; + } + ++ref_cnt; } -void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - auto iter = cuda_graph_map_.find(id); - PADDLE_ENFORCE_NE(iter, cuda_graph_map_.end(), +void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(int64_t id) { + auto ref_cnt_iter = cuda_graph_ref_cnt_.find(id); + PADDLE_ENFORCE_NE(ref_cnt_iter, cuda_graph_ref_cnt_.end(), platform::errors::InvalidArgument( - "Cannot find CUDA Graph with ID = %d", id)); - cuda_graph_map_.erase(iter); - VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; + "Cannot find CUDA Graph with memory ID = %d", id)); + auto& ref_cnt = ref_cnt_iter->second; + --ref_cnt; + if (ref_cnt == 0) { + cuda_graph_map_.erase(id); + cuda_graph_ref_cnt_.erase(ref_cnt_iter); + VLOG(10) << "Remove memory pool of CUDA Graph with memory ID " << id; + } else { + VLOG(10) << "Decrease memory pool ID " << id << " reference count to be " + << ref_cnt; + } } #endif #endif diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 1dea50edccf2e..a37c11c0c048b 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/memory/allocation/allocator.h" #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" @@ -89,8 +90,8 @@ class AllocatorFacade { #endif #ifdef PADDLE_WITH_CUDA - void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id); - void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id); + void PrepareMemoryPoolForCUDAGraph(int64_t id); + void RemoveMemoryPoolOfCUDAGraph(int64_t id); #endif // TODO(yy): Allocate a Copy-On-Write allocation? @@ -98,8 +99,9 @@ class AllocatorFacade { AllocatorFacade(); AllocatorFacadePrivate* m_; #ifdef PADDLE_WITH_CUDA - std::unordered_map> + std::unordered_map> cuda_graph_map_; + std::unordered_map cuda_graph_ref_cnt_; #endif }; diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc index fca07ba8e2511..d3f16ec628660 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/memory/allocation/allocator_facade.h" #include +#include "paddle/fluid/memory/allocation/allocator_facade.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_cuda_pinned_memory_to_use); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index 782062283e985..d460480bc734f 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -16,6 +16,7 @@ #include #include // NOLINT + #include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/event_tracing.h" diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc index 4469673b305bf..70c43145cc85d 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc @@ -13,10 +13,12 @@ // limitations under the License. #include + #include // NOLINT #include // NOLINT #include #include // NOLINT + #include "gflags/gflags.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 8d2f6e07a2901..441e80dfa4f8d 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include - -#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" +#include + #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/aligned_allocator.h" DECLARE_bool(free_idle_chunk); DECLARE_bool(free_when_no_cache_hit); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index 4cfe3997d89a9..c93645bf7a00d 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/best_fit_allocator.h" + #include #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index 69cb7c2708f9d..64ee632c3879a 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -14,6 +14,7 @@ #pragma once #include + #include #include #include diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 62a2dd78128bb..de6cac63e9ddb 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -24,6 +24,7 @@ #endif #include + #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h index 522b1d623e83b..f3df30827417d 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -14,6 +14,7 @@ #pragma once #include // NOLINT + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc index b2f24d5aed1eb..dff93736a6e70 100644 --- a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc @@ -15,15 +15,16 @@ #ifndef _WIN32 #include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" -#include "paddle/fluid/platform/cuda_device_guard.h" #include #include #include + #include #include #include "glog/logging.h" +#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc index 0c83d4d36634e..ac62b10c0e07a 100644 --- a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc @@ -24,6 +24,7 @@ #endif #include + #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc index a235b3871b3e6..9494141615f34 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc @@ -18,6 +18,7 @@ #endif #include + #include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h index e7b296e6a5a11..ff26a96a0e101 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h @@ -16,10 +16,12 @@ #ifdef PADDLE_WITH_CUDA #include + #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include // NOLINT + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/custom_allocator.cc b/paddle/fluid/memory/allocation/custom_allocator.cc index e53d7b1cc766a..2cd969e2bd17f 100644 --- a/paddle/fluid/memory/allocation/custom_allocator.cc +++ b/paddle/fluid/memory/allocation/custom_allocator.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/custom_allocator.h" + #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/allocation/custom_allocator.h b/paddle/fluid/memory/allocation/custom_allocator.h index 0f34bc156c872..b10f840f60d94 100644 --- a/paddle/fluid/memory/allocation/custom_allocator.h +++ b/paddle/fluid/memory/allocation/custom_allocator.h @@ -14,6 +14,7 @@ #pragma once #include // NOLINT + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc index 25c2235cce853..6fd87fb6a7748 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.cc +++ b/paddle/fluid/memory/allocation/mmap_allocator.cc @@ -19,6 +19,7 @@ #include #include #include + #include #include @@ -217,9 +218,9 @@ std::shared_ptr AllocateMemoryMapWriterAllocation( const std::string &ipc_name = GetIPCName(); int flags = O_RDWR | O_CREAT; int fd = shm_open(ipc_name.c_str(), flags, 0600); - PADDLE_ENFORCE_NE( - fd, -1, platform::errors::Unavailable("File descriptor %s open failed", - ipc_name.c_str())); + PADDLE_ENFORCE_NE(fd, -1, + platform::errors::Unavailable( + "File descriptor %s open failed", ipc_name.c_str())); PADDLE_ENFORCE_EQ(ftruncate(fd, size), 0, platform::errors::Unavailable( "Fruncate a file to a specified length failed!")); @@ -239,9 +240,9 @@ std::shared_ptr RebuildMemoryMapReaderAllocation( flags &= ~O_CREAT; int fd = shm_open(ipc_name.c_str(), flags, 0600); - PADDLE_ENFORCE_NE( - fd, -1, platform::errors::Unavailable("File descriptor %s open failed", - ipc_name.c_str())); + PADDLE_ENFORCE_NE(fd, -1, + platform::errors::Unavailable( + "File descriptor %s open failed", ipc_name.c_str())); void *ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); PADDLE_ENFORCE_NE(ptr, MAP_FAILED, platform::errors::Unavailable( diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 5efbfce7fedd6..7cc95de83101b 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -24,7 +24,6 @@ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" - #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/common/place.h" diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h index 05db0d7341aca..3d6500d0f5642 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h @@ -14,6 +14,7 @@ #pragma once #include + #include #include // NOLINT #include diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc index d69663f636e32..1c277c5db84d6 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/npu_allocator.h" + #include + #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h index ff55ba70c520f..04832c6fd9b63 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.h +++ b/paddle/fluid/memory/allocation/npu_allocator.h @@ -14,6 +14,7 @@ #pragma once #include // NOLINT + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 5e5aea6dab2cc..ad11d81875231 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/pinned_allocator.h" + #include "paddle/fluid/memory/stats.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index d6074975720c5..2914da4f6361c 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -44,8 +44,9 @@ void RetryAllocator::FreeImpl(phi::Allocation* allocation) { size_t size = allocation->size(); underlying_allocator_->Free(allocation); if (UNLIKELY(waited_allocate_size_)) { - VLOG(10) << "Free " << size << " bytes and notify all waited threads, " - "where waited_allocate_size_ = " + VLOG(10) << "Free " << size + << " bytes and notify all waited threads, " + "where waited_allocate_size_ = " << waited_allocate_size_; cv_.notify_all(); } diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index cb593f5ab74c7..e7370036cee36 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/memory/allocation/retry_allocator.h" #include // NOLINT + #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 80877cb670ba9..81a87ef07b592 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" + #include "paddle/fluid/platform/profiler/event_tracing.h" #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 32d3896e66bbf..ac4b7c790c950 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc index c5378d9f59c3d..74c83149b4cb5 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc +++ b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/thread_local_allocator.h" + #include // NOLINT #include // NOLINT + #include "gtest/gtest.h" #include "paddle/fluid/memory/malloc.h" diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc index c8b4e980566d0..07ad149a3078d 100644 --- a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h" + #include #include "paddle/fluid/memory/allocation/aligned_allocator.h" -#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/buffer.h b/paddle/fluid/memory/buffer.h index 99b25ca289ce1..f42b5262e3422 100644 --- a/paddle/fluid/memory/buffer.h +++ b/paddle/fluid/memory/buffer.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt index a039cd8f41860..afe5c0dba0f3b 100644 --- a/paddle/fluid/memory/detail/CMakeLists.txt +++ b/paddle/fluid/memory/detail/CMakeLists.txt @@ -1,47 +1,78 @@ include(ExternalProject) -cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place) +cc_library( + memory_block + SRCS memory_block.cc memory_block_desc.cc meta_cache.cc + DEPS place) if(WITH_GPU) - nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place) + nv_library( + system_allocator + SRCS system_allocator.cc + DEPS gflags cpu_info gpu_info place) elseif(WITH_ROCM) - hip_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place) + hip_library( + system_allocator + SRCS system_allocator.cc + DEPS gflags cpu_info gpu_info place) elseif(${WITH_ASCEND_CL}) - cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info npu_info place) + cc_library( + system_allocator + SRCS system_allocator.cc + DEPS gflags cpu_info npu_info place) elseif(WITH_MLU) - cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info mlu_info place) + cc_library( + system_allocator + SRCS system_allocator.cc + DEPS gflags cpu_info mlu_info place) else() - cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place) + cc_library( + system_allocator + SRCS system_allocator.cc + DEPS gflags cpu_info place) endif() -cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) +cc_test( + system_allocator_test + SRCS system_allocator_test.cc + DEPS system_allocator) -cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog) +cc_library( + buddy_allocator + SRCS buddy_allocator.cc + DEPS memory_block system_allocator glog) -cc_test(buddy_allocator_test SRCS buddy_allocator_test.cc DEPS buddy_allocator) +cc_test( + buddy_allocator_test + SRCS buddy_allocator_test.cc + DEPS buddy_allocator) -FUNCTION(file_download_and_uncompress URL NAME) - MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}") - SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME} PARENT_SCOPE) +function(file_download_and_uncompress URL NAME) + message(STATUS "Download dependence[${NAME}] from ${URL}") + set(${NAME}_INCLUDE_DIR + ${THIRD_PARTY_PATH}/${NAME} + PARENT_SCOPE) ExternalProject_Add( - extern_download_${NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${THIRD_PARTY_PATH}/${NAME} - URL ${URL} - DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME} - SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME} - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND "" - ) - set(third_party_deps ${third_party_deps} extern_download_${NAME} PARENT_SCOPE) -ENDFUNCTION() + extern_download_${NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${THIRD_PARTY_PATH}/${NAME} + URL ${URL} + DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME} + SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND "") + set(third_party_deps + ${third_party_deps} extern_download_${NAME} + PARENT_SCOPE) +endfunction() if(WITH_TESTING) if(TEST buddy_allocator_test) - set_tests_properties(buddy_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") + set_tests_properties(buddy_allocator_test PROPERTIES LABELS + "RUN_TYPE=EXCLUSIVE") endif() set(URL "https://paddle-ci.cdn.bcebos.com/buddy_allocator_test_data.tar") file_download_and_uncompress(URL "buddy_allocator") diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index e1077d66c54ec..244445d59b829 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -168,8 +168,9 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { } void GPUAllocator::Free(void* p, size_t size, size_t index) { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); PADDLE_ENFORCE_GE(gpu_alloc_size_, size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " @@ -223,8 +224,9 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { gpuError_t err; - PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument( - "The index should be 1, but got %d", index)); + PADDLE_ENFORCE_EQ(index, 1, + platform::errors::InvalidArgument( + "The index should be 1, but got %d", index)); PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size, platform::errors::InvalidArgument( @@ -310,8 +312,9 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) { void NPUAllocator::Free(void* p, size_t size, size_t index) { VLOG(4) << "Free " << p << " size " << size; - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); PADDLE_ENFORCE_GE(npu_alloc_size_, size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " @@ -355,8 +358,9 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) { void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) { aclError err; - PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument( - "The index should be 1, but got %d", index)); + PADDLE_ENFORCE_EQ(index, 1, + platform::errors::InvalidArgument( + "The index should be 1, but got %d", index)); PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size, platform::errors::InvalidArgument( @@ -425,8 +429,9 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) { } void MLUAllocator::Free(void* p, size_t size, size_t index) { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); PADDLE_ENFORCE_GE(mlu_alloc_size_, size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " @@ -469,8 +474,9 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { void CustomAllocator::Free(void* p, size_t size, size_t index) { VLOG(4) << "CustomAllocator::Free " << p << " size " << size; - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); PADDLE_ENFORCE_GE(plug_alloc_size, size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index f6ff6282a614a..18c2e278f99c5 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include // for size_t + #include namespace paddle { diff --git a/paddle/fluid/memory/get_base_ptr_test.cu b/paddle/fluid/memory/get_base_ptr_test.cu index 188d2f5f420cf..c8928bda0c937 100644 --- a/paddle/fluid/memory/get_base_ptr_test.cu +++ b/paddle/fluid/memory/get_base_ptr_test.cu @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "gtest/gtest.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 796bdcf0ec2f6..a7d0fa9781f77 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -24,9 +24,9 @@ limitations under the License. */ namespace paddle { namespace memory { -using phi::Allocation; -using allocation::Allocator; using allocation::AllocationPtr; +using allocation::Allocator; +using phi::Allocation; extern std::shared_ptr AllocShared(const platform::Place& place, size_t size); diff --git a/paddle/fluid/memory/memory_stats_test.cc b/paddle/fluid/memory/memory_stats_test.cc index b2fc602e401ed..081f0d3d78c13 100644 --- a/paddle/fluid/memory/memory_stats_test.cc +++ b/paddle/fluid/memory/memory_stats_test.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/memory/memory.h" #include #include + #include "gtest/gtest.h" +#include "paddle/fluid/memory/memory.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/pinned_memory_test.cu b/paddle/fluid/memory/pinned_memory_test.cu index 837c964e2ad32..e5958615d0184 100644 --- a/paddle/fluid/memory/pinned_memory_test.cu +++ b/paddle/fluid/memory/pinned_memory_test.cu @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" - #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h index bb6a3cca6644c..a30ee161e1c08 100644 --- a/paddle/fluid/memory/stats.h +++ b/paddle/fluid/memory/stats.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" @@ -149,15 +150,16 @@ void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, #define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \ DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment) -#define HOST_MEMORY_STAT_FUNC(item, id, func, ...) \ - [&] { \ - PADDLE_ENFORCE_EQ(id, 0, paddle::platform::errors::OutOfRange( \ - "Only support device id 0 for host memory " \ - "stats, not support device id: %d", \ - id)); \ - return paddle::memory::Stat< \ - paddle::memory::HostMemoryStat##item##0>::GetInstance() \ - ->func(__VA_ARGS__); \ +#define HOST_MEMORY_STAT_FUNC(item, id, func, ...) \ + [&] { \ + PADDLE_ENFORCE_EQ(id, 0, \ + paddle::platform::errors::OutOfRange( \ + "Only support device id 0 for host memory " \ + "stats, not support device id: %d", \ + id)); \ + return paddle::memory::Stat< \ + paddle::memory::HostMemoryStat##item##0>::GetInstance() \ + ->func(__VA_ARGS__); \ }() #define HOST_MEMORY_STAT_CURRENT_VALUE(item, id) \ diff --git a/paddle/fluid/memory/stats_test.cc b/paddle/fluid/memory/stats_test.cc index bcaba8e91080f..73a6b921ca8a4 100644 --- a/paddle/fluid/memory/stats_test.cc +++ b/paddle/fluid/memory/stats_test.cc @@ -13,11 +13,13 @@ // limitations under the License. #include "paddle/fluid/memory/stats.h" + #include #include #include #include #include + #include "gtest/gtest.h" namespace paddle { diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 3bf873bcfc231..5b5350c34fb6f 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -25,6 +25,7 @@ #ifdef PADDLE_WITH_CUDA #include #include + #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #endif @@ -47,9 +48,9 @@ __global__ void add_kernel(int *x, int *y, int n) { void CheckMemLeak(const platform::CUDAPlace &place) { uint64_t cuda_malloc_size = platform::RecordedGpuMallocSize(place.GetDeviceId()); - ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size - << " bytes memory that not released yet," - << " there may be a memory leak problem"; + ASSERT_EQ(cuda_malloc_size, 0) + << "Found " << cuda_malloc_size << " bytes memory that not released yet," + << " there may be a memory leak problem"; } TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 3112d0d8205a8..b2fd59b47454e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -107,6 +107,7 @@ register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combin recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) +target_link_libraries(run_program_op cuda_graph_with_memory_pool) op_library(quantize_linear_op DEPS cast_kernel) op_library(save_combine_op DEPS string_array) op_library(load_combine_op DEPS string_array) diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index b9517e1cc863c..86b60da341e63 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index b4a97e24cf292..b9d5e5fbe5ebc 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -20,8 +20,8 @@ namespace paddle { namespace operators { using framework::Tensor; using platform::ActivationDescriptor; -using platform::TensorDescriptor; using platform::CUDADeviceContext; +using platform::TensorDescriptor; #ifdef PADDLE_WITH_HIP #define GPUDNN_ACTIVATION_RELU miopenActivationRELU diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 6905f3d79546e..e500992e1b5a5 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1454,18 +1454,19 @@ namespace plat = paddle::platform; REGISTER_OPERATOR(KERNEL_TYPE##_grad, ops::ActivationOpGrad, \ ops::ActivationGradOpInplaceInferer); -#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, op_name, functor, \ - grad_functor) \ - REGISTER_OP_CPU_KERNEL( \ - act_type, ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_CPU_KERNEL( \ - act_type##_grad, \ - ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CPU_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); @@ -1781,21 +1782,18 @@ REGISTER_OP_VERSION(hard_shrink) "((x < -threshold) + (x > threshold)); after checkpoint: out = " "x * (((x < -threshold) + (x > threshold)) > 0)")); -REGISTER_OP_VERSION(softplus) - .AddCheckpoint( - R"ROC(add new attributes [beta] and [threshold], and the formula is changed to " +REGISTER_OP_VERSION(softplus).AddCheckpoint( + R"ROC(add new attributes [beta] and [threshold], and the formula is changed to " " softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ \\text{For numerical" " stability, the implementation reverts to the linear function when: beta * x > threshold.})ROC", - paddle::framework::compatible::OpVersionDesc() - .NewAttr("beta", "The beta value of the new formula", 1.0f) - .NewAttr("threshold", "The threshold value of the new formula", - 20.0f)); - -REGISTER_OP_VERSION(mish) - .AddCheckpoint( - R"ROC(add new attributes [use_mkldnn], and when computing softplus the formula is changed as the new veriosn of softplus)ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "use_mkldnn", "(bool, default false) Only used in mkldnn kernel", - false)); + paddle::framework::compatible::OpVersionDesc() + .NewAttr("beta", "The beta value of the new formula", 1.0f) + .NewAttr("threshold", "The threshold value of the new formula", 20.0f)); + +REGISTER_OP_VERSION(mish).AddCheckpoint( + R"ROC(add new attributes [use_mkldnn], and when computing softplus the formula is changed as the new veriosn of softplus)ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_mkldnn", "(bool, default false) Only used in mkldnn kernel", + false)); /* ========================================================================== */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 5f3916a65e792..81f5e24abfed5 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -12,19 +12,20 @@ limitations under the License. */ #pragma once #include + #include +#include #include #include #include #include #include - -#include #ifndef _USE_MATH_DEFINES #define _USE_MATH_DEFINES #endif #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" @@ -362,9 +363,8 @@ struct Relu6GradFunctor : public BaseActivationFunctor { typename dX> void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = - dout * - ((out > static_cast(0)) * (out < static_cast(threshold))) - .template cast(); + dout * ((out > static_cast(0)) * (out < static_cast(threshold))) + .template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc index e950f952c24e6..4127e4b1b103b 100644 --- a/paddle/fluid/operators/activation_op_xpu.cc +++ b/paddle/fluid/operators/activation_op_xpu.cc @@ -253,8 +253,9 @@ struct XPUHardSwishFunctor : public BaseActivationFunctor { PADDLE_ENFORCE_EQ(threshold, 6.0f, platform::errors::External( "Not support threshold [%f] in XPU", threshold)); - PADDLE_ENFORCE_EQ(scale, 6.0f, platform::errors::External( - "Not support scale [%f] in XPU", scale)); + PADDLE_ENFORCE_EQ( + scale, 6.0f, + platform::errors::External("Not support scale [%f] in XPU", scale)); PADDLE_ENFORCE_EQ( offset, 3.0f, platform::errors::External("Not support offset [%f] in XPU", offset)); @@ -273,8 +274,9 @@ struct XPUHardSwishGradFunctor : public BaseActivationFunctor { PADDLE_ENFORCE_EQ(threshold, 6.0f, platform::errors::External( "Not support threshold [%f] in XPU", threshold)); - PADDLE_ENFORCE_EQ(scale, 6.0f, platform::errors::External( - "Not support scale [%f] in XPU", scale)); + PADDLE_ENFORCE_EQ( + scale, 6.0f, + platform::errors::External("Not support scale [%f] in XPU", scale)); PADDLE_ENFORCE_EQ( offset, 3.0f, platform::errors::External("Not support offset [%f] in XPU", offset)); @@ -377,10 +379,12 @@ struct XPUPowGradFunctor : public BaseActivationFunctor { auto x_dims = phi::vectorize(x->dims()); auto dy_dims = phi::vectorize(dOut->dims()); auto dx_dims = phi::vectorize(dX->dims()); - PADDLE_ENFORCE_EQ(x_dims, dy_dims, platform::errors::PreconditionNotMet( - "x_dims should match dy_dims.")); - PADDLE_ENFORCE_EQ(x_dims, dx_dims, platform::errors::PreconditionNotMet( - "x_dims should match dx_dims.")); + PADDLE_ENFORCE_EQ( + x_dims, dy_dims, + platform::errors::PreconditionNotMet("x_dims should match dy_dims.")); + PADDLE_ENFORCE_EQ( + x_dims, dx_dims, + platform::errors::PreconditionNotMet("x_dims should match dx_dims.")); float pow_factor = ctx.Attr("factor"); auto xpu_context = diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc index e5fcd270eb8b8..4d2c23e2bb440 100644 --- a/paddle/fluid/operators/add_position_encoding_op.cc +++ b/paddle/fluid/operators/add_position_encoding_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/add_position_encoding_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/addmm_op.cc b/paddle/fluid/operators/addmm_op.cc index 716a2e40179e4..d0f0a6ae0c679 100644 --- a/paddle/fluid/operators/addmm_op.cc +++ b/paddle/fluid/operators/addmm_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc index 1b584fc557849..cd6798be2b2ed 100644 --- a/paddle/fluid/operators/affine_channel_op.cc +++ b/paddle/fluid/operators/affine_channel_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index cf4041f721af2..87a71130b85bf 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -81,13 +81,13 @@ class AffineChannelCUDAKernel : public framework::OpKernel { int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); grid = std::min(std::max(max_threads / block, 1), grid); if (layout == framework::DataLayout::kNCHW) { - KeAffineChannelCUDA<<>>( - x_d, scale_d, bias_d, C, HxW, num, y_d); + KeAffineChannelCUDA + <<>>(x_d, scale_d, bias_d, C, HxW, + num, y_d); } else { - KeAffineChannelCUDA<<>>( - x_d, scale_d, bias_d, C, HxW, num, y_d); + KeAffineChannelCUDA + <<>>(x_d, scale_d, bias_d, C, HxW, + num, y_d); } } }; @@ -169,29 +169,29 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel { if (layout == framework::DataLayout::kNCHW) { if (dscale && dbias) { const T* x_d = x->data(); - AffineChannelScaleBiasGradientCUDAKernel< - T, block, framework::DataLayout::kNCHW><<>>( - dy_d, x_d, N, C, HxW, ds_d, db_d); + AffineChannelScaleBiasGradientCUDAKernel + <<>>(dy_d, x_d, N, C, HxW, ds_d, + db_d); } if (dx) { - KeAffineChannelCUDA<<>>( - dy_d, s_d, nullptr, C, HxW, num, dx_d); + KeAffineChannelCUDA + <<>>(dy_d, s_d, nullptr, C, HxW, + num, dx_d); } } else { if (dscale && dbias) { const T* x_d = x->data(); - AffineChannelScaleBiasGradientCUDAKernel< - T, block, framework::DataLayout::kNHWC><<>>( - dy_d, x_d, N, C, HxW, ds_d, db_d); + AffineChannelScaleBiasGradientCUDAKernel + <<>>(dy_d, x_d, N, C, HxW, ds_d, + db_d); } if (dx) { - KeAffineChannelCUDA<<>>( - dy_d, s_d, nullptr, C, HxW, num, dx_d); + KeAffineChannelCUDA + <<>>(dy_d, s_d, nullptr, C, HxW, + num, dx_d); } } } diff --git a/paddle/fluid/operators/affine_channel_op_xpu.cc b/paddle/fluid/operators/affine_channel_op_xpu.cc index db3eedea7ca67..4de233b184aed 100644 --- a/paddle/fluid/operators/affine_channel_op_xpu.cc +++ b/paddle/fluid/operators/affine_channel_op_xpu.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc index 31801b14564d3..6fca4afabd9cc 100644 --- a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc +++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc @@ -65,8 +65,9 @@ class CUDNNAffineGridOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( platform::dynload::cudnnSpatialTfGridGeneratorForward( handle, cudnn_st_desc, theta_data, output_data), - 0, platform::errors::Fatal("Some errors has occurred " - "during forward computation in cudnn.")); + 0, + platform::errors::Fatal("Some errors has occurred " + "during forward computation in cudnn.")); } }; diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index e311d21bb54d3..d7a49a965a0ee 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/affine_grid_op.h" + #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu index eeb4b3bc8a760..29a540bdc2ce5 100644 --- a/paddle/fluid/operators/affine_grid_op.cu +++ b/paddle/fluid/operators/affine_grid_op.cu @@ -42,8 +42,8 @@ struct Linspace { auto stream = ctx.cuda_device_context().stream(); int block = 512; int grid = (count + block - 1) / block; - LinspaceKernel<<>>(start, slice, count, - number_data); + LinspaceKernel + <<>>(start, slice, count, number_data); } }; diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h index 21540de2b640e..cbf70b9135be2 100644 --- a/paddle/fluid/operators/affine_grid_op.h +++ b/paddle/fluid/operators/affine_grid_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/operators/amp/CMakeLists.txt b/paddle/fluid/operators/amp/CMakeLists.txt index 2ea8bbcbc61df..cbedb02f86836 100644 --- a/paddle/fluid/operators/amp/CMakeLists.txt +++ b/paddle/fluid/operators/amp/CMakeLists.txt @@ -1,10 +1,14 @@ include(operators) if(WITH_UNITY_BUILD) - # Load Unity Build rules for operators in paddle/fluid/operators/amp. - include(unity_build_rule.cmake) + # Load Unity Build rules for operators in paddle/fluid/operators/amp. + include(unity_build_rule.cmake) endif() register_operators() if(WITH_ASCEND_CL) - cc_test(check_finite_and_unscale_op_npu_test SRCS check_finite_and_unscale_op_npu_test.cc DEPS op_registry check_finite_and_unscale_op scope device_context enforce executor) + cc_test( + check_finite_and_unscale_op_npu_test + SRCS check_finite_and_unscale_op_npu_test.cc + DEPS op_registry check_finite_and_unscale_op scope device_context enforce + executor) endif() diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc index 68f6e3b2f3bd0..78bacc3016178 100644 --- a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 2f6977b9e2da2..7771902c02b1f 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -143,10 +143,10 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { int blocks_per_grid = (total_num + elements_per_block - 1) / elements_per_block; VLOG(3) << "launch kernel"; - CheckFiniteAndUnscale< - T, MPDType><<>>( - d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); + CheckFiniteAndUnscale + <<>>(d_xs, inverse_scale_v, xs_size, d_starts, + found_inf_data, d_outs); VLOG(3) << "finish kernel"; } }; diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc index 2862d9230768c..46572579e081c 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc index 30266d3eec0e0..1d3e5e5162ca9 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc @@ -65,13 +65,15 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { int r = xpu::isfinite(dev_ctx.x_context(), reinterpret_cast(x->data()), is_finite.data(), x->numel()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(isfinite) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); - r = xpu::logical_not(dev_ctx.x_context(), reinterpret_cast( - is_finite.data()), - is_finite.data(), x->numel()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(isfinite) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::logical_not( + dev_ctx.x_context(), + reinterpret_cast(is_finite.data()), + is_finite.data(), x->numel()); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External("XPU API(logical_not) return wrong " @@ -79,10 +81,11 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { r, XPUAPIErrorMsg[r])); r = xpu::any(dev_ctx.x_context(), is_finite.data(), found_inf_data, x->numel()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(any) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(any) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); if (dev_ctx.x_context()->xpu_stream) { dev_ctx.Wait(); } @@ -106,36 +109,40 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { int r = xpu::cast_v2(dev_ctx.x_context(), reinterpret_cast(x->data()), float_x.data(), x->numel()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(cast_v2) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(cast_v2) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); r = xpu::scale(dev_ctx.x_context(), float_x.data(), float_out.data(), x->numel(), false, inverse_scale, 0.0); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(scale) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(scale) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); r = xpu::cast_v2(dev_ctx.x_context(), float_out.data(), reinterpret_cast(out->data()), out->numel()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(cast_v2) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(cast_v2) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); } else { int r = xpu::scale(dev_ctx.x_context(), reinterpret_cast(x->data()), reinterpret_cast(out->data()), x->numel(), false, inverse_scale, 0.0); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(scale) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(scale) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); } } if (dev_ctx.x_context()->xpu_stream) { diff --git a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc index e5a2d93e32fe2..c102bd2ba47bd 100644 --- a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/amp/get_float_status_op_npu.cc b/paddle/fluid/operators/amp/get_float_status_op_npu.cc index 8109a1ff43ff2..0c1187616503b 100644 --- a/paddle/fluid/operators/amp/get_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/get_float_status_op_npu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/amp/unity_build_rule.cmake b/paddle/fluid/operators/amp/unity_build_rule.cmake index bfdab0cd9623c..fa460e33c8068 100644 --- a/paddle/fluid/operators/amp/unity_build_rule.cmake +++ b/paddle/fluid/operators/amp/unity_build_rule.cmake @@ -4,9 +4,7 @@ # Generally, the combination rules in this file do not need to be modified. # If there are some redefined error in compiling with the source file which # in combination rule, you can remove the source file from the following rules. -register_unity_group(cc - check_finite_and_unscale_op.cc - update_loss_scaling_op.cc) -register_unity_group(cu - check_finite_and_unscale_op.cu - update_loss_scaling_op.cu) +register_unity_group(cc check_finite_and_unscale_op.cc + update_loss_scaling_op.cc) +register_unity_group(cu check_finite_and_unscale_op.cu + update_loss_scaling_op.cu) diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index 8354650df0237..baf742b0b404b 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/amp/update_loss_scaling_op.h" + #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index 43f8f84578c70..81f986434411c 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/update_loss_scaling_op.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h index 41eb94247f593..f4c6b6f1f7d8d 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.h +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.h @@ -19,6 +19,7 @@ #endif // PADDLE_WITH_CUDA && __NVCC__ #include #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index f9a93a47ff2be..da7e23c4620ba 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" DECLARE_int32(min_loss_scaling); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc index fe03d93f4480f..8f57e00fe1117 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc @@ -13,12 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -59,10 +60,11 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel { r = xpu::constant(dev_ctx.x_context(), reinterpret_cast(out_data), num, XPUTyp(0.0)); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(constant) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(constant) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); } } const bool stop_update = ctx.Attr("stop_update"); diff --git a/paddle/fluid/operators/angle_op.h b/paddle/fluid/operators/angle_op.h index 116a8053db3ed..ace345465dc25 100644 --- a/paddle/fluid/operators/angle_op.h +++ b/paddle/fluid/operators/angle_op.h @@ -17,11 +17,11 @@ #define _USE_MATH_DEFINES #endif #include -#include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index c5e4188ca2d6f..63fd27a1edf7a 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" - -#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" @@ -28,20 +27,18 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, ArgMaxInferShapeFunctor); -REGISTER_OP_VERSION(arg_max) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(arg_max).AddCheckpoint( + R"ROC( Upgrade argmax add a new attribute [flatten] and modify the attribute of dtype)ROC", - paddle::framework::compatible::OpVersionDesc() - .NewAttr("flatten", - "In order to compute the argmax over the flattened array " - "when the " - "argument `axis` in python API is None.", - false) - .ModifyAttr( - "dtype", - "Change the default value of dtype from -1 to 3" - ", means return the int64 indices directly. The rearse why " - "changing the default value is that the int64 value in " - "VarType is 3 in the frameworke.proto.", - 3)); + paddle::framework::compatible::OpVersionDesc() + .NewAttr("flatten", + "In order to compute the argmax over the flattened array " + "when the " + "argument `axis` in python API is None.", + false) + .ModifyAttr("dtype", + "Change the default value of dtype from -1 to 3" + ", means return the int64 indices directly. The rearse why " + "changing the default value is that the int64 value in " + "VarType is 3 in the frameworke.proto.", + 3)); diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index 585341beea12c..194a3070bf683 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index fb3abd01af8c3..c995d56cf6b09 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -27,20 +27,18 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, ArgMinInferShapeFunctor); -REGISTER_OP_VERSION(arg_min) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(arg_min).AddCheckpoint( + R"ROC( Upgrade argmin add a new attribute [flatten] and modify the attribute of dtype)ROC", - paddle::framework::compatible::OpVersionDesc() - .NewAttr("flatten", - "In order to compute the argmin over the flattened array " - "when the " - "argument `axis` in python API is None.", - false) - .ModifyAttr( - "dtype", - "Change the default value of dtype from -1 to 3" - ", means return the int64 indices directly. The rearse why " - "changing the default value is that the int64 value in " - "VarType is 3 in the frameworke.proto.", - 3)); + paddle::framework::compatible::OpVersionDesc() + .NewAttr("flatten", + "In order to compute the argmin over the flattened array " + "when the " + "argument `axis` in python API is None.", + false) + .ModifyAttr("dtype", + "Change the default value of dtype from -1 to 3" + ", means return the int64 indices directly. The rearse why " + "changing the default value is that the int64 value in " + "VarType is 3 in the frameworke.proto.", + 3)); diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h index af44a77c8131d..0cc3b695aef93 100644 --- a/paddle/fluid/operators/array_operator.h +++ b/paddle/fluid/operators/array_operator.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 1db3592b1cfab..f0824695a060f 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/core/lod_utils.h" diff --git a/paddle/fluid/operators/ascend_trigger_op.h b/paddle/fluid/operators/ascend_trigger_op.h index eaa79da2ba8ee..d1eaa00c2a3e0 100644 --- a/paddle/fluid/operators/ascend_trigger_op.h +++ b/paddle/fluid/operators/ascend_trigger_op.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_ASCEND #include "paddle/fluid/framework/fleet/ascend_wrapper.h" diff --git a/paddle/fluid/operators/assign_op_xpu.cc b/paddle/fluid/operators/assign_op_xpu.cc index b95be3096f071..7d03982f6ad03 100644 --- a/paddle/fluid/operators/assign_op_xpu.cc +++ b/paddle/fluid/operators/assign_op_xpu.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/assign_op.h" - #include +#include "paddle/fluid/operators/assign_op.h" + namespace paddle { namespace framework { class OpDesc; diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index bf7d609370a8d..22db7d9e982c2 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" + #include + #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/cpu_vec.h" @@ -62,8 +64,9 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { "LSTMWeight dims should be (%d + %d) * %d.", D, M, 4 * D)); auto b_dims = ctx->GetInputDim("LSTMBias"); - PADDLE_ENFORCE_EQ(b_dims.size(), 2, platform::errors::InvalidArgument( - "Input(LSTMBias)'s rank must be 2.")); + PADDLE_ENFORCE_EQ( + b_dims.size(), 2, + platform::errors::InvalidArgument("Input(LSTMBias)'s rank must be 2.")); PADDLE_ENFORCE_EQ(b_dims[0], 1, platform::errors::InvalidArgument( "LSTMBias dims should be 1 x %d.", 4 * D)); @@ -72,11 +75,13 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { "LSTMBias dims should be 1 x %d.", 4 * D)); auto c_dims = ctx->GetInputDim("C0"); - PADDLE_ENFORCE_EQ(c_dims.size(), 2, platform::errors::InvalidArgument( - "Input(C0)'s rank must be 2.")); + PADDLE_ENFORCE_EQ( + c_dims.size(), 2, + platform::errors::InvalidArgument("Input(C0)'s rank must be 2.")); if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(c_dims[1], D, platform::errors::InvalidArgument( - "C0 dims should be N x %d.", D)); + PADDLE_ENFORCE_EQ( + c_dims[1], D, + platform::errors::InvalidArgument("C0 dims should be N x %d.", D)); } if (ctx->HasInput("H0")) { @@ -126,10 +131,12 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(dims.size(), 2, platform::errors::InvalidArgument( "Input(AttentionScalar)'s rank must be 2.")); - PADDLE_ENFORCE_EQ(dims[0], 1, platform::errors::InvalidArgument( - "AttentionScalar shapes must be 1 * 1.")); - PADDLE_ENFORCE_EQ(dims[1], 1, platform::errors::InvalidArgument( - "AttentionScalar shapes must be 1 * 1.")); + PADDLE_ENFORCE_EQ(dims[0], 1, + platform::errors::InvalidArgument( + "AttentionScalar shapes must be 1 * 1.")); + PADDLE_ENFORCE_EQ(dims[1], 1, + platform::errors::InvalidArgument( + "AttentionScalar shapes must be 1 * 1.")); } if (ctx->HasInput("AttentionScalarBias")) { @@ -332,14 +339,15 @@ class AttentionLSTMKernel : public framework::OpKernel { int len = x_lod[0][i + 1] - x_lod[0][i]; max_seq_len = max_seq_len < len ? len : max_seq_len; } - PADDLE_ENFORCE_EQ(x_lod.size(), 1UL, platform::errors::InvalidArgument( - "Input(X)'s lod size must be 1.")); + PADDLE_ENFORCE_EQ( + x_lod.size(), 1UL, + platform::errors::InvalidArgument("Input(X)'s lod size must be 1.")); PADDLE_ENFORCE_EQ( c0->dims()[0], N, platform::errors::InvalidArgument("C0 dims should be %d x %d.", N, D)); fc_out->Resize({max_seq_len, 1}); - std::function act_gate, act_cell, act_cand; + std::function act_gate, act_cell, act_cand; auto& act_gate_str = ctx.Attr("gate_activation"); auto& act_cell_str = ctx.Attr("cell_activation"); auto& act_cand_str = ctx.Attr("candidate_activation"); diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h index 289dda56b19df..de6eca3903f88 100644 --- a/paddle/fluid/operators/average_accumulates_op.h +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/batch_fc_op.cc b/paddle/fluid/operators/batch_fc_op.cc index 952625bcb6e46..2d2deae69a783 100644 --- a/paddle/fluid/operators/batch_fc_op.cc +++ b/paddle/fluid/operators/batch_fc_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/batch_fc_op.h" + #include namespace paddle { @@ -42,8 +43,9 @@ class BatchFCOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(input_dims.size(), 3, platform::errors::InvalidArgument( "Input of BatchFCOp should have 3D.")); - PADDLE_ENFORCE_EQ(w_dims.size(), 3, platform::errors::InvalidArgument( - "W of BatchFCOp should have 3D.")); + PADDLE_ENFORCE_EQ( + w_dims.size(), 3, + platform::errors::InvalidArgument("W of BatchFCOp should have 3D.")); PADDLE_ENFORCE_EQ( input_dims[0], w_dims[0], platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu index ddedf0172be82..5843acb4fdd0c 100644 --- a/paddle/fluid/operators/batch_fc_op.cu +++ b/paddle/fluid/operators/batch_fc_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/batch_fc_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 2663a08101157..67384338d764e 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/batch_norm_op.h" + #include #include #include + #include "paddle/fluid/framework/data_layout.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -167,10 +169,11 @@ framework::OpKernelType BatchNormOp::GetExpectedKernelType( bn_param_type, framework::TransToProtoVarType(ctx.Input("Mean")->dtype()), platform::errors::InvalidArgument("Mean input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Variance")->dtype()), - platform::errors::InvalidArgument( - "Variance input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Variance")->dtype()), + platform::errors::InvalidArgument( + "Variance input should be of float type")); // TODO(pzelazko-intel): enable MKLDNN layout when it's ready framework::LibraryType library = framework::LibraryType::kPlain; diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index d274e8d2c006d..b82b49e5cd58e 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc index 6507890a8b5dc..6dff315aa6a21 100644 --- a/paddle/fluid/operators/batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/batch_norm_op_mlu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index ae03ecbcb16a0..725b7f3848f4a 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -113,8 +113,9 @@ class NPUBatchNormOpKernel : public framework::OpKernel { runner_reduce.Run(stream); const auto &runner_update = NpuOpRunner( - "BNTrainingUpdate", {x_tensor, sum, square_sum, *scale, *bias, - *running_mean, *running_var}, + "BNTrainingUpdate", + {x_tensor, sum, square_sum, *scale, *bias, *running_mean, + *running_var}, {y_tesnor, *mean_out, *variance_out, *saved_mean, *saved_variance}, {{"factor", momentum}, {"epsilon", epsilon}}); runner_update.Run(stream); @@ -216,10 +217,11 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { {dx_tensor}, {{"epsilon", epsilon}}); runner_infer.Run(stream); } else { - const auto &runner_reduce = NpuOpRunner( - "BNTrainingReduceGrad", {dy_tensor, x_tensor, *d_scale, *d_bias, - *scale, *saved_mean, *saved_inv_variance}, - {dx_tensor}, {{"epsilon", epsilon}}); + const auto &runner_reduce = + NpuOpRunner("BNTrainingReduceGrad", + {dy_tensor, x_tensor, *d_scale, *d_bias, *scale, + *saved_mean, *saved_inv_variance}, + {dx_tensor}, {{"epsilon", epsilon}}); runner_reduce.Run(stream); } } diff --git a/paddle/fluid/operators/batch_norm_op_xpu.cc b/paddle/fluid/operators/batch_norm_op_xpu.cc index 0893324c602a8..3ade2f36ad89f 100644 --- a/paddle/fluid/operators/batch_norm_op_xpu.cc +++ b/paddle/fluid/operators/batch_norm_op_xpu.cc @@ -13,10 +13,11 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/batch_norm_op.h" #include #include +#include "paddle/fluid/operators/batch_norm_op.h" + namespace paddle { namespace operators { @@ -128,8 +129,9 @@ static int calculate_inv_BN_Y(xpu::Context *ctx, T *x, const T *scale, const T *bias, const T *mean, const T *variance, const int N, const int C, const int M, const T *y) { - PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( - "X and Y should be inplaced in inplace mode")); + PADDLE_ENFORCE_EQ(x, y, + platform::errors::InvalidArgument( + "X and Y should be inplaced in inplace mode")); std::vector tensor_shape_vec({N, C, M}); std::vector array_shape_vec({1, C, 1}); // y - bias @@ -207,8 +209,9 @@ class BatchNormGradXPUKernel : public framework::OpKernel { is_inplace = false; if (d_x) { PADDLE_ENFORCE_NE( - d_x, d_y, platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD inplaced in non-inplace mode")); + d_x, d_y, + platform::errors::InvalidArgument( + "X@GRAD and Y@GRAD inplaced in non-inplace mode")); } } @@ -275,11 +278,12 @@ class BatchNormGradXPUKernel : public framework::OpKernel { int r1 = calculate_inv_var(dev_ctx.x_context(), global_var->data(), epsilon, C, epsilon_data, global_inv_std_data); - PADDLE_ENFORCE_EQ(r1, XPU_SUCCESS, platform::errors::External( - "XPU API(batch_norm_grad " - "calculate_inv_var function) " - "return wrong value[%d %s]", - r1, XPUAPIErrorMsg[r1])); + PADDLE_ENFORCE_EQ( + r1, XPU_SUCCESS, + platform::errors::External("XPU API(batch_norm_grad " + "calculate_inv_var function) " + "return wrong value[%d %s]", + r1, XPUAPIErrorMsg[r1])); } auto px = *x; auto *inv_std_data = @@ -290,11 +294,12 @@ class BatchNormGradXPUKernel : public framework::OpKernel { dev_ctx.x_context(), px.mutable_data(ctx.GetPlace()), scale->data(), bias->data(), mean_data, inv_std_data, N, C, H * W, x->data()); - PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, platform::errors::External( - "XPU API(batch_norm_grad " - "calculate_inv_BN_Y function) " - "return wrong value[%d %s]", - r2, XPUAPIErrorMsg[r2])); + PADDLE_ENFORCE_EQ( + r2, XPU_SUCCESS, + platform::errors::External("XPU API(batch_norm_grad " + "calculate_inv_BN_Y function) " + "return wrong value[%d %s]", + r2, XPUAPIErrorMsg[r2])); } int r3; @@ -319,10 +324,11 @@ class BatchNormGradXPUKernel : public framework::OpKernel { scale_data, batch_mean->data(), batch_inv_std->data(), d_scale_data, d_bias_data, is_nchw); } - PADDLE_ENFORCE_EQ(r3, XPU_SUCCESS, platform::errors::External( - "XPU API(batch_norm_grad) return " - "wrong value[%d %s]", - r3, XPUAPIErrorMsg[r3])); + PADDLE_ENFORCE_EQ( + r3, XPU_SUCCESS, + platform::errors::External("XPU API(batch_norm_grad) return " + "wrong value[%d %s]", + r3, XPUAPIErrorMsg[r3])); } }; diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h index facb4cd82542b..1cc6e36467767 100644 --- a/paddle/fluid/operators/batch_size_like.h +++ b/paddle/fluid/operators/batch_size_like.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 3fae65c50177b..0e3e32666a832 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/beam_search_decode_op.h" + #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/operators/beam_search_decode_op.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc index cf32e40742441..6f70136b2d213 100644 --- a/paddle/fluid/operators/beam_search_decode_op_test.cc +++ b/paddle/fluid/operators/beam_search_decode_op_test.cc @@ -103,11 +103,9 @@ TEST(BeamSearchDecodeOp, Backtrace) { std::vector{1, 1, 3, 5}, &ids, &scores); paddle::test::GenerateExample( std::vector{0, 2, 4}, - std::vector{0, 0, 0, 2, - 2}, // the branchs of the first source sentence - // are pruned since finished - std::vector{5, 1}, - &ids, &scores); + std::vector{0, 0, 0, 2, 2}, // the branchs of the first source + // sentence are pruned since finished + std::vector{5, 1}, &ids, &scores); ASSERT_EQ(ids.size(), 5UL); ASSERT_EQ(scores.size(), 5UL); diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 887d28f5875e3..90b6359f447ef 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/beam_search_op.cu.cc b/paddle/fluid/operators/beam_search_op.cu.cc index 4ef9476eee5d3..15aca070221b0 100644 --- a/paddle/fluid/operators/beam_search_op.cu.cc +++ b/paddle/fluid/operators/beam_search_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/beam_search_op.h" + #include "paddle/fluid/framework/op_registry.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/beam_search_op_npu.cc b/paddle/fluid/operators/beam_search_op_npu.cc index cae3d0e55fc5d..f5fa0ac026d57 100644 --- a/paddle/fluid/operators/beam_search_op_npu.cc +++ b/paddle/fluid/operators/beam_search_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/beam_search_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/beam_search_op.h" namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( diff --git a/paddle/fluid/operators/benchmark/CMakeLists.txt b/paddle/fluid/operators/benchmark/CMakeLists.txt index e5023d8eb354a..e05011eaf6b3a 100644 --- a/paddle/fluid/operators/benchmark/CMakeLists.txt +++ b/paddle/fluid/operators/benchmark/CMakeLists.txt @@ -1,3 +1,14 @@ -cc_test(op_tester SRCS op_tester.cc op_tester_config.cc - DEPS memory timer framework_proto proto_desc lod_tensor op_registry - device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} eigen_function) +cc_test( + op_tester + SRCS op_tester.cc op_tester_config.cc + DEPS memory + timer + framework_proto + proto_desc + lod_tensor + op_registry + device_context + scope + ${GLOB_OP_LIB} + ${GLOB_OPERATOR_DEPS} + eigen_function) diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc index 4b1593b1f8b40..fc01eef8058c3 100644 --- a/paddle/fluid/operators/benchmark/op_tester.cc +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/benchmark/op_tester.h" + #include + #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_info.h" diff --git a/paddle/fluid/operators/benchmark/op_tester.h b/paddle/fluid/operators/benchmark/op_tester.h index 6acd42c8675cb..217fbe2653e3d 100644 --- a/paddle/fluid/operators/benchmark/op_tester.h +++ b/paddle/fluid/operators/benchmark/op_tester.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/benchmark/op_tester_config.h" diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc index e9477798858d1..d7a055ede1b73 100644 --- a/paddle/fluid/operators/benchmark/op_tester_config.cc +++ b/paddle/fluid/operators/benchmark/op_tester_config.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/benchmark/op_tester_config.h" + #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/operators/bilateral_slice_op.cc b/paddle/fluid/operators/bilateral_slice_op.cc index 675566504c211..124441093d3a5 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cc +++ b/paddle/fluid/operators/bilateral_slice_op.cc @@ -10,9 +10,11 @@ limitations under the License. */ #include "paddle/fluid/operators/bilateral_slice_op.h" + #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu index e7bf6d212dcf1..f20debdf0b815 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cu +++ b/paddle/fluid/operators/bilateral_slice_op.cu @@ -11,6 +11,7 @@ #include #include + #include "paddle/fluid/operators/bilateral_slice_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -167,11 +168,11 @@ class BilateralSliceOpCUDAKernel : public framework::OpKernel { platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), total_count); - BilateralSliceCudaForwardKernel< - T><<>>( - output_data, grid_data, guide_data, input_data, grid_sizes, has_offset, - total_count, output_dims[1]); + BilateralSliceCudaForwardKernel + <<>>( + output_data, grid_data, guide_data, input_data, grid_sizes, + has_offset, total_count, output_dims[1]); } }; @@ -475,29 +476,29 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel { platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), grid_count); - BilateralSliceCudaGridGradKernel< - T><<>>( - grid_grad_data, output_grad_data, guide_data, input_data, grid_sizes, - has_offset, grid_count, output_chans); + BilateralSliceCudaGridGradKernel + <<>>( + grid_grad_data, output_grad_data, guide_data, input_data, + grid_sizes, has_offset, grid_count, output_chans); config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), guide_count); - BilateralSliceCudaGuideGradKernel< - T><<>>( - guide_grad_data, output_grad_data, grid_data, guide_data, input_data, - grid_sizes, has_offset, guide_count, output_chans); + BilateralSliceCudaGuideGradKernel + <<>>( + guide_grad_data, output_grad_data, grid_data, guide_data, + input_data, grid_sizes, has_offset, guide_count, output_chans); config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_count); - BilateralSliceCudaInputGradKernel< - T><<>>( - input_grad_data, output_grad_data, grid_data, guide_data, grid_sizes, - has_offset, input_count, output_chans); + BilateralSliceCudaInputGradKernel + <<>>( + input_grad_data, output_grad_data, grid_data, guide_data, + grid_sizes, has_offset, input_count, output_chans); } }; diff --git a/paddle/fluid/operators/bilateral_slice_op.h b/paddle/fluid/operators/bilateral_slice_op.h index a388f4763ec68..66783f151ea06 100644 --- a/paddle/fluid/operators/bilateral_slice_op.h +++ b/paddle/fluid/operators/bilateral_slice_op.h @@ -12,6 +12,7 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/hostdevice.h" diff --git a/paddle/fluid/operators/bmm_op.cc b/paddle/fluid/operators/bmm_op.cc index 6b5f4755d771e..16066c1a13e41 100644 --- a/paddle/fluid/operators/bmm_op.cc +++ b/paddle/fluid/operators/bmm_op.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/bmm_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/bmm_op.h b/paddle/fluid/operators/bmm_op.h index 3fecb55caaeea..271a74a44442c 100644 --- a/paddle/fluid/operators/bmm_op.h +++ b/paddle/fluid/operators/bmm_op.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/operators/bmm_op_xpu.cc b/paddle/fluid/operators/bmm_op_xpu.cc index cc18558027982..348f25d46b4c5 100644 --- a/paddle/fluid/operators/bmm_op_xpu.cc +++ b/paddle/fluid/operators/bmm_op_xpu.cc @@ -16,8 +16,8 @@ #include #include -#include "paddle/fluid/operators/matmul_v2_op.h" +#include "paddle/fluid/operators/matmul_v2_op.h" #include "paddle/fluid/operators/xpu_api_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h" diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index bbe4bb08adf27..afa7aee445043 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/bpr_loss_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index 993bc0fccf07d..fd6df2c159470 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -61,8 +61,9 @@ class BprLossOpKernel : public framework::OpKernel { const int64_t* label_data = labels->data(); for (int i = 0; i < step_size; ++i) { int lbl_pos = label_data[i]; - PADDLE_ENFORCE_GE(lbl_pos, 0, platform::errors::InvalidArgument( - "label data %d is illegal.", lbl_pos)); + PADDLE_ENFORCE_GE(lbl_pos, 0, + platform::errors::InvalidArgument( + "label data %d is illegal.", lbl_pos)); PADDLE_ENFORCE_LT(lbl_pos, class_num, platform::errors::InvalidArgument( "label data %d is illegal.", lbl_pos)); diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index 1063a8b799215..53146417f2149 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -20,8 +20,8 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; using framework::DDim; +using framework::Tensor; class BroadcastTensorsOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 76e0f23df2168..f0146994c1f7e 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cast_op.h" + #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 034cb47fab189..2f222d23e7cba 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/transform.h" - #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/kernels/cast_kernel.h" diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc index 64324d9772b47..8551d799cc39b 100644 --- a/paddle/fluid/operators/cast_op_xpu.cc +++ b/paddle/fluid/operators/cast_op_xpu.cc @@ -19,9 +19,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/platform/float16.h" -#include "xpu/refactor/math.h" - #include "paddle/phi/kernels/cast_kernel.h" +#include "xpu/refactor/math.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/center_loss_op.cc b/paddle/fluid/operators/center_loss_op.cc index cd1aa9d9c841a..add0bf966d933 100644 --- a/paddle/fluid/operators/center_loss_op.cc +++ b/paddle/fluid/operators/center_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/center_loss_op.h" + #include #include diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu index 549bb5ae75aff..b46feeae64bd4 100644 --- a/paddle/fluid/operators/center_loss_op.cu +++ b/paddle/fluid/operators/center_loss_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/operators/center_loss_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/center_loss_op.h b/paddle/fluid/operators/center_loss_op.h index ed266e9ac7dc5..18769fed37ba9 100644 --- a/paddle/fluid/operators/center_loss_op.h +++ b/paddle/fluid/operators/center_loss_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/transform.h" diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc index dfb0ad96b0be2..83bdaa2de7db1 100644 --- a/paddle/fluid/operators/chunk_eval_op.cc +++ b/paddle/fluid/operators/chunk_eval_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/chunk_eval_op.h" + #include #include @@ -55,11 +56,12 @@ class ChunkEvalOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( (inference_dim.size() == 3 && inference_dim[2] == 1) || inference_dim.size() == 2, - true, platform::errors::InvalidArgument( - "when Input(SeqLength) is provided, Input(Inference) " - "should be of dim 3 (batch_size, bucket, 1) or dim 2 " - "(batch_size, bucket), but received [%s].", - inference_dim)); + true, + platform::errors::InvalidArgument( + "when Input(SeqLength) is provided, Input(Inference) " + "should be of dim 3 (batch_size, bucket, 1) or dim 2 " + "(batch_size, bucket), but received [%s].", + inference_dim)); auto seq_length_dim = ctx->GetInputDim("SeqLength"); PADDLE_ENFORCE_LE(seq_length_dim.size(), 2, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index 862a0d04fbdfe..f2a4201fd960d 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -1,19 +1,67 @@ include(operators) -cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) -cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor transform_type cinn) +cc_library( + cinn_op_helper + SRCS cinn_op_helper.cc + DEPS operator device_context) +cc_library( + cinn_launch_context + SRCS cinn_launch_context.cc + DEPS ddim + lod_tensor + scope + proto_desc + graph + build_strategy + device_context + parallel_executor + transform_type + cinn) -SET(CINN_OP_DEPS parallel_executor string_helper variable_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type) +set(CINN_OP_DEPS + parallel_executor + string_helper + variable_helper + cinn + cinn_compiler + cinn_op_helper + cinn_launch_context + transform_type) register_operators(DEPS ${CINN_OP_DEPS}) -if (WITH_TESTING) - cc_test(cinn_launch_context_test SRCS cinn_launch_context_test.cc DEPS ddim lod_tensor scope proto_desc graph cinn_launch_context cinn_instruction_run_op cinn) - set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN") +if(WITH_TESTING) + cc_test( + cinn_launch_context_test + SRCS cinn_launch_context_test.cc + DEPS ddim + lod_tensor + scope + proto_desc + graph + cinn_launch_context + cinn_instruction_run_op + cinn) + set_tests_properties(cinn_launch_context_test PROPERTIES LABELS + "RUN_TYPE=CINN") - SET(CINN_RUN_ENVIRONMENT "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda") - cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op gflags) - set_tests_properties(cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT "${CINN_RUN_ENVIRONMENT}") + set(CINN_RUN_ENVIRONMENT + "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda" + ) + cc_test( + cinn_launch_op_test + SRCS cinn_launch_op_test.cc + DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op + elementwise_add_op gflags) + set_tests_properties( + cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT + "${CINN_RUN_ENVIRONMENT}") - cc_test(cinn_instruction_run_op_test SRCS cinn_instruction_run_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op) - set_tests_properties(cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT "${CINN_RUN_ENVIRONMENT}") + cc_test( + cinn_instruction_run_op_test + SRCS cinn_instruction_run_op_test.cc + DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op + elementwise_add_op) + set_tests_properties( + cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT + "${CINN_RUN_ENVIRONMENT}") endif() diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc index 0903c53e5ecac..be9829dd43b17 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/cinn/cinn_instruction_run_op.h" + #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" #include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/platform/enforce.h" @@ -48,12 +49,12 @@ class CinnInstructionRunOp : public framework::OperatorWithKernel { protected: /* [Why use single type kernel]: - * - * Whether the kernel data type is int, float or other type, - * which has no effect on its execution logic, so directly - * specified a data type here. - * - */ + * + * Whether the kernel data type is int, float or other type, + * which has no effect on its execution logic, so directly + * specified a data type here. + * + */ framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType(framework::proto::VarType::FP32, diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc index ea72f6c53745a..afa350ef116c4 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cinn/cinn_instruction_run_op.h" + #include "paddle/fluid/framework/op_registry.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.h b/paddle/fluid/operators/cinn/cinn_instruction_run_op.h index 81c2d23d3f149..13483d78f49b6 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.h +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.h @@ -18,6 +18,7 @@ #include #include #include + #include "cinn/hlir/framework/graph_compiler.h" #include "cinn/hlir/framework/instruction.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc index 68bc3a0eb5c53..cbfab3090c0ad 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index a660d59fb4c0f..6b70efee86f57 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -13,10 +13,12 @@ // limitations under the License. #include "paddle/fluid/operators/cinn/cinn_launch_context.h" + #include #include #include #include + #include "cinn/hlir/framework/graph_compiler.h" #include "cinn/hlir/framework/instruction.h" #include "cinn/hlir/framework/scope.h" @@ -43,13 +45,13 @@ namespace paddle { namespace operators::details { -using framework::Scope; using framework::LoDTensor; using framework::ParallelExecutor; +using framework::Scope; using CinnInstruction = ::cinn::hlir::framework::Instruction; using CinnRuntimeProgram = ::cinn::hlir::framework::Program; -using framework::paddle2cinn::Name2VarInfoMap; using framework::paddle2cinn::kMemOptVarInfoFromMainGraph; +using framework::paddle2cinn::Name2VarInfoMap; CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, const CinnCompiledObject& compiled_obj) diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index ed5e4383d83d2..0bbbcc8b03177 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc index ecbfbf2f92ebf..cd4465d355f35 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cinn/cinn_launch_context.h" + #include #include #include + #include "cinn/auto_schedule/auto_tuner.h" #include "cinn/common/target.h" #include "cinn/common/type.h" @@ -38,11 +40,11 @@ USE_OP(cinn_instruction_run); namespace paddle { namespace operators::details { +using framework::LoDTensor; using framework::OpDesc; +using framework::ParallelExecutor; using framework::ProgramDesc; -using framework::LoDTensor; using framework::ir::Graph; -using framework::ParallelExecutor; using framework::paddle2cinn::Name2VarInfoMap; using CinnShape = ::cinn::hlir::framework::Shape; using CinnInstruction = ::cinn::hlir::framework::Instruction; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cc index 0a9b66bc92c15..3b0198613dbdb 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/operators/cinn/cinn_launch_op.h" + #include #include + #include "cinn/hlir/framework/graph_compiler.h" #include "cinn/runtime/cinn_runtime.h" #include "cinn/runtime/flags.h" diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc index 9dfd53834e937..fb5a48ca3d0b4 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cinn/cinn_launch_op.h" + #include "paddle/fluid/framework/operator.h" /* see [Why use single type kernel] */ diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index f40b788dfb5b3..62c79faafec72 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -77,16 +77,16 @@ class CinnLaunchOpKernel : public framework::OpKernel { std::map inputs_name2tensor; std::vector input_x_variable_names; std::vector input_no_need_buffer_variable_names; - auto add_name2tensor_fn = [&inputs_name2tensor]( - const std::vector& variable_names, - const std::vector& tensors) { - std::transform( - variable_names.begin(), variable_names.end(), tensors.begin(), - std::inserter(inputs_name2tensor, inputs_name2tensor.end()), - [](const std::string& name, const LoDTensor* tensor) { - return std::make_pair(name, tensor); - }); - }; + auto add_name2tensor_fn = + [&inputs_name2tensor](const std::vector& variable_names, + const std::vector& tensors) { + std::transform( + variable_names.begin(), variable_names.end(), tensors.begin(), + std::inserter(inputs_name2tensor, inputs_name2tensor.end()), + [](const std::string& name, const LoDTensor* tensor) { + return std::make_pair(name, tensor); + }); + }; auto input_x_tensors = ctx.MultiInput(kX); if (!input_x_tensors.empty()) { diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index b0bd043f43247..9ed9fad36a3d7 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -13,10 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cinn/cinn_launch_op.h" + #include + #include #include #include + #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.cc b/paddle/fluid/operators/cinn/cinn_op_helper.cc index 3fb9c822c77c4..26fee2d9e577c 100644 --- a/paddle/fluid/operators/cinn/cinn_op_helper.cc +++ b/paddle/fluid/operators/cinn/cinn_op_helper.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/cinn/cinn_op_helper.h" + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.h b/paddle/fluid/operators/cinn/cinn_op_helper.h index e542134b94689..55ee3789c0a82 100644 --- a/paddle/fluid/operators/cinn/cinn_op_helper.h +++ b/paddle/fluid/operators/cinn/cinn_op_helper.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/operator.h" // We define some common names or utility functions diff --git a/paddle/fluid/operators/cinn/test_helper.h b/paddle/fluid/operators/cinn/test_helper.h index 9720a5309fa6e..4e06882279bee 100644 --- a/paddle/fluid/operators/cinn/test_helper.h +++ b/paddle/fluid/operators/cinn/test_helper.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index a23cf2815d8fe..7192b415c27ec 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -15,17 +15,20 @@ #ifdef PADDLE_WITH_HIP #include #include + #include typedef hiprandState curandState; namespace cub = hipcub; #else #include #include + #include #endif #include #include + #include "paddle/fluid/operators/class_center_sample_op.h" #include "paddle/phi/api/include/tensor.h" diff --git a/paddle/fluid/operators/class_center_sample_op.h b/paddle/fluid/operators/class_center_sample_op.h index 24ce9ace3bf11..8f12e90e18539 100644 --- a/paddle/fluid/operators/class_center_sample_op.h +++ b/paddle/fluid/operators/class_center_sample_op.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 8822fffd326e1..379cd4c665314 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -114,10 +114,11 @@ class ClipByNormOp : public framework::OperatorWithKernel { "Output(Out) of ClipByNormOp should not be null. " "Please check if it is created correctly.")); auto max_norm = ctx->Attrs().Get("max_norm"); - PADDLE_ENFORCE_GT(max_norm, 0, platform::errors::InvalidArgument( - "max_norm should be greater than 0. " - "Received max_norm is %f.", - max_norm)); + PADDLE_ENFORCE_GT( + max_norm, 0, + platform::errors::InvalidArgument("max_norm should be greater than 0. " + "Received max_norm is %f.", + max_norm)); auto x_dims = ctx->GetInputDim("X"); ctx->SetOutputDim("Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/clip_by_norm_op_xpu.cc b/paddle/fluid/operators/clip_by_norm_op_xpu.cc index 7c91f06a8d722..62c2608f11c4c 100644 --- a/paddle/fluid/operators/clip_by_norm_op_xpu.cc +++ b/paddle/fluid/operators/clip_by_norm_op_xpu.cc @@ -13,9 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/clip_by_norm_op.h" #include +#include "paddle/fluid/operators/clip_by_norm_op.h" + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc index 6e898d31663fa..46eb9448d9d6b 100644 --- a/paddle/fluid/operators/clip_op.cc +++ b/paddle/fluid/operators/clip_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -179,14 +180,13 @@ REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad, ops::ClipGradInplaceInferer, ops::ClipDoubleGradOpMaker, ops::ClipDoubleGradOpMaker); -REGISTER_OP_VERSION(clip) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(clip).AddCheckpoint( + R"ROC( Upgrade clip add a new input [Min])ROC", - paddle::framework::compatible::OpVersionDesc() - .NewInput("Min", - "Pass the mix, min value as input, not attribute. Min is " - "dispensable.") - .NewInput("Max", - "Pass the mix, min value as input, not attribute. Max is " - "dispensable.")); + paddle::framework::compatible::OpVersionDesc() + .NewInput("Min", + "Pass the mix, min value as input, not attribute. Min is " + "dispensable.") + .NewInput("Max", + "Pass the mix, min value as input, not attribute. Max is " + "dispensable.")); diff --git a/paddle/fluid/operators/clip_op_xpu.cc b/paddle/fluid/operators/clip_op_xpu.cc index c551312837274..a99e5d2506fad 100644 --- a/paddle/fluid/operators/clip_op_xpu.cc +++ b/paddle/fluid/operators/clip_op_xpu.cc @@ -61,10 +61,11 @@ class ClipXPUKernel : public framework::OpKernel { auto out_data = reinterpret_cast(out->data()); int r = xpu::clip_v2(dev_ctx.x_context(), x_data, out_data, x->numel(), min, max); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(clip_v2) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(clip_v2) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); } }; diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index aa5a38e4dbf08..af15ca2acb7f4 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" @@ -265,11 +266,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel { ->ShareDataWith(fused_tensor->Slice( static_cast(offset), static_cast(offset + len))) .Resize(dim); - len = use_align - ? platform::Alignment(len * size_of_dtype, context.GetPlace(), - align_size) / - size_of_dtype - : len; + len = use_align ? platform::Alignment(len * size_of_dtype, + context.GetPlace(), align_size) / + size_of_dtype + : len; ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")" << " address: " << out_tensors[i]->data() << " len: " << len << ", "; offset += len; @@ -304,12 +304,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel { size, 0, platform::errors::InvalidArgument( "The number of tensor `%s`'s elements is 0.", var_names[i])); - auto len = - use_align - ? platform::Alignment(static_cast(size) * size_of_dtype, - place, align_size) / - size_of_dtype - : static_cast(size); + auto len = use_align ? platform::Alignment( + static_cast(size) * size_of_dtype, + place, align_size) / + size_of_dtype + : static_cast(size); const void *ptr = lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr; VLOG(4) << size << " " << len; diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 89c573d2dcb71..c94b0c93eb34a 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -2,72 +2,154 @@ include(operators) set(COLLECTIVE_DEPS "") -set(COLLECTIVE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +set(COLLECTIVE_COMPILE_FLAGS + "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor" +) -file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") +file( + GLOB OPS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "*_op.cc") list(REMOVE_DUPLICATES OPS) foreach(src ${OPS}) - set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS}) + set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS + ${COLLECTIVE_COMPILE_FLAGS}) endforeach() -register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op c_gen_hccl_id_op gen_hccl_id_op c_gen_cncl_id_op DEPS ${COLLECTIVE_DEPS}) +register_operators( + EXCLUDES + c_gen_bkcl_id_op + gen_bkcl_id_op + c_gen_nccl_id_op + gen_nccl_id_op + c_gen_hccl_id_op + gen_hccl_id_op + c_gen_cncl_id_op + DEPS + ${COLLECTIVE_DEPS}) if(WITH_NCCL OR WITH_RCCL) - set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper) - op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) - op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper) + op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) + op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() if(WITH_GLOO) - set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) endif() if(WITH_XPU_BKCL) - set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper) - op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) - op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper) + op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) + op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) endif() if(WITH_CNCL) - set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper) - op_library(c_gen_cncl_id_op DEPS ${COLLECTIVE_DEPS}) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper) + op_library(c_gen_cncl_id_op DEPS ${COLLECTIVE_DEPS}) endif() if(WITH_ASCEND_CL) - cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope) - set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper) - op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) - op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) + cc_library( + gen_hccl_id_op_helper + SRCS gen_hccl_id_op_helper.cc + DEPS dynload_warpctc dynamic_loader scope) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper + gen_hccl_id_op_helper) + op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) + op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() -set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE) -set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency") +set(OPERATOR_DEPS + ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} + PARENT_SCOPE) +set(GLOB_COLLECTIVE_DEPS + ${COLLECTIVE_DEPS} + CACHE INTERNAL "collective dependency") if(WITH_ASCEND_CL) - set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper - gen_hccl_id_op op_registry ascend_hccl flags - dynamic_loader dynload_warpctc scope device_context enforce executor) - cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc - DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc - DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc - DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc - DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc - DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc - DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc - DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc - DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(checknumeric SRCS checknumeric_npu_test.cc - DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc - DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) - cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc - DEPS op_registry elementwise_add_op c_sync_calc_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) + set(COMMON_TEST_DEPS_FOR_HCOM + c_comm_init_hccl_op + c_gen_hccl_id_op + gen_hccl_id_op_helper + gen_hccl_id_op + op_registry + ascend_hccl + flags + dynamic_loader + dynload_warpctc + scope + device_context + enforce + executor) + cc_test( + c_broadcast_op_npu_test + SRCS c_broadcast_op_npu_test.cc + DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test( + c_allreduce_sum_op_npu_test + SRCS c_allreduce_sum_op_npu_test.cc + DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test( + c_reducescatter_op_npu_test + SRCS c_reducescatter_op_npu_test.cc + DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test( + c_allgather_op_npu_test + SRCS c_allgather_op_npu_test.cc + DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test( + c_reduce_sum_op_npu_test + SRCS c_reduce_sum_op_npu_test.cc + DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test( + c_allreduce_max_op_npu_test + SRCS c_allreduce_max_op_npu_test.cc + DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test( + send_v2_op_npu_test + SRCS send_v2_op_npu_test.cc + DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test( + recv_v2_op_npu_test + SRCS recv_v2_op_npu_test.cc + DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test( + checknumeric + SRCS checknumeric_npu_test.cc + DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test( + c_sync_comm_stream_op_npu_test + SRCS c_sync_comm_stream_op_npu_test.cc + DEPS op_registry + c_broadcast_op + c_comm_init_hccl_op + c_sync_comm_stream_op + c_gen_hccl_id_op + gen_hccl_id_op_helper + ${COLLECTIVE_DEPS} + ascend_hccl + dynamic_loader + dynload_warpctc + scope + device_context + enforce + executor) + cc_test( + c_sync_calc_stream_op_npu_test + SRCS c_sync_calc_stream_op_npu_test.cc + DEPS op_registry + elementwise_add_op + c_sync_calc_stream_op + c_gen_hccl_id_op + gen_hccl_id_op_helper + ${COLLECTIVE_DEPS} + ascend_hccl + dynamic_loader + dynload_warpctc + scope + device_context + enforce + executor) endif() diff --git a/paddle/fluid/operators/collective/allreduce_op.cc b/paddle/fluid/operators/collective/allreduce_op.cc index 63b135a74cf4b..53843104dc5fd 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/collective/allreduce_op.h" + #include // NOLINT #include -#include "paddle/fluid/operators/collective/allreduce_op.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index 0e0ea72208488..bb498047a50b0 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -91,6 +91,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(alltoall, ops::AllToAllOpCUDAKernel, ops::AllToAllOpCUDAKernel, +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + ops::AllToAllOpCUDAKernel, +#endif ops::AllToAllOpCUDAKernel, ops::AllToAllOpCUDAKernel, ops::AllToAllOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/barrier_op.h b/paddle/fluid/operators/collective/barrier_op.h index 6df4d24c0edf9..88333f36413b8 100644 --- a/paddle/fluid/operators/collective/barrier_op.h +++ b/paddle/fluid/operators/collective/barrier_op.h @@ -25,6 +25,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif diff --git a/paddle/fluid/operators/collective/broadcast_op.cc b/paddle/fluid/operators/collective/broadcast_op.cc index 61e27887b68c7..071b0350de6d2 100644 --- a/paddle/fluid/operators/collective/broadcast_op.cc +++ b/paddle/fluid/operators/collective/broadcast_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc index c4e779698ccca..f20ec75a97006 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cc @@ -26,8 +26,9 @@ class CAllGatherOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "AllGather"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Input", "Out", "AllGather"); int nranks = ctx->Attrs().Get("nranks"); - PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( - "The value of nranks should be >=2.")); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::InvalidArgument( + "The value of nranks should be >=2.")); framework::DDim dim = ctx->GetInputDim("X"); dim[0] = dim[0] * nranks; if (dim[0] < 0) dim[0] = -1; diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index 0d97ffa96dc5c..62ed916d6e08c 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -90,6 +90,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(c_allgather, ops::CAllGatherOpCUDAKernel, ops::CAllGatherOpCUDAKernel, +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + ops::CAllGatherOpCUDAKernel, +#endif ops::CAllGatherOpCUDAKernel, ops::CAllGatherOpCUDAKernel, ops::CAllGatherOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op.h b/paddle/fluid/operators/collective/c_allgather_op.h index aa2040a2693b2..7f8c7b2f50e7c 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.h +++ b/paddle/fluid/operators/collective/c_allgather_op.h @@ -25,6 +25,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc index 5339293da0fe2..f9ffdea790807 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/collective/c_allgather_op.h" - #include +#include "paddle/fluid/operators/collective/c_allgather_op.h" + #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc index 7206dd01bcaa3..087f6b879c328 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -17,23 +17,22 @@ limitations under the License. */ #endif #include + #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc index 0946ad8aca65e..5c2d6981bad03 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -17,23 +17,22 @@ limitations under the License. */ #endif #include + #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 404f7c017ac41..61cf4cf5b7f5f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -41,6 +41,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif @@ -335,10 +336,11 @@ class CAllReduceOpXPUKernel : public framework::OpKernel { "Invalid reduce type: %d", red_type)); } - PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), sendbuff, recvbuff, numel, - dtype, bkcl_red_type, stream), - BKCL_SUCCESS, platform::errors::PreconditionNotMet( - "BKCL all reduce failed")); + PADDLE_ENFORCE_EQ( + bkcl_all_reduce(comm->comm(), sendbuff, recvbuff, numel, dtype, + bkcl_red_type, stream), + BKCL_SUCCESS, + platform::errors::PreconditionNotMet("BKCL all reduce failed")); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should be compiled with XPU.")); diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc index 8fe7fce21e465..565633c2e7b2d 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc @@ -19,6 +19,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( c_allreduce_sum, ops::CAllReduceOpCUDAKernel, +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + ops::CAllReduceOpCUDAKernel, +#endif ops::CAllReduceOpCUDAKernel, ops::CAllReduceOpCUDAKernel, ops::CAllReduceOpCUDAKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.kps similarity index 58% rename from paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc rename to paddle/fluid/operators/collective/c_allreduce_sum_op.kps index d23572e6d670b..3230d2c9ec331 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.kps @@ -1,4 +1,4 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,10 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef PADDLE_WITH_XPU_KP + +// Please do not modify the following code +#if defined(__CUDA_ARCH__) +#undef __CUDA_ARCH__ +#endif + +#if defined(__CUDACC__) +#undef __CUDACC__ +#endif + +#if defined(__CUDA__) +#undef __CUDA__ +#endif + +#if defined(__NVCC__) +#undef __NVCC__ +#endif + #include "paddle/fluid/operators/collective/c_allreduce_op.h" namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(c_allreduce_sum, - ops::CAllReduceOpXPUKernel) +REGISTER_OP_KERNEL(c_allreduce_sum, KP, plat::XPUPlace, + ops::CAllReduceOpXPUKernel); + +#endif diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index 61e5f27903477..4c76d094bafa5 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -17,20 +17,19 @@ limitations under the License. */ #endif #include + #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index eeae16a0d71f3..478dc85914964 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -98,6 +98,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(c_broadcast, ops::CBroadcastOpCUDAKernel, ops::CBroadcastOpCUDAKernel, +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + ops::CBroadcastOpCUDAKernel, +#endif ops::CBroadcastOpCUDAKernel, ops::CBroadcastOpCUDAKernel, ops::CBroadcastOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_broadcast_op.h b/paddle/fluid/operators/collective/c_broadcast_op.h index eb4acb9a369fc..394ea45efbb7d 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.h +++ b/paddle/fluid/operators/collective/c_broadcast_op.h @@ -24,6 +24,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc index cf4d6a28744b3..e383e78c5dddc 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -17,20 +17,19 @@ limitations under the License. */ #endif #include + #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc index 5820bd318d8bc..c9605f4d1b268 100644 --- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc @@ -15,13 +15,17 @@ limitations under the License. */ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" - #include "paddle/fluid/framework/threadpool.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/device/xpu/bkcl_helper.h" +#endif + namespace paddle { namespace framework { class InferShapeContext; @@ -48,9 +52,9 @@ class CCommInitAllOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "CCommInitAllOp can run on gpu place only")); + // PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true, + // platform::errors::PreconditionNotMet( + // "CCommInitAllOp can run on gpu place only")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) std::vector devices = Attr>("devices"); @@ -61,9 +65,52 @@ class CCommInitAllOp : public framework::OperatorBase { int rid = Attr("ring_id"); platform::NCCLCommContext::Instance().CreateAllNCCLComms(devices, rid); + +#elif defined(PADDLE_WITH_XPU_BKCL) + std::vector devices = Attr>("devices"); + int ring_id = Attr("ring_id"); + + if (devices.empty()) { + int count = platform::GetXPUDeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + + if (devices.size() > 1) { + std::vector place_list_; + for (size_t i = 0; i < devices.size(); ++i) { + auto p = platform::XPUPlace(devices[i]); + place_list_.push_back(p); + } + + // create pthread to bkcl_init_rank on all devices + auto ptr = new platform::BKCLContextMap(place_list_); + ptr->init(); + + for (size_t i = 0; i < devices.size(); ++i) { + platform::BKCLCommContext::Instance().AssignBKCLComm( + ptr->contexts_.at(devices[i]).comm_, devices.size(), devices[i], + devices[i], ring_id); + + VLOG(0) << "bkcl communicator of rank " << devices[i] << " in ring " + << ring_id << " has been created on device " << devices[i]; + + // TODO(WorgenZhang): need release comm_map_ when quit + // std::call_once(once_flag_, []() { + // std::atexit([]() { + // platform::BKCLCommContext::Instance().ReleaseBKCLComms(); }); + // }); + } + + VLOG(0) << "done bkcl_init_rank on all devices"; + } else { + VLOG(0) + << "bkcl_init_rank doesn't support on one device, skip init process"; + } #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); + "PaddlePaddle should compile with GPU or XPU.")); #endif } }; diff --git a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc index 86c966378ccb6..3ea24f6e654f0 100644 --- a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #endif #include + #include #include diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index 82d3b1b1dbfea..a41d4293c90e4 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -71,8 +71,9 @@ class CCommInitOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( platform::is_gpu_place(place) || platform::is_xpu_place(place) || platform::is_mlu_place(place), - true, platform::errors::PreconditionNotMet( - "CCommInitOp can run on gpu or xpu or mlu place only.")); + true, + platform::errors::PreconditionNotMet( + "CCommInitOp can run on gpu or xpu or mlu place only.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL) @@ -97,18 +98,9 @@ class CCommInitOp : public framework::OperatorBase { if (Attr("device_id") >= 0) { device_id = Attr("device_id"); } - -#if defined(PADDLE_WITH_XPU_BKCL) && defined(PADDLE_WITH_HETERPS) && \ - defined(PADDLE_WITH_PSLIB) - // XPUPS rank_id only equals 0, so replace rank_id with device_id - CommContext::Instance().CreateComm(comm_id, nranks, device_id, device_id, - rid); -#else int rank_id = Attr("rank"); CommContext::Instance().CreateComm(comm_id, nranks, rank_id, device_id, rid); -#endif - #endif } }; diff --git a/paddle/fluid/operators/collective/c_concat_op.cc b/paddle/fluid/operators/collective/c_concat_op.cc index 551fde2116258..155db23a0391a 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cc @@ -27,17 +27,19 @@ class CConcatOp : public framework::OperatorWithKernel { int nranks = ctx->Attrs().Get("nranks"); int rank = ctx->Attrs().Get("rank"); int ring_id = ctx->Attrs().Get("ring_id"); - PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( - "The number of ranks (%d) for c_concat " - "must be greater than 1.", - nranks)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::InvalidArgument( + "The number of ranks (%d) for c_concat " + "must be greater than 1.", + nranks)); PADDLE_ENFORCE_GE( ring_id, 0, platform::errors::InvalidArgument( "The ring_id (%d) for c_concat must be non-negative.", ring_id)); PADDLE_ENFORCE_GE( - rank, 0, platform::errors::InvalidArgument( - "The rank (%d) for c_concat must be non-negative.", rank)); + rank, 0, + platform::errors::InvalidArgument( + "The rank (%d) for c_concat must be non-negative.", rank)); PADDLE_ENFORCE_LT(rank, nranks, platform::errors::InvalidArgument( "The value of rank (%d) for c_concat must " diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index d3d9db0e5f87e..98df6c8688e74 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/collective/c_concat_op.h" + #include -#include "paddle/fluid/operators/collective/c_concat_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/phi/api/include/tensor.h" diff --git a/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc b/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc index ec174ad0e56bc..3bd7e3ceffa2a 100644 --- a/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc @@ -21,9 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - #include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_gen_cncl_id_op.cc b/paddle/fluid/operators/collective/c_gen_cncl_id_op.cc index 7e65fba571800..d2e85171a4a40 100644 --- a/paddle/fluid/operators/collective/c_gen_cncl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_cncl_id_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include "paddle/fluid/framework/op_proto_maker.h" @@ -21,9 +22,8 @@ limitations under the License. */ #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - #include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc index 6eec385388090..3f81eab7bc2c4 100644 --- a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc @@ -19,12 +19,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device/npu/dynload/hccl.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -#include "paddle/fluid/platform/device/npu/dynload/hccl.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index d392beb3a4834..d4f1fe1c18297 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -20,9 +20,8 @@ limitations under the License. */ #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - #include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 4e9edb53730c2..5399a4aacbe2c 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -40,6 +40,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif @@ -261,10 +262,11 @@ class CReduceOpXPUKernel : public framework::OpKernel { "Invalid reduce type: %d", red_type)); } - PADDLE_ENFORCE_EQ(bkcl_reduce(comm->comm(), sendbuff, recvbuff, numel, - dtype, bkcl_red_type, root, stream), - BKCL_SUCCESS, platform::errors::PreconditionNotMet( - "BKCL all reduce failed")); + PADDLE_ENFORCE_EQ( + bkcl_reduce(comm->comm(), sendbuff, recvbuff, numel, dtype, + bkcl_red_type, root, stream), + BKCL_SUCCESS, + platform::errors::PreconditionNotMet("BKCL all reduce failed")); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should be compiled with XPU.")); @@ -319,9 +321,10 @@ class CReduceOpCUDAKernel : public framework::OpKernel { break; default: - PADDLE_ENFORCE_EQ(true, false, platform::errors::InvalidArgument( - "red_type must be one of kRedSum, " - "kRedMax, kRedMin, kRedProd.")); + PADDLE_ENFORCE_EQ(true, false, + platform::errors::InvalidArgument( + "red_type must be one of kRedSum, " + "kRedMax, kRedMin, kRedProd.")); } PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc index c4e410d04da5f..3bd55ea370465 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -17,20 +17,19 @@ limitations under the License. */ #endif #include + #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_reduce_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc index 9b05e940d4f60..fda192c45e779 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -76,6 +76,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(c_reducescatter, ops::CReduceScatterOpCUDAKernel, ops::CReduceScatterOpCUDAKernel, +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + ops::CReduceScatterOpCUDAKernel, +#endif ops::CReduceScatterOpCUDAKernel, ops::CReduceScatterOpCUDAKernel, ops::CReduceScatterOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc index 8b498787c69db..16437d4769eb0 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -17,23 +17,22 @@ limitations under the License. */ #endif #include + #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/c_scatter_op.h b/paddle/fluid/operators/collective/c_scatter_op.h index 71a5f488ebc11..ee07d7663b2ec 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.h +++ b/paddle/fluid/operators/collective/c_scatter_op.h @@ -24,6 +24,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index 4c9fb14842489..71216538a4e12 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -373,15 +373,15 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { const int end_index = start_index + D; if (label_type == framework::proto::VarType::INT32) { - MaskLabelByIndexGrad<<>>( - logit_grad_2d.data(), loss_grad->data(), - labels->data(), start_index, end_index, N, D); + MaskLabelByIndexGrad + <<>>( + logit_grad_2d.data(), loss_grad->data(), + labels->data(), start_index, end_index, N, D); } else if (label_type == framework::proto::VarType::INT64) { - MaskLabelByIndexGrad<<>>( - logit_grad_2d.data(), loss_grad->data(), - labels->data(), start_index, end_index, N, D); + MaskLabelByIndexGrad + <<>>( + logit_grad_2d.data(), loss_grad->data(), + labels->data(), start_index, end_index, N, D); } } }; diff --git a/paddle/fluid/operators/collective/c_split_op.cc b/paddle/fluid/operators/collective/c_split_op.cc index 37ec989f3f981..32f3ff9eab10d 100644 --- a/paddle/fluid/operators/collective/c_split_op.cc +++ b/paddle/fluid/operators/collective/c_split_op.cc @@ -27,17 +27,19 @@ class CSplitOp : public framework::OperatorWithKernel { int nranks = ctx->Attrs().Get("nranks"); int rank = ctx->Attrs().Get("rank"); int ring_id = ctx->Attrs().Get("ring_id"); - PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( - "The number of ranks (%d) for c_split " - "must be greater than 1.", - nranks)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::InvalidArgument( + "The number of ranks (%d) for c_split " + "must be greater than 1.", + nranks)); PADDLE_ENFORCE_GE( ring_id, 0, platform::errors::InvalidArgument( "The ring_id (%d) for c_split must be non-negative.", ring_id)); PADDLE_ENFORCE_GE( - rank, 0, platform::errors::InvalidArgument( - "The rank (%d) for c_split must be non-negative.", rank)); + rank, 0, + platform::errors::InvalidArgument( + "The rank (%d) for c_split must be non-negative.", rank)); PADDLE_ENFORCE_LT(rank, nranks, platform::errors::InvalidArgument( "The value of rank (%d) for c_split must " diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu index a0c4182468f07..1dce4ce04b56f 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -59,10 +59,11 @@ class CSplitOpCUDAKernel : public framework::OpKernel { int rank = ctx.Attr("rank"); auto place = ctx.GetPlace(); - PADDLE_ENFORCE_GE(rank, 0, platform::errors::PreconditionNotMet( - "The value of rank (%d) for c_split must be " - "greater than or equal to 0.", - rank)); + PADDLE_ENFORCE_GE(rank, 0, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "greater than or equal to 0.", + rank)); PADDLE_ENFORCE_GE(nranks, 2, platform::errors::PreconditionNotMet( "The value of nranks (%d) for c_split must be " diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 6ad22ff8b19eb..bf7434686b97a 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -23,7 +23,6 @@ class CSyncCalcStreamOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) Dependency of the variable need to sync"); AddComment(R"DOC( CSyncCalcStream Operator - Call calculation stream synchronization. )DOC"); } diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.kps b/paddle/fluid/operators/collective/c_sync_calc_stream_op.kps new file mode 100644 index 0000000000000..65126f416c4aa --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.kps @@ -0,0 +1,42 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU_KP + +// Please do not modify the following code +#if defined(__CUDA_ARCH__) +#undef __CUDA_ARCH__ +#endif + +#if defined(__CUDACC__) +#undef __CUDACC__ +#endif + +#if defined(__CUDA__) +#undef __CUDA__ +#endif + +#if defined(__NVCC__) +#undef __NVCC__ +#endif + +#include "paddle/fluid/operators/collective/c_sync_calc_stream_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_KERNEL(c_sync_calc_stream, KP, plat::XPUPlace, + ops::CSyncCalcStreamKernel); + +#endif diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index 5a9a00aa8e4d2..a3717459a2dac 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -11,25 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - -#include "paddle/fluid/framework/op_registry.h" - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" -#endif - -#if defined(PADDLE_WITH_ASCEND_CL) -#include "paddle/fluid/platform/device/npu/hccl_helper.h" -#endif - -#if defined(PADDLE_WITH_CNCL) -#include "paddle/fluid/platform/device/mlu/cncl_helper.h" -#endif - -#if defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) -#include "paddle/fluid/platform/collective_helper.h" -#endif +#include "paddle/fluid/operators/collective/c_sync_comm_stream_op.h" namespace paddle { namespace operators { @@ -58,62 +40,11 @@ class CSyncCommStreamOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0); AddComment(R"DOC( CSyncCommStream Operator - Call communication stream synchronization. )DOC"); } }; -template -class CSyncCommStreamKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto place = ctx.GetPlace(); - int ring_id = ctx.Attr("ring_id"); - auto stream = - platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); - - platform::GpuStreamSync(stream); - -#elif defined(PADDLE_WITH_ASCEND_CL) - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync comm stream op can run on npu place only for " - "now, but we got %s, please check the environment.", - place.DebugString())); - int ring_id = ctx.Attr("ring_id"); - auto stream = - platform::HCCLCommContext::Instance().Get(ring_id, place)->stream(); - platform::NPUStreamSync(stream); - -#elif defined(PADDLE_WITH_CNCL) - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_mlu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on mlu place only for now.")); - int ring_id = ctx.Attr("ring_id"); - auto stream = - platform::CNCLCommContext::Instance().Get(ring_id, place)->stream(); - platform::MLUStreamSync(stream); -#elif defined(PADDLE_WITH_XPU_BKCL) - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on xpu place only for now.")); - int ring_id = ctx.Attr("ring_id"); - auto comm_dev_ctx = platform::BKCLCommContext::Instance() - .Get(ring_id, place) - ->dev_context(); - comm_dev_ctx->Wait(); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif - } -}; - } // namespace operators } // namespace paddle @@ -127,5 +58,3 @@ REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); REGISTER_OP_NPU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); REGISTER_OP_MLU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); - -REGISTER_OP_XPU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.h b/paddle/fluid/operators/collective/c_sync_comm_stream_op.h new file mode 100644 index 0000000000000..f9dec9303742c --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/device/npu/hccl_helper.h" +#endif + +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/platform/device/mlu/cncl_helper.h" +#endif + +#if defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CSyncCommStreamKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto place = ctx.GetPlace(); + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); + + platform::GpuStreamSync(stream); + +#elif defined(PADDLE_WITH_ASCEND_CL) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync comm stream op can run on npu place only for " + "now, but we got %s, please check the environment.", + place.DebugString())); + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::HCCLCommContext::Instance().Get(ring_id, place)->stream(); + platform::NPUStreamSync(stream); + +#elif defined(PADDLE_WITH_CNCL) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_mlu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on mlu place only for now.")); + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::CNCLCommContext::Instance().Get(ring_id, place)->stream(); + platform::MLUStreamSync(stream); +#elif defined(PADDLE_WITH_XPU_BKCL) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on xpu place only for now.")); + int ring_id = ctx.Attr("ring_id"); + auto comm_dev_ctx = platform::BKCLCommContext::Instance() + .Get(ring_id, place) + ->dev_context(); + comm_dev_ctx->Wait(); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.kps b/paddle/fluid/operators/collective/c_sync_comm_stream_op.kps new file mode 100644 index 0000000000000..bfac7bf5c5b92 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.kps @@ -0,0 +1,42 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU_KP + +// Please do not modify the following code +#if defined(__CUDA_ARCH__) +#undef __CUDA_ARCH__ +#endif + +#if defined(__CUDACC__) +#undef __CUDACC__ +#endif + +#if defined(__CUDA__) +#undef __CUDA__ +#endif + +#if defined(__NVCC__) +#undef __NVCC__ +#endif + +#include "paddle/fluid/operators/collective/c_sync_comm_stream_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_KERNEL(c_sync_comm_stream, KP, plat::XPUPlace, + ops::CSyncCommStreamKernel); + +#endif diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc index 133085ad3f3b0..91b89486c6a4b 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -26,11 +26,10 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc index 36c6f4fadd0fc..b99ac3816352c 100644 --- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc +++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc @@ -17,21 +17,20 @@ limitations under the License. */ #endif #include + #include #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/gen_bkcl_id_op.cc b/paddle/fluid/operators/collective/gen_bkcl_id_op.cc index 1ce8938356895..f60030cec7628 100644 --- a/paddle/fluid/operators/collective/gen_bkcl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_bkcl_id_op.cc @@ -24,11 +24,10 @@ limitations under the License. */ #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" -#include "paddle/fluid/platform/gen_comm_id_helper.h" - namespace paddle { namespace operators { @@ -69,9 +68,10 @@ class GenBKCLIdOp : public framework::OperatorBase { int trainer_id = Attr("trainer_id"); std::string endpoint = trainers[trainer_id]; - PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( - "trainer_id %d is less than 0. Its " - "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_GE( + trainer_id, 0, + platform::errors::InvalidArgument("trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); PADDLE_ENFORCE_LT( trainer_id, static_cast(trainers.size()), platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op.cc b/paddle/fluid/operators/collective/gen_hccl_id_op.cc index 3d78082f12fc9..e0809459be109 100644 --- a/paddle/fluid/operators/collective/gen_hccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_hccl_id_op.cc @@ -21,14 +21,13 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" - namespace paddle { namespace operators { @@ -48,9 +47,10 @@ class GenHCCLIdOp : public framework::OperatorBase { int trainer_id = Attr("trainer_id"); std::string endpoint = trainers[trainer_id]; - PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( - "trainer_id %d is less than 0. Its " - "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_GE( + trainer_id, 0, + platform::errors::InvalidArgument("trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); PADDLE_ENFORCE_LT( trainer_id, static_cast(trainers.size()), platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc index ad50ac367508b..ba573509bd18a 100644 --- a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + #include #include #include diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index 7a5b6b5f429b2..1e23f38c13ad0 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -70,9 +70,10 @@ class GenNCCLIdOp : public framework::OperatorBase { int trainer_id = Attr("trainer_id"); std::string endpoint = trainers[trainer_id]; - PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( - "trainer_id %d is less than 0. Its " - "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_GE( + trainer_id, 0, + platform::errors::InvalidArgument("trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); PADDLE_ENFORCE_LT( trainer_id, static_cast(trainers.size()), platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cc b/paddle/fluid/operators/collective/partial_allgather_op.cc index bef2ff94d6308..6783d2f0b4593 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cc @@ -26,8 +26,9 @@ class PartialAllGatherOp : public framework::OperatorWithKernel { int nranks = ctx->Attrs().Get("nranks"); int rank = ctx->Attrs().Get("rank"); - PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( - "The value of nranks should be >=2.")); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::InvalidArgument( + "The value of nranks should be >=2.")); PADDLE_ENFORCE_EQ( (rank >= 0 && rank < nranks), true, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/collective/partial_allgather_op_npu.cc b/paddle/fluid/operators/collective/partial_allgather_op_npu.cc index 0314bb7d5de1d..c727161d10179 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op_npu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op_npu.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/collective/partial_allgather_op.h" #include +#include "paddle/fluid/operators/collective/partial_allgather_op.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" diff --git a/paddle/fluid/operators/collective/partial_recv_op.cc b/paddle/fluid/operators/collective/partial_recv_op.cc index 99b2169180c77..df59f49cb3a60 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/collective/partial_recv_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/collective/partial_recv_op_npu.cc b/paddle/fluid/operators/collective/partial_recv_op_npu.cc index f14ce5f81f905..4704ab7683cf3 100644 --- a/paddle/fluid/operators/collective/partial_recv_op_npu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op_npu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/collective/partial_recv_op.h" - #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" @@ -55,8 +54,9 @@ class PartialRecvOpASCENDKernel : public framework::OpKernel { int nranks = comm->nranks(); int peer = ctx.Attr("peer"); - PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( - "The nranks must be 2, but (%d)", nranks)); + PADDLE_ENFORCE_EQ(nranks, 2, + platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); int root = peer; diff --git a/paddle/fluid/operators/collective/partial_send_op_npu.cc b/paddle/fluid/operators/collective/partial_send_op_npu.cc index 31c74fcc196be..8f53bd8fc5f6a 100644 --- a/paddle/fluid/operators/collective/partial_send_op_npu.cc +++ b/paddle/fluid/operators/collective/partial_send_op_npu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/collective/send_v2_op.h" - #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" @@ -52,8 +51,9 @@ class PartialSendOpASCENDKernel : public framework::OpKernel { int nranks = comm->nranks(); int rank = comm->rank(); - PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( - "The nranks must be 2, but (%d)", nranks)); + PADDLE_ENFORCE_EQ(nranks, 2, + platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); int root = rank; diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc index 494665544f0d3..15da47e713bb9 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/collective/recv_v2_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index f7a2e198db938..67c30438869b1 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -224,6 +224,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(recv_v2, ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + ops::RecvOpV2CUDAKernel, +#endif ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc index c31f1210f0422..9aa1ab788693d 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -61,8 +61,9 @@ class CRecvOpASCENDKernel : public framework::OpKernel { int nranks = comm->nranks(); int peer = ctx.Attr("peer"); - PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( - "The nranks must be 2, but (%d)", nranks)); + PADDLE_ENFORCE_EQ(nranks, 2, + platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); int root = peer; diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc index 6e02d36215697..0022b6bf39ddf 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -17,20 +17,19 @@ limitations under the License. */ #endif #include + #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #include "paddle/fluid/operators/collective/recv_v2_op.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 8878b7c3449b9..cfb3a11513a21 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -197,6 +197,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(send_v2, ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + ops::SendOpV2CUDAKernel, +#endif ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc index 882630467a012..ee34026cb28b2 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -60,8 +60,9 @@ class CSendOpASCENDKernel : public framework::OpKernel { int nranks = comm->nranks(); int rank = comm->rank(); - PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( - "The nranks must be 2, but (%d)", nranks)); + PADDLE_ENFORCE_EQ(nranks, 2, + platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); int root = rank; diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc index 57e3dd53cc774..9784e6ddc1537 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -17,19 +17,19 @@ limitations under the License. */ #endif #include + #include #include // NOLINT #include -#include "gtest/gtest.h" +#include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #include "paddle/fluid/operators/collective/send_v2_op.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/common_infer_shape_functions.cc b/paddle/fluid/operators/common_infer_shape_functions.cc index 1d187451c6858..8bd60c77c46cf 100644 --- a/paddle/fluid/operators/common_infer_shape_functions.cc +++ b/paddle/fluid/operators/common_infer_shape_functions.cc @@ -61,12 +61,13 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, PADDLE_ENFORCE_EQ( x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 || y_dims_array[i] <= 1, - true, platform::errors::InvalidArgument( - "Broadcast dimension mismatch. Operands could " - "not be broadcast together with the shape of X = [%s] and " - "the shape of Y = [%s]. Received [%d] in X is not equal to " - "[%d] in Y at i:%d.", - x_dims, y_dims, x_dims_array[i], y_dims_array[i], i)); + true, + platform::errors::InvalidArgument( + "Broadcast dimension mismatch. Operands could " + "not be broadcast together with the shape of X = [%s] and " + "the shape of Y = [%s]. Received [%d] in X is not equal to " + "[%d] in Y at i:%d.", + x_dims, y_dims, x_dims_array[i], y_dims_array[i], i)); if ((x_dims_array[i] > 1 || y_dims_array[i] > 1) || (x_dims_array[i] == 1 && y_dims_array[i] == 1)) { out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]); diff --git a/paddle/fluid/operators/complex_op.cc b/paddle/fluid/operators/complex_op.cc index 7241c92258eea..d358f5765f9e8 100644 --- a/paddle/fluid/operators/complex_op.cc +++ b/paddle/fluid/operators/complex_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/complex_op.h" #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" diff --git a/paddle/fluid/operators/complex_view_op.cc b/paddle/fluid/operators/complex_view_op.cc index 763f936ec9c48..92b48fe8b06c7 100644 --- a/paddle/fluid/operators/complex_view_op.cc +++ b/paddle/fluid/operators/complex_view_op.cc @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/complex_view_op.cu b/paddle/fluid/operators/complex_view_op.cu index 261881cb8d256..b62c0470dd6ba 100644 --- a/paddle/fluid/operators/complex_view_op.cu +++ b/paddle/fluid/operators/complex_view_op.cu @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/complex_view_op.h" - #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/complex_view_op.h" #include "paddle/fluid/platform/enforce.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index a467f2dbee7c9..599fbcce39ff3 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -15,11 +15,12 @@ limitations under the License. */ #include "paddle/fluid/operators/concat_op.h" #include + #include #include #include -#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index 50aca54c12dec..746e0e7a056fe 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -17,11 +17,11 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/utils.h" - #include "paddle/phi/kernels/concat_kernel.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" diff --git a/paddle/fluid/operators/concat_op_mlu.cc b/paddle/fluid/operators/concat_op_mlu.cc index 63f4ec46599ba..3d927af96e1b7 100644 --- a/paddle/fluid/operators/concat_op_mlu.cc +++ b/paddle/fluid/operators/concat_op_mlu.cc @@ -74,6 +74,65 @@ class ConcatMLUKernel : public framework::OpKernel { output_desc.get(), GetBasePtr(out)); } }; + +template +class ConcatGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); + auto outs = + ctx.MultiOutput(framework::GradVarName("X")); + auto axis = ctx.Attr("axis"); + int split_num = ins.size(); + + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + + if (ctx.HasInput("AxisTensor")) { + auto* axis_tensor = ctx.Input("AxisTensor"); + axis = GetDataFromTensor(axis_tensor)[0]; + } + + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + PADDLE_ENFORCE_GE(axis, 0, + platform::errors::InvalidArgument( + "concat_grad: axis should be larger than or " + "equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT( + axis, out_grad->dims().size(), + platform::errors::InvalidArgument( + "concat_grad: axis should be less than ins[0]->dims()!" + "But received axis is %d, while ins[0]->dims()" + "size is %d.", + axis, out_grad->dims().size())); + // get output tensor that the name is not kEmptyVarName + std::vector outputs_vec; + std::vector output_descs; + std::vector descs_vec; + for (size_t j = 0; j < outs.size(); ++j) { + if (out_var_names[j] != framework::kEmptyVarName && + outs[j]->numel() != 0UL) { + outs[j]->mutable_data(ctx.GetPlace()); + output_descs.emplace_back(MLUCnnlTensorDesc(*outs[j])); + descs_vec.push_back(output_descs.back().get()); + outputs_vec.push_back(GetBasePtr(outs[j])); + } else { + outputs_vec.push_back(nullptr); + } + } + + MLUCnnlTensorDesc out_grad_desc(*out_grad); + MLUCnnl::Split(ctx, static_cast(split_num), static_cast(axis), + out_grad_desc.get(), GetBasePtr(out_grad), descs_vec.data(), + outputs_vec.data()); + } +}; } // namespace operators } // namespace paddle @@ -84,3 +143,9 @@ REGISTER_OP_MLU_KERNEL(concat, ops::ConcatMLUKernel, ops::ConcatMLUKernel, ops::ConcatMLUKernel, ops::ConcatMLUKernel, ops::ConcatMLUKernel); +REGISTER_OP_MLU_KERNEL(concat_grad, ops::ConcatGradMLUKernel, + ops::ConcatGradMLUKernel, + ops::ConcatGradMLUKernel, + ops::ConcatGradMLUKernel, + ops::ConcatGradMLUKernel, + ops::ConcatGradMLUKernel); diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc index ba35098bbac10..fcbfc6f7a2b3c 100644 --- a/paddle/fluid/operators/concat_op_xpu.cc +++ b/paddle/fluid/operators/concat_op_xpu.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/concat_op.h" #include #include #include -#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/phi/core/lod_utils.h" namespace paddle { @@ -33,17 +33,19 @@ class ConcatXPUKernel : public framework::OpKernel { auto ins = ctx.MultiInput("X"); framework::LoDTensor* out = ctx.Output("Out"); int axis = ctx.Attr("axis"); - PADDLE_ENFORCE_NE(ins[0], nullptr, platform::errors::InvalidArgument( - "The input should not be null.")); + PADDLE_ENFORCE_NE( + ins[0], nullptr, + platform::errors::InvalidArgument("The input should not be null.")); PADDLE_ENFORCE_NE(ctx.HasInput("AxisTensor"), true, platform::errors::InvalidArgument( "XPU donot surpport AxisTensor for now")); axis = ComputeAxis(static_cast(axis), static_cast(ins[0]->dims().size())); - PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument( - "concat: axis should be larger than or " - "equal to 0, but received axis is %d.", - axis)); + PADDLE_ENFORCE_GE(axis, 0, + platform::errors::InvalidArgument( + "concat: axis should be larger than or " + "equal to 0, but received axis is %d.", + axis)); PADDLE_ENFORCE_LT(axis, ins[0]->dims().size(), platform::errors::InvalidArgument( "concat: axis should be less than ins[0]->dims()!" @@ -94,8 +96,9 @@ class ConcatXPUKernel : public framework::OpKernel { } } - PADDLE_ENFORCE_GT(xdims_list.size(), 0, platform::errors::InvalidArgument( - "No tensor need concat")); + PADDLE_ENFORCE_GT( + xdims_list.size(), 0, + platform::errors::InvalidArgument("No tensor need concat")); auto& dev_ctx = ctx.template device_context(); int r = xpu::concat(dev_ctx.x_context(), ptrs, @@ -129,8 +132,9 @@ class ConcatGradXPUKernel : public framework::OpKernel { } } } - PADDLE_ENFORCE_NE(ins[0], nullptr, platform::errors::InvalidArgument( - "The input should not be null.")); + PADDLE_ENFORCE_NE( + ins[0], nullptr, + platform::errors::InvalidArgument("The input should not be null.")); auto axis = ctx.Attr("axis"); if (ctx.HasInput("AxisTensor")) { auto* axis_tensor = ctx.Input("AxisTensor"); @@ -149,10 +153,11 @@ class ConcatGradXPUKernel : public framework::OpKernel { ptrs[j] = nullptr; } } - PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument( - "concat_grad: axis should be larger than or " - "equal to 0, but received axis is %d.", - axis)); + PADDLE_ENFORCE_GE(axis, 0, + platform::errors::InvalidArgument( + "concat_grad: axis should be larger than or " + "equal to 0, but received axis is %d.", + axis)); PADDLE_ENFORCE_LT( axis, out_grad->dims().size(), platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc index cbec1182f20b8..0c294b60482e4 100644 --- a/paddle/fluid/operators/conj_op.cc +++ b/paddle/fluid/operators/conj_op.cc @@ -74,8 +74,9 @@ REGISTER_OPERATOR(conj, ops::ConjOp, ops::ConjOpMaker, ConjInferShapeFunctor); REGISTER_OP_CPU_KERNEL( - conj, ops::ConjKernel>, + conj, + ops::ConjKernel>, ops::ConjKernel>, ops::ConjKernel, diff --git a/paddle/fluid/operators/conj_op.cu b/paddle/fluid/operators/conj_op.cu index d04024d70a8ea..548508636ca26 100644 --- a/paddle/fluid/operators/conj_op.cu +++ b/paddle/fluid/operators/conj_op.cu @@ -17,8 +17,9 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - conj, ops::ConjKernel>, + conj, + ops::ConjKernel>, ops::ConjKernel>, ops::ConjKernel, diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 0c18522fa32ea..193c5c4505641 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -1,24 +1,51 @@ include(operators) if(WITH_UNITY_BUILD) - # Load Unity Build rules for operators in paddle/fluid/operators/controlflow. - include(unity_build_rule.cmake) + # Load Unity Build rules for operators in paddle/fluid/operators/controlflow. + include(unity_build_rule.cmake) endif() register_operators(EXCLUDES conditional_block_op DEPS naive_executor) -cc_library(conditional_block_op SRCS conditional_block_op.cc DEPS executor) -cc_library(op_variant SRCS op_variant.cc DEPS operator proto_desc) -cc_library(conditional_block_op_helper SRCS conditional_block_op_helper.cc DEPS operator op_variant conditional_block_op) -cc_library(recurrent_op_helper SRCS recurrent_op_helper.cc DEPS operator op_variant recurrent_op) -cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator op_variant) +cc_library( + conditional_block_op + SRCS conditional_block_op.cc + DEPS executor) +cc_library( + op_variant + SRCS op_variant.cc + DEPS operator proto_desc) +cc_library( + conditional_block_op_helper + SRCS conditional_block_op_helper.cc + DEPS operator op_variant conditional_block_op) +cc_library( + recurrent_op_helper + SRCS recurrent_op_helper.cc + DEPS operator op_variant recurrent_op) +cc_library( + while_op_helper + SRCS while_op_helper.cc + DEPS operator op_variant) -cc_test(conditional_block_op_test SRCS conditional_block_op_test.cc DEPS conditional_block_op executor) +cc_test( + conditional_block_op_test + SRCS conditional_block_op_test.cc + DEPS conditional_block_op executor) if(WITH_UNITY_BUILD) - target_link_libraries(paddle_operators_controlflow_unity conditional_block_op) + target_link_libraries(paddle_operators_controlflow_unity conditional_block_op) else() - target_link_libraries(conditional_block_infer_op conditional_block_op) + target_link_libraries(conditional_block_infer_op conditional_block_op) endif() -file(APPEND ${pybind_file} "USE_OP_ITSELF(less_than);\nUSE_OP_ITSELF(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") -file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n") -file(APPEND ${pybind_file} "USE_OP_ITSELF(bitwise_and);\nUSE_OP_ITSELF(bitwise_or);\nUSE_OP_ITSELF(bitwise_xor);\nUSE_OP_ITSELF(bitwise_not);\n") +file( + APPEND ${pybind_file} + "USE_OP_ITSELF(less_than);\nUSE_OP_ITSELF(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n" +) +file( + APPEND ${pybind_file} + "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n" +) +file( + APPEND ${pybind_file} + "USE_OP_ITSELF(bitwise_and);\nUSE_OP_ITSELF(bitwise_or);\nUSE_OP_ITSELF(bitwise_xor);\nUSE_OP_ITSELF(bitwise_not);\n" +) diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cc b/paddle/fluid/operators/controlflow/bitwise_op.cc index 4dcbbc8568ff1..19865f9a9fb71 100644 --- a/paddle/fluid/operators/controlflow/bitwise_op.cc +++ b/paddle/fluid/operators/controlflow/bitwise_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 72d81d8c3fdf2..21fc69eb019d3 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -80,14 +80,12 @@ class CompareOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle -#define REGISTER_COMPARE_OP_VERSION(op_type) \ - REGISTER_OP_VERSION(op_type) \ - .AddCheckpoint( \ - R"ROC(Upgrade compare ops, add a new attribute [force_cpu])ROC", \ - paddle::framework::compatible::OpVersionDesc().ModifyAttr( \ - "force_cpu", \ - "In order to force fill output variable to gpu memory.", \ - false)); +#define REGISTER_COMPARE_OP_VERSION(op_type) \ + REGISTER_OP_VERSION(op_type).AddCheckpoint( \ + R"ROC(Upgrade compare ops, add a new attribute [force_cpu])ROC", \ + paddle::framework::compatible::OpVersionDesc().ModifyAttr( \ + "force_cpu", \ + "In order to force fill output variable to gpu memory.", false)); #define REGISTER_COMPARE_OP(op_type, _equation) \ struct _##op_type##Comment { \ diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index c024e4a12cd47..c1d13ffdf1295 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -68,10 +68,11 @@ class ConditionalOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ(framework::TransToProtoVarType(ips[0]->dtype()) == framework::proto::VarType::BOOL && ips[0]->numel() == 1, - true, platform::errors::InvalidArgument( - "condition input's data type should be bool, " - "numel should be 1, actual numel is %d", - ips[0]->numel())); + true, + platform::errors::InvalidArgument( + "condition input's data type should be bool, " + "numel should be 1, actual numel is %d", + ips[0]->numel())); bool res = false; if (platform::is_gpu_place(ips[0]->place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index 111ca9c63c634..369a1ffedc419 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -35,10 +35,11 @@ static void DataCopy(const framework::LoDTensor &src_item, // as params are not a subject to paddle's data_format VLOG(4) << "innerTransDataLayoutFromMKLDNN"; framework::innerTransDataLayoutFromMKLDNN( - src_item.layout(), fetch_var_name == framework::GradVarName("Filter") - ? framework::DataLayout::kNCHW - : paddle::platform::MKLDNNDeviceContext::tls() - .get_cur_paddle_data_layout(), + src_item.layout(), + fetch_var_name == framework::GradVarName("Filter") + ? framework::DataLayout::kNCHW + : paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout(), src_item, &out, platform::CPUPlace()); paddle::framework::TensorCopySync(out, platform::CPUPlace(), dst_item); } else { @@ -92,11 +93,12 @@ class FetchOp : public framework::OperatorBase { int col = Attr("col"); PADDLE_ENFORCE_GE( - col, 0, platform::errors::InvalidArgument( - "Expected the column index (the attribute 'col' of " - "operator 'Fetch') of current fetching variable to be " - "no less than 0. But received column index = %d.", - col)); + col, 0, + platform::errors::InvalidArgument( + "Expected the column index (the attribute 'col' of " + "operator 'Fetch') of current fetching variable to be " + "no less than 0. But received column index = %d.", + col)); VLOG(3) << "Fetch variable " << fetch_var_name << " to variable " << out_name << "'s " << col << " column."; diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc index caa67139a9b95..29d6eb1b2d44c 100644 --- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc @@ -42,10 +42,11 @@ static void DeepCopy(const framework::LoDTensor &src_item, // Convert to desired Paddle layout, apart from grads of filter // as params are not a subject to paddle's data_format framework::innerTransDataLayoutFromMKLDNN( - src_item.layout(), fetch_var_name == framework::GradVarName("Filter") - ? framework::DataLayout::kNCHW - : paddle::platform::MKLDNNDeviceContext::tls() - .get_cur_paddle_data_layout(), + src_item.layout(), + fetch_var_name == framework::GradVarName("Filter") + ? framework::DataLayout::kNCHW + : paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout(), src_item, &out, platform::CPUPlace()); paddle::framework::TensorCopySync(out, platform::CPUPlace(), dst_item); } else { @@ -123,11 +124,12 @@ class FetchV2Kernel { int col = ctx.Attr("col"); PADDLE_ENFORCE_GE( - col, 0, platform::errors::InvalidArgument( - "Expected the column index (the attribute 'col' of " - "operator 'Fetch') of current fetching variable to be " - "no less than 0. But received column index = %d.", - col)); + col, 0, + platform::errors::InvalidArgument( + "Expected the column index (the attribute 'col' of " + "operator 'Fetch') of current fetching variable to be " + "no less than 0. But received column index = %d.", + col)); auto *fetch_list = out_var->GetMutable(); diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index 55bd4879ab794..7f3b004004136 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -62,9 +62,10 @@ class GetPlacesOp : public framework::OperatorBase { device_count = is_gpu ? CUDADevCount() : std::thread::hardware_concurrency(); } - PADDLE_ENFORCE_NE(device_count, 0UL, platform::errors::InvalidArgument( - "Cannot indicate %s device count", - is_gpu ? "GPU" : "CPU")); + PADDLE_ENFORCE_NE( + device_count, 0UL, + platform::errors::InvalidArgument("Cannot indicate %s device count", + is_gpu ? "GPU" : "CPU")); auto out_var_name = Output("Out"); auto &places = *(GET_DATA_SAFELY(scope.FindVar(out_var_name), "Output", diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index 4d11cb5ff74e6..a9c28f48ef739 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" diff --git a/paddle/fluid/operators/controlflow/op_variant.h b/paddle/fluid/operators/controlflow/op_variant.h index cc1f36a875f77..57d44b6793966 100644 --- a/paddle/fluid/operators/controlflow/op_variant.h +++ b/paddle/fluid/operators/controlflow/op_variant.h @@ -50,8 +50,9 @@ class OpVariant { const AttrType &Attr(const std::string &name) const { auto &attrs = Attrs(); auto it = attrs.find(name); - PADDLE_ENFORCE_NE(it, attrs.end(), platform::errors::NotFound( - "Cannot find attribute %s.", name)); + PADDLE_ENFORCE_NE( + it, attrs.end(), + platform::errors::NotFound("Cannot find attribute %s.", name)); return BOOST_GET_CONST(AttrType, it->second); } diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc index 43913cae6b3c2..62cd2fc3376d5 100644 --- a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc +++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/controlflow/unity_build_rule.cmake b/paddle/fluid/operators/controlflow/unity_build_rule.cmake index 690a332d20b4c..594ae3a36cf1d 100644 --- a/paddle/fluid/operators/controlflow/unity_build_rule.cmake +++ b/paddle/fluid/operators/controlflow/unity_build_rule.cmake @@ -4,20 +4,18 @@ # Generally, the combination rules in this file do not need to be modified. # If there are some redefined error in compiling with the source file which # in combination rule, you can remove the source file from the following rules. -register_unity_group(cc - compare_all_op.cc - compare_op.cc - conditional_block_infer_op.cc - feed_op.cc - fetch_op.cc - fetch_v2_op.cc - get_places_op.cc - logical_op.cc - bitwise_op.cc - tensor_array_read_write_op.cc - while_op.cc) -register_unity_group(cu - logical_op.cu - bitwise_op.cu - compare_op.cu - compare_all_op.cu) +register_unity_group( + cc + compare_all_op.cc + compare_op.cc + conditional_block_infer_op.cc + feed_op.cc + fetch_op.cc + fetch_v2_op.cc + get_places_op.cc + logical_op.cc + bitwise_op.cc + tensor_array_read_write_op.cc + while_op.cc) +register_unity_group(cu logical_op.cu bitwise_op.cu compare_op.cu + compare_all_op.cu) diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index d8daa25f31be8..a551bad8eb10e 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -45,7 +45,7 @@ static std::string GetSkipEagerDeletionVarsDebugString( } return str; } -} // NOLINT +} // namespace class WhileOp : public framework::OperatorBase { public: @@ -375,10 +375,11 @@ class WhileGradOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( var->IsType() || var->IsType(), - true, platform::errors::InvalidArgument( - "Currently the type of var only can be LoDTensorArray, " - "or LoDTensor, but the received var[%s] is %s.", - inside_grad_name, framework::ToTypeName(var->Type()))); + true, + platform::errors::InvalidArgument( + "Currently the type of var only can be LoDTensorArray, " + "or LoDTensor, but the received var[%s] is %s.", + inside_grad_name, framework::ToTypeName(var->Type()))); if ((var_iter == outside_og_names.end()) && var->IsType()) { diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 63b273fdbb8bd..2b2001be6bfff 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/operators/controlflow/while_op_helper.h" #include + #include "paddle/fluid/string/string_helper.h" namespace paddle { diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h index 9e1a323fc9f3d..f141c9eb08766 100644 --- a/paddle/fluid/operators/conv_base_helper.h +++ b/paddle/fluid/operators/conv_base_helper.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index af67d857e0eb7..3d704c8be30e4 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index f084862b419d5..28ca2feeec53b 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -19,15 +19,13 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_version_registry.h" - #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif -#include "paddle/fluid/platform/cudnn_workspace_helper.h" - #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/phi/infermeta/binary.h" namespace paddle { @@ -864,16 +862,15 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad, ops::Conv3DDoubleGradMaker); REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad); -REGISTER_OP_VERSION(conv2d) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(conv2d).AddCheckpoint( + R"ROC( Upgrade conv2d, add a new attribute [use_addto]. )ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "use_addto", - "In order to support new feature (inplace addto strategy) for " - "gradient accumulation.", - false)); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_addto", + "In order to support new feature (inplace addto strategy) for " + "gradient accumulation.", + false)); REGISTER_OP_VERSION(depthwise_conv2d) .AddCheckpoint( @@ -886,13 +883,12 @@ REGISTER_OP_VERSION(depthwise_conv2d) "gradient accumulation.", false)); -REGISTER_OP_VERSION(conv3d) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(conv3d).AddCheckpoint( + R"ROC( Upgrade conv3d, add a new attribute [use_addto]. )ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "use_addto", - "In order to support new feature (inplace addto strategy) for " - "gradient accumulation.", - false)); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_addto", + "In order to support new feature (inplace addto strategy) for " + "gradient accumulation.", + false)); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 58f2eeee256db..644a827b48821 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index 3ace825e7b80d..15a5aa737ae7e 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -130,12 +130,12 @@ class DepthwiseConvNPUKernel : public framework::OpKernel { "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}}); runner_trans.Run(stream); - const auto& runner = - NpuOpRunner("DepthwiseConv2D", {input_tensor, transformed_filter}, - {output_tensor}, {{"strides", strides}, - {"dilations", dilations}, - {"pads", padding}, - {"data_format", data_format}}); + const auto& runner = NpuOpRunner( + "DepthwiseConv2D", {input_tensor, transformed_filter}, {output_tensor}, + {{"strides", strides}, + {"dilations", dilations}, + {"pads", padding}, + {"data_format", data_format}}); runner.Run(stream); } }; @@ -392,14 +392,15 @@ class NPUConvGradOpKernel : public framework::OpKernel { filter_grad_fp32.ShareDataWith(*filter_grad); } - const auto& runner = NpuOpRunner( - "Conv2DBackpropFilterD", {input_tensor, output_grad_tensor}, - {filter_grad_fp32}, {{"filter_size", filter_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); + const auto& runner = + NpuOpRunner("Conv2DBackpropFilterD", + {input_tensor, output_grad_tensor}, {filter_grad_fp32}, + {{"filter_size", filter_shape_vec}, + {"strides", strides_vec}, + {"pads", paddings}, + {"dilations", dilations_vec}, + {"groups", groups}, + {"data_format", data_format}}); runner.Run(stream); if (framework::TransToProtoVarType(input->dtype()) == @@ -418,12 +419,13 @@ class NPUConvGradOpKernel : public framework::OpKernel { } const auto& runner = NpuOpRunner("Conv2DBackpropInputD", {*filter, output_grad_tensor}, - {input_grad_tensor}, {{"input_size", input_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); + {input_grad_tensor}, + {{"input_size", input_shape_vec}, + {"strides", strides_vec}, + {"pads", paddings}, + {"dilations", dilations_vec}, + {"groups", groups}, + {"data_format", data_format}}); runner.Run(stream); } } @@ -452,11 +454,12 @@ class NPUConv3dKernel : public framework::OpKernel { "= [%s]", data_format)); - PADDLE_ENFORCE_EQ(groups, 1, platform::errors::Unimplemented( - "the groups must be 1 in " - "the npu kernel of conv3d, but got groups " - "= [%d]", - groups)); + PADDLE_ENFORCE_EQ(groups, 1, + platform::errors::Unimplemented( + "the groups must be 1 in " + "the npu kernel of conv3d, but got groups " + "= [%d]", + groups)); output->mutable_data(ctx.GetPlace()); @@ -537,11 +540,12 @@ class NPUConv3dGradKernel : public framework::OpKernel { "= [%s]", data_format)); - PADDLE_ENFORCE_EQ(groups, 1, platform::errors::Unimplemented( - "the groups must be 1 in " - "the npu kernel of conv3d, but got groups " - "= [%d]", - groups)); + PADDLE_ENFORCE_EQ(groups, 1, + platform::errors::Unimplemented( + "the groups must be 1 in " + "the npu kernel of conv3d, but got groups " + "= [%d]", + groups)); auto& dev_ctx = ctx.template device_context(); auto input_tensor = @@ -593,14 +597,15 @@ class NPUConv3dGradKernel : public framework::OpKernel { filter_grad_tensor.ShareDataWith(*filter_grad); filter_grad_tensor.set_layout(DataLayout::kNCDHW); - const auto& runner = NpuOpRunner( - "Conv3DBackpropFilterD", {input_tensor, output_grad_tensor}, - {filter_grad_tensor}, {{"filter_size", filter_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); + const auto& runner = + NpuOpRunner("Conv3DBackpropFilterD", + {input_tensor, output_grad_tensor}, {filter_grad_tensor}, + {{"filter_size", filter_shape_vec}, + {"strides", strides_vec}, + {"pads", paddings}, + {"dilations", dilations_vec}, + {"groups", groups}, + {"data_format", data_format}}); runner.Run(stream); } @@ -613,14 +618,15 @@ class NPUConv3dGradKernel : public framework::OpKernel { input_grad_tensor.ShareDataWith(*input_grad); input_grad_tensor.set_layout(DataLayout::kNCDHW); - const auto& runner = NpuOpRunner( - "Conv3DBackpropInputD", {filter_tensor, output_grad_tensor}, - {input_grad_tensor}, {{"input_size", input_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); + const auto& runner = + NpuOpRunner("Conv3DBackpropInputD", + {filter_tensor, output_grad_tensor}, {input_grad_tensor}, + {{"input_size", input_shape_vec}, + {"strides", strides_vec}, + {"pads", paddings}, + {"dilations", dilations_vec}, + {"groups", groups}, + {"data_format", data_format}}); runner.Run(stream); } } diff --git a/paddle/fluid/operators/conv_op_xpu.cc b/paddle/fluid/operators/conv_op_xpu.cc index cc5c20d392809..d66eefc694691 100644 --- a/paddle/fluid/operators/conv_op_xpu.cc +++ b/paddle/fluid/operators/conv_op_xpu.cc @@ -8,10 +8,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/conv_op.h" #include #include #include + +#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h" #ifdef PADDLE_WITH_XPU namespace paddle { diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc index e7af908eba2c5..e996021ed843e 100644 --- a/paddle/fluid/operators/conv_shift_op.cc +++ b/paddle/fluid/operators/conv_shift_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/conv_shift_op.h" + #include + #include "paddle/fluid/framework/eigen.h" namespace paddle { diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index fe76fc3aebbc1..8b60c67f92e5e 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc index 050ede78f72cf..c07be5a3fdbf1 100644 --- a/paddle/fluid/operators/conv_transpose_op_npu.cc +++ b/paddle/fluid/operators/conv_transpose_op_npu.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/conv_transpose_op.h" - #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/cpu/conv_util.h" @@ -90,9 +89,9 @@ class Conv2DTransposeNPUKernel : public framework::OpKernel { auto output_dim_vec = phi::vectorize(output_tensor.dims()); auto stream = ctx.template device_context().stream(); - const auto& runner = - NpuOpRunner("Conv2DTransposeD", {input_tensor, *filter}, - {output_tensor}, {{"input_size", output_dim_vec}, + const auto& runner = NpuOpRunner("Conv2DTransposeD", + {input_tensor, *filter}, {output_tensor}, + {{"input_size", output_dim_vec}, {"strides", strides}, {"dilations", dilations}, {"output_padding", output_padding}, @@ -167,14 +166,15 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); if (filter_grad) { filter_grad->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "Conv2DBackpropFilterD", {output_grad_tensor, input_tensor}, - {*filter_grad}, {{"filter_size", phi::vectorize(filter_dims)}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); + const auto& runner = + NpuOpRunner("Conv2DBackpropFilterD", + {output_grad_tensor, input_tensor}, {*filter_grad}, + {{"filter_size", phi::vectorize(filter_dims)}, + {"strides", strides_vec}, + {"pads", paddings}, + {"dilations", dilations_vec}, + {"groups", groups}, + {"data_format", data_format}}); runner.Run(stream); } if (input_grad) { @@ -184,13 +184,13 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel { if (channel_last) { input_grad_tensor.set_layout(DataLayout::kNHWC); } - const auto& runner = - NpuOpRunner("Conv2D", {output_grad_tensor, *filter}, - {input_grad_tensor}, {{"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); + const auto& runner = NpuOpRunner("Conv2D", {output_grad_tensor, *filter}, + {input_grad_tensor}, + {{"strides", strides_vec}, + {"pads", paddings}, + {"dilations", dilations_vec}, + {"groups", groups}, + {"data_format", data_format}}); runner.Run(stream); } } diff --git a/paddle/fluid/operators/conv_transpose_op_xpu.cc b/paddle/fluid/operators/conv_transpose_op_xpu.cc index b8bd3c4f00608..ae25c57784f02 100644 --- a/paddle/fluid/operators/conv_transpose_op_xpu.cc +++ b/paddle/fluid/operators/conv_transpose_op_xpu.cc @@ -9,12 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/conv_transpose_op.h" - #include #include #include + #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/phi/kernels/cpu/conv_util.h" diff --git a/paddle/fluid/operators/correlation_op.cc b/paddle/fluid/operators/correlation_op.cc index 62e0f311d15d0..21258958549ae 100644 --- a/paddle/fluid/operators/correlation_op.cc +++ b/paddle/fluid/operators/correlation_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu index f488cc12e642b..f9dd9ab98a308 100644 --- a/paddle/fluid/operators/correlation_op.cu +++ b/paddle/fluid/operators/correlation_op.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #ifdef __HIPCC__ @@ -227,11 +228,11 @@ class CorrelationCUDAKernel : public framework::OpKernel { dim3 threadsPerBlock(THREADS_PER_BLOCK); dim3 totalBlocksCorr(N, OH, OW); - correlation_forward< - T><<>>( - output->data(), OC, OH, OW, rinput1.data(), C, H, W, - rinput2.data(), pad_size, kernel_size, max_displacement, stride1, - stride2); + correlation_forward + <<>>( + output->data(), OC, OH, OW, rinput1.data(), C, H, W, + rinput2.data(), pad_size, kernel_size, max_displacement, stride1, + stride2); } }; @@ -472,19 +473,19 @@ class CorrelationCUDAGradKernel : public framework::OpKernel { dim3 totalBlocksCorr(H, W, C); for (int n = 0; n < N; n++) { - correlation_backward_input1< - T><<>>( - n, grad_input1->data(), C, H, W, grad_output->data(), GOC, GOH, - GOW, rinput2.data(), pad_size, kernel_size, max_displacement, - stride1, stride2); + correlation_backward_input1 + <<>>( + n, grad_input1->data(), C, H, W, grad_output->data(), GOC, + GOH, GOW, rinput2.data(), pad_size, kernel_size, + max_displacement, stride1, stride2); } for (int n = 0; n < N; n++) { - correlation_backward_input2< - T><<>>( - n, grad_input2->data(), C, H, W, grad_output->data(), GOC, GOH, - GOW, rinput1.data(), pad_size, kernel_size, max_displacement, - stride1, stride2); + correlation_backward_input2 + <<>>( + n, grad_input2->data(), C, H, W, grad_output->data(), GOC, + GOH, GOW, rinput1.data(), pad_size, kernel_size, + max_displacement, stride1, stride2); } } }; diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc index d41ceafba1a1b..4c0c5596e5d1b 100644 --- a/paddle/fluid/operators/cos_sim_op.cc +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cos_sim_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index 6d3e6e34c3b8e..fa080b7a4b466 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -158,11 +158,12 @@ class CRFDecodingOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( (label_dims.size() == 2UL && label_dims[1] == 1) || label_dims.size() == 1UL, - true, platform::errors::InvalidArgument( - "The Input(Label) should be a 2-D tensor with last " - "dimension fixed to 1 or a 1-D tensor. But received: " - "input rank %u, input shape [%s].", - label_dims.size(), label_dims)); + true, + platform::errors::InvalidArgument( + "The Input(Label) should be a 2-D tensor with last " + "dimension fixed to 1 or a 1-D tensor. But received: " + "input rank %u, input shape [%s].", + label_dims.size(), label_dims)); } if (ctx->IsRuntime() || (emission_dims[0] > 0 && label_dims[0] > 0)) { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 6b11ff69c3056..8b40abf3debe4 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/jit/kernels.h" @@ -22,8 +23,8 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::LoDTensor; using framework::LoD; +using framework::LoDTensor; using framework::Tensor; template diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 9de5bc6ea3636..2e0a054fa122b 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/crop_op.h" + #include #include #include diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 5ac28fafb09b9..49e1d6ab5842a 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/eigen/eigen_function.h" @@ -171,17 +172,19 @@ class CropGradKernel : public framework::OpKernel { size_t rank = context.Input(framework::GradVarName("Out"))->dims().size(); PADDLE_ENFORCE_GE( - rank, 1, platform::errors::InvalidArgument( - "The number of dimensions of the input 'Out@GRAD' for " - "CropGrad must be greater than or equal " - "to 1, but the value received is %d.", - rank)); + rank, 1, + platform::errors::InvalidArgument( + "The number of dimensions of the input 'Out@GRAD' for " + "CropGrad must be greater than or equal " + "to 1, but the value received is %d.", + rank)); PADDLE_ENFORCE_LE( - rank, 6, platform::errors::InvalidArgument( - "The number of dimensions of the input 'Out@GRAD' for " - "CropGrad must be less than or equal " - "to 6, but the value received is %d.", - rank)); + rank, 6, + platform::errors::InvalidArgument( + "The number of dimensions of the input 'Out@GRAD' for " + "CropGrad must be less than or equal " + "to 6, but the value received is %d.", + rank)); switch (rank) { case 1: CropGradFunction(context); diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc index 0e53bbb5d189f..a9a94e2c948b9 100644 --- a/paddle/fluid/operators/crop_tensor_op.cc +++ b/paddle/fluid/operators/crop_tensor_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/crop_tensor_op.h" + #include #include #include diff --git a/paddle/fluid/operators/crop_tensor_op.h b/paddle/fluid/operators/crop_tensor_op.h index 409458037a204..851d007896d7e 100644 --- a/paddle/fluid/operators/crop_tensor_op.h +++ b/paddle/fluid/operators/crop_tensor_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/eigen/eigen_function.h" @@ -72,11 +73,12 @@ static framework::DDim ValidateShape(const std::vector shape, "The value (%d) of the %uth element for shape of " "Op(crop_tensor) should not be zero.", shape[i], i)); - PADDLE_ENFORCE_EQ(shape[i], -1, platform::errors::InvalidArgument( - "When the value (%d) of the %uth " - "element for shape of Op(crop_tensor)" - " is negative, only -1 is supported.", - shape[i], i)); + PADDLE_ENFORCE_EQ(shape[i], -1, + platform::errors::InvalidArgument( + "When the value (%d) of the %uth " + "element for shape of Op(crop_tensor)" + " is negative, only -1 is supported.", + shape[i], i)); output_shape[i] = in_dims[i] - offsets[i]; } else { output_shape[i] = static_cast(shape[i]); @@ -226,11 +228,12 @@ class CropTensorKernel : public framework::OpKernel { "value received is %d.", rank)); PADDLE_ENFORCE_LE( - rank, 6, platform::errors::InvalidArgument( - "The number of dimensions of the input 'x' for " - "Op(crop_tensor) must be less than or equal to 6, but the " - "value received is %d.", - rank)); + rank, 6, + platform::errors::InvalidArgument( + "The number of dimensions of the input 'x' for " + "Op(crop_tensor) must be less than or equal to 6, but the " + "value received is %d.", + rank)); switch (rank) { case 1: CropTensorFunction(context); diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 4f5912c81baef..a880584f4cfe7 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cross_entropy_op.h" + #include #include #include diff --git a/paddle/fluid/operators/cross_op.cc b/paddle/fluid/operators/cross_op.cc index 674b75625d198..977d84e1e47c8 100644 --- a/paddle/fluid/operators/cross_op.cc +++ b/paddle/fluid/operators/cross_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" @@ -21,8 +22,8 @@ namespace paddle { namespace operators { -using framework::Tensor; using framework::DDim; +using framework::Tensor; const int kDefaultDim = framework::DDim::kMaxRank; class CrossOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index ba90c677570c5..10ec5a6bdd140 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -15,7 +15,9 @@ limitations under the License. */ #include #include #include + #include + #include "paddle/fluid/operators/ctc_align_op.h" namespace paddle { @@ -92,10 +94,10 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { auto* output_length = ctx.Output("OutputLength"); T* output_length_data = output_length->mutable_data({input_dims[0], 1}, ctx.GetPlace()); - PaddingMergeAndDelCudaKernel< - T><<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>( - input_dims[1], tokens, input_length_data, blank, merge_repeated, - padding_value, input_dims[0], output_data, output_length_data); + PaddingMergeAndDelCudaKernel + <<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>( + input_dims[1], tokens, input_length_data, blank, merge_repeated, + padding_value, input_dims[0], output_data, output_length_data); } else { const size_t level = 0; auto input_lod = framework::ToAbsOffset(input->lod()); diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h index c561974b0c976..9e189a9fb6356 100644 --- a/paddle/fluid/operators/ctc_align_op.h +++ b/paddle/fluid/operators/ctc_align_op.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/cuda_graph_with_in_out.h b/paddle/fluid/operators/cuda_graph_with_in_out.h new file mode 100644 index 0000000000000..e7a943aee4d36 --- /dev/null +++ b/paddle/fluid/operators/cuda_graph_with_in_out.h @@ -0,0 +1,156 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#endif + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_CUDA +class CUDAGraphWithInOuts { + public: + template + CUDAGraphWithInOuts(Callable &&callable, platform::CUDAPlace place, + const std::vector &in_ptrs, + cudaStreamCaptureMode mode, int64_t pool_id) { + in_indices_.resize(in_ptrs.size()); + ins_.reserve(in_ptrs.size()); + int64_t valid_in_idx = 0; + for (size_t i = 0; i < in_ptrs.size(); ++i) { + if (in_ptrs[i] == nullptr) { + in_indices_[i] = -1; + } else { + in_indices_[i] = (valid_in_idx++); + ins_.push_back(*in_ptrs[i]); + } + } + + platform::BeginCUDAGraphCapture(place, mode, pool_id); + auto out_ptrs = callable(in_ptrs); + graph_ = platform::EndCUDAGraphCapture(); + graph_->Replay(); + + out_indices_.resize(out_ptrs.size()); + outs_.reserve(out_ptrs.size()); + int64_t valid_out_idx = 0; + for (size_t i = 0; i < out_ptrs.size(); ++i) { + if (out_ptrs[i] == nullptr) { + out_indices_[i] = -1; + } else { + out_indices_[i] = (valid_out_idx++); + outs_.push_back(*out_ptrs[i]); + } + } + } + + void Run(const std::vector &ins) { + PADDLE_ENFORCE_EQ( + ins.size(), in_indices_.size(), + phi::errors::InvalidArgument("The input number does not match.")); + for (size_t i = 0; i < in_indices_.size(); ++i) { + if (in_indices_[i] >= 0) { + auto *dst = &ins_[in_indices_[i]]; + framework::TensorCopy(*ins[i], dst->place(), dst); + } + } + graph_->Replay(); + } + + std::vector GetOutputs() { + std::vector outs(out_indices_.size()); + for (size_t i = 0; i < out_indices_.size(); ++i) { + if (out_indices_[i] >= 0) { + outs[i] = &outs_[out_indices_[i]]; + } + } + return outs; + } + + int64_t PoolID() const { return graph_->PoolID(); } + + private: + std::unique_ptr graph_; + std::vector ins_; + std::vector outs_; + std::vector in_indices_; + std::vector out_indices_; +}; + +template +static std::unique_ptr CaptureCUDAGraph( + Callable &&callable, const framework::ExecutionContext &ctx, + const std::vector &input_names, + const std::vector &output_names, cudaStreamCaptureMode mode, + int64_t pool_id) { + std::vector inputs; + for (const auto &name : input_names) { + auto input_tensors = ctx.MultiInput(name); + inputs.insert(inputs.end(), input_tensors.begin(), input_tensors.end()); + } + + auto func = [&](const std::vector &inputs) { + callable(ctx); + std::vector outputs; + for (const auto &name : output_names) { + auto output_tensors = ctx.MultiOutput(name); + outputs.insert(outputs.end(), output_tensors.begin(), + output_tensors.end()); + } + return outputs; + }; + + return std::make_unique(func, ctx.GetPlace(), inputs, + mode, pool_id); +} + +static void ExecuteCUDAGraph(const framework::ExecutionContext &ctx, + const std::vector &input_names, + const std::vector &output_names, + CUDAGraphWithInOuts *graph) { + std::vector inputs; + for (const auto &name : input_names) { + auto input_tensors = ctx.MultiInput(name); + inputs.insert(inputs.end(), input_tensors.begin(), input_tensors.end()); + } + + graph->Run(inputs); + auto outputs = graph->GetOutputs(); + + size_t idx = 0; + for (const auto &name : output_names) { + auto output_tensors = ctx.MultiOutput(name); + for (auto *out_t : output_tensors) { + if (outputs[idx] != nullptr) { + *out_t = *outputs[idx]; + } else { + PADDLE_ENFORCE_EQ( + out_t, nullptr, + phi::errors::InvalidArgument( + "The %d-th output variable should be nullptr.", idx)); + } + ++idx; + } + } +} +#else +class CUDAGraphWithInOuts {}; +#endif + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h index 5451cf815cae3..da8284b4f2e43 100644 --- a/paddle/fluid/operators/cudnn_lstm_cache.h +++ b/paddle/fluid/operators/cudnn_lstm_cache.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/dynload/cudnn.h" diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc index ccb0062fcc723..9ff4f796995c0 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h index 6c059257b94e8..e2159a09c120c 100644 --- a/paddle/fluid/operators/cudnn_rnn_cache.h +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 11633fb0b8703..dbb703e7e874d 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -86,13 +86,12 @@ REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker, CumsumInferShapeFunctor); -REGISTER_OP_VERSION(cumsum) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(cumsum).AddCheckpoint( + R"ROC( Upgrade cumsum add a new attribute [flatten]. )ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "flatten", - "In order to compute the cumsum over the flattened array when the " - "argument `axis` in python API is None.", - false)); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "flatten", + "In order to compute the cumsum over the flattened array when the " + "argument `axis` in python API is None.", + false)); diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc index e909906da7baa..912167cec5af7 100644 --- a/paddle/fluid/operators/cvm_op.cc +++ b/paddle/fluid/operators/cvm_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cvm_op.h" + #include + #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index 137de2d5af985..8287654949e70 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/data_norm_op.h" + #include #include + #include "paddle/fluid/framework/data_layout.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -163,10 +165,11 @@ class DataNormOp : public framework::OperatorWithKernel { OperatorWithKernel::IndicateVarDataType(ctx, "BatchSum"), platform::errors::InvalidArgument( "BatchSum input should be of float type")); - PADDLE_ENFORCE_EQ(dn_param_type, OperatorWithKernel::IndicateVarDataType( - ctx, "BatchSquareSum"), - platform::errors::InvalidArgument( - "BatchSquareSum input should be of float type")); + PADDLE_ENFORCE_EQ( + dn_param_type, + OperatorWithKernel::IndicateVarDataType(ctx, "BatchSquareSum"), + platform::errors::InvalidArgument( + "BatchSquareSum input should be of float type")); bool enable_scale_and_shift = ctx.Attr("enable_scale_and_shift"); if (enable_scale_and_shift) { @@ -277,8 +280,9 @@ class DataNormKernel const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::InvalidArgument( - "The Input dim size should be 2")); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2, + platform::errors::InvalidArgument("The Input dim size should be 2")); const int N = x_dims[0]; const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] @@ -515,8 +519,9 @@ class DataNormGradKernel // Get the size for each dimension. // NCHW [batch_size, in_channels, in_height, in_width] const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::InvalidArgument( - "The Input dim size should be 2")); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2, + platform::errors::InvalidArgument("The Input dim size should be 2")); const int N = x_dims[0]; const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] @@ -757,10 +762,9 @@ REGISTER_OP_CPU_KERNEL( data_norm_grad, ops::DataNormGradKernel, ops::DataNormGradKernel); -REGISTER_OP_VERSION(data_norm) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(data_norm).AddCheckpoint( + R"ROC( upgrad data_norm op by adding scale_w to support scale and shift.)ROC", - paddle::framework::compatible::OpVersionDesc().NewInput( - "scale_w", - "scale_w is used to do scale duirng data_norm like batchnorm ")); + paddle::framework::compatible::OpVersionDesc().NewInput( + "scale_w", + "scale_w is used to do scale duirng data_norm like batchnorm ")); diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index 28a7922120139..21c7d7d4bf496 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/data_norm_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -100,8 +101,9 @@ class DataNormKernel const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); // Align with CPU version, but should we add this restriction? - PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::PreconditionNotMet( - "The Input dim size should be 2")); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2, + platform::errors::PreconditionNotMet("The Input dim size should be 2")); const int N = x_dims[0]; const int C = x_dims[1]; const T *batch_size_in = ctx.Input("BatchSize")->data(); @@ -143,8 +145,9 @@ class DataNormGradKernel const auto &x_dims = x->dims(); // Align with CPU version, but should we add this restriction? - PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::PreconditionNotMet( - "The Input dim size should be 2")); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2, + platform::errors::PreconditionNotMet("The Input dim size should be 2")); const int N = x_dims[0]; const int C = x_dims[1]; diff --git a/paddle/fluid/operators/decode_jpeg_op.cu b/paddle/fluid/operators/decode_jpeg_op.cu index de6b35bc9cd0a..a257afc50f955 100644 --- a/paddle/fluid/operators/decode_jpeg_op.cu +++ b/paddle/fluid/operators/decode_jpeg_op.cu @@ -15,6 +15,7 @@ #if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP) #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/dynload/nvjpeg.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/deformable_conv_op.cc b/paddle/fluid/operators/deformable_conv_op.cc index 1b76aca1e660e..b54c8a81abd64 100644 --- a/paddle/fluid/operators/deformable_conv_op.cc +++ b/paddle/fluid/operators/deformable_conv_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/deformable_conv_op_xpu.cc b/paddle/fluid/operators/deformable_conv_op_xpu.cc index 240e5658956dd..d977cfe844a6a 100644 --- a/paddle/fluid/operators/deformable_conv_op_xpu.cc +++ b/paddle/fluid/operators/deformable_conv_op_xpu.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" @@ -169,28 +170,32 @@ class DeformableConvGradXPUKernel : public framework::OpKernel { const float* offset_ptr = offset.data(); const float* mask_ptr = mask.data(); if (dx_data == nullptr) { - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&dx_data), - input->numel() * sizeof(T)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&dx_data), + input->numel() * sizeof(T)), + XPU_SUCCESS, + platform::errors::ResourceExhausted("XPU has no enough memory")); } if (dw_data == nullptr) { - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&dw_data), - filter.numel() * sizeof(T)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&dw_data), + filter.numel() * sizeof(T)), + XPU_SUCCESS, + platform::errors::ResourceExhausted("XPU has no enough memory")); } if (doffset_data == nullptr) { - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&doffset_data), - offset.numel() * sizeof(T)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&doffset_data), + offset.numel() * sizeof(T)), + XPU_SUCCESS, + platform::errors::ResourceExhausted("XPU has no enough memory")); } if (dmask_data == nullptr) { - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&dmask_data), - mask.numel() * sizeof(T)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&dmask_data), + mask.numel() * sizeof(T)), + XPU_SUCCESS, + platform::errors::ResourceExhausted("XPU has no enough memory")); } int input_dim = input->numel() / input->dims()[0]; @@ -207,10 +212,11 @@ class DeformableConvGradXPUKernel : public framework::OpKernel { int f = filter.dims()[0]; T* filter_grad_tmp = nullptr; - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&filter_grad_tmp), - filter_grad->numel() * sizeof(T)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&filter_grad_tmp), + filter_grad->numel() * sizeof(T)), + XPU_SUCCESS, + platform::errors::ResourceExhausted("XPU has no enough memory")); // set zeros for d_table_data const int zero = 0; diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cc b/paddle/fluid/operators/deformable_conv_v1_op.cc index 0ec95cb54bae8..2da561c868516 100644 --- a/paddle/fluid/operators/deformable_conv_v1_op.cc +++ b/paddle/fluid/operators/deformable_conv_v1_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cc b/paddle/fluid/operators/deformable_psroi_pooling_op.cc index 7e7cdbd8d178c..a989e3f9217c0 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cc +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/operators/deformable_psroi_pooling_op.h" + #include #include #include + #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { @@ -165,11 +167,12 @@ class DeformablePSROIPoolOp : public framework::OperatorWithKernel { auto part_width = part_size[1]; auto sample_per_part = ctx->Attrs().Get("sample_per_part"); auto trans_std = ctx->Attrs().Get("trans_std"); - PADDLE_ENFORCE_GE(trans_std, 0., platform::errors::InvalidArgument( - "Input(trans_std) should not be lower " - "than 0.0, but received trans_std " - "is:%f", - trans_std)); + PADDLE_ENFORCE_GE(trans_std, 0., + platform::errors::InvalidArgument( + "Input(trans_std) should not be lower " + "than 0.0, but received trans_std " + "is:%f", + trans_std)); PADDLE_ENFORCE_GE( input_dims[1], output_channels, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 873950b2d2f65..174f045c1605c 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -23,10 +23,12 @@ #pragma once #include + #include #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/deformable_psroi_pooling_op.h" diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h index 3deabce54ed0b..6ff6ab20df2fb 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.h +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h @@ -25,6 +25,7 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/dequantize_op.cc b/paddle/fluid/operators/dequantize_op.cc index 876bd1199ad3b..2bed296efd77a 100644 --- a/paddle/fluid/operators/dequantize_op.cc +++ b/paddle/fluid/operators/dequantize_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/dequantize_op.h" + #include "paddle/fluid/framework/op_version_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -47,8 +48,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(dequantize, ops::DeQuantOp, ops::DeQuantOpMaker); REGISTER_OP_VERSION(dequantize) - .AddCheckpoint( - R"ROC( Add a new attribute [Shift])ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "Shift", "Dequantize data to uint8 if provided non-zero value.", - 0.0f)); + .AddCheckpoint(R"ROC( Add a new attribute [Shift])ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "Shift", + "Dequantize data to uint8 if provided non-zero value.", + 0.0f)); diff --git a/paddle/fluid/operators/dequantize_op.h b/paddle/fluid/operators/dequantize_op.h index 75c27a06c210f..ea7a08c8f3684 100644 --- a/paddle/fluid/operators/dequantize_op.h +++ b/paddle/fluid/operators/dequantize_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/dequeue_op.cc b/paddle/fluid/operators/dequeue_op.cc index fb5d53dacf0ed..1a6286b0a3289 100644 --- a/paddle/fluid/operators/dequeue_op.cc +++ b/paddle/fluid/operators/dequeue_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index f10c801919999..6e5ea3e8aa721 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -1,96 +1,129 @@ set(LOCAL_DETECTION_LIBS) function(detection_library TARGET_NAME) - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - set(options "") - set(common_deps op_registry) - set(pybind_flag 0) - cmake_parse_arguments(detection_library "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - set(srcs) - # filter cuda source file when not build with cuda/rocm - foreach(src ${detection_library_SRCS}) - if (NOT WITH_GPU AND NOT WITH_ROCM) - if(${src} MATCHES ".*\\.cc$") - list(APPEND srcs ${src}) - endif() - else() + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + set(options "") + set(common_deps op_registry) + set(pybind_flag 0) + cmake_parse_arguments(detection_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + set(srcs) + # filter cuda source file when not build with cuda/rocm + foreach(src ${detection_library_SRCS}) + if(NOT WITH_GPU AND NOT WITH_ROCM) + if(${src} MATCHES ".*\\.cc$") list(APPEND srcs ${src}) endif() - endforeach() - - op_library(${TARGET_NAME} SRCS ${srcs} DEPS ${common_deps} ${detection_library_DEPS}) + else() + list(APPEND srcs ${src}) + endif() + endforeach() + + op_library(${TARGET_NAME} SRCS ${srcs} DEPS ${common_deps} + ${detection_library_DEPS}) - set(LOCAL_DETECTION_LIBS - ${TARGET_NAME} - ${LOCAL_DETECTION_LIBS} - PARENT_SCOPE) + set(LOCAL_DETECTION_LIBS + ${TARGET_NAME} ${LOCAL_DETECTION_LIBS} + PARENT_SCOPE) endfunction() -if (WITH_ASCEND_CL) - detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu box_coder_op_npu.cc) - detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu density_prior_box_op_npu.cc) +if(WITH_ASCEND_CL) + detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu + box_coder_op_npu.cc) + detection_library(density_prior_box_op SRCS density_prior_box_op.cc + density_prior_box_op.cu density_prior_box_op_npu.cc) else() - detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) - detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) + detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) + detection_library(density_prior_box_op SRCS density_prior_box_op.cc + density_prior_box_op.cu) endif() if(WITH_XPU) - detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc) + detection_library(iou_similarity_op SRCS iou_similarity_op.cc + iou_similarity_op_xpu.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_xpu.cc) + detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc + generate_proposals_v2_op_xpu.cc) elseif(WITH_ASCEND_CL) - detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc) - detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu prior_box_op_npu.cc) + detection_library(iou_similarity_op SRCS iou_similarity_op.cc + iou_similarity_op_npu.cc) + detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu + prior_box_op_npu.cc) else() - detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) + detection_library(iou_similarity_op SRCS iou_similarity_op.cc + iou_similarity_op.cu) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) + # detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc) endif() detection_library(bipartite_match_op SRCS bipartite_match_op.cc) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(anchor_generator_op SRCS anchor_generator_op.cc -anchor_generator_op.cu) -detection_library(target_assign_op SRCS target_assign_op.cc -target_assign_op.cu) + anchor_generator_op.cu) +detection_library(target_assign_op SRCS target_assign_op.cc target_assign_op.cu) detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc -polygon_box_transform_op.cu) + polygon_box_transform_op.cu) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) -detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) +detection_library(generate_proposal_labels_op SRCS + generate_proposal_labels_op.cc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc DEPS gpc) detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc) detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) detection_library(yolo_box_op SRCS yolo_box_op.cc) -detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) -detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu) -detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) +detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc + box_decoder_and_assign_op.cu) +detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc + sigmoid_focal_loss_op.cu) +detection_library(retinanet_detection_output_op SRCS + retinanet_detection_output_op.cc) detection_library(nms_op SRCS nms_op.cc nms_op.cu) if(WITH_GPU OR WITH_ROCM) set(TMPDEPS memory) if(WITH_GPU) - if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) - set(TMPDEPS memory cub) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + set(TMPDEPS memory cub) endif() endif() - detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS}) - detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc generate_proposals_v2_op.cu DEPS ${TMPDEPS}) - detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS ${TMPDEPS}) - detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS ${TMPDEPS}) + detection_library(generate_proposals_op SRCS generate_proposals_op.cc + generate_proposals_op.cu DEPS ${TMPDEPS}) + detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc + generate_proposals_v2_op.cu DEPS ${TMPDEPS}) + detection_library( + distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc + distribute_fpn_proposals_op.cu DEPS ${TMPDEPS}) + detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc + collect_fpn_proposals_op.cu DEPS ${TMPDEPS}) else() detection_library(generate_proposals_op SRCS generate_proposals_op.cc) - detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc) - detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc) + if(NOT WITH_XPU) + detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc) + endif() + detection_library(distribute_fpn_proposals_op SRCS + distribute_fpn_proposals_op.cc) detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc) endif() -detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) +detection_library( + roi_perspective_transform_op SRCS roi_perspective_transform_op.cc + roi_perspective_transform_op.cu) #Export local libraries to parent # set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) -cc_library(mask_util SRCS mask_util.cc DEPS memory) -cc_test(mask_util_test SRCS mask_util_test.cc DEPS memory mask_util) -cc_library(gpc SRCS gpc.cc DEPS op_registry) -detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS mask_util) +cc_library( + mask_util + SRCS mask_util.cc + DEPS memory) +cc_test( + mask_util_test + SRCS mask_util_test.cc + DEPS memory mask_util) +cc_library( + gpc + SRCS gpc.cc + DEPS op_registry) +detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS + mask_util) diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h index 0bcb56d7aa8d5..b3d490ac0b512 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.h +++ b/paddle/fluid/operators/detection/anchor_generator_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/transform.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index 7bbbbe7f40ecc..b9b9b0b0c0dbf 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -122,8 +123,9 @@ void BboxOverlaps(const framework::Tensor& r_boxes, inter_h = std::max(y_max - y_min + 1, zero); inter_area = inter_w * inter_h; overlaps_et(i, j) = - (inter_area == 0.) ? 0 : inter_area / - (r_box_area + c_box_area - inter_area); + (inter_area == 0.) + ? 0 + : inter_area / (r_box_area + c_box_area - inter_area); } } } diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc index 73f0607fdde7f..08d688a149543 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cc +++ b/paddle/fluid/operators/detection/box_clip_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/box_clip_op.h" + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index 65f2a5590716d..672b9a5db95d2 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/box_clip_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h index 13ba7894d6009..4bcc81dbf9865 100644 --- a/paddle/fluid/operators/detection/box_clip_op.h +++ b/paddle/fluid/operators/detection/box_clip_op.h @@ -11,6 +11,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 69d829e0021f3..461dcb7f39ab5 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/box_coder_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index 22dc606df9df5..b7dee412ee319 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -11,6 +11,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/box_coder_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index a626f790fac90..6ddfd71765390 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -12,6 +12,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h index d3565f87f33bb..7eed920fb3d55 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h @@ -13,6 +13,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc index 92c9ab34aa454..b1b8c3ba2da84 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License.*/ #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index 860fdd01794cc..bea6fb1748858 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -18,6 +18,7 @@ namespace cub = hipcub; #endif #include + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h index e5ae9a6ccbda5..973cbc6ec1658 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h @@ -20,6 +20,7 @@ limitations under the License.*/ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h index adc2723acbf70..0912ce9016031 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.h +++ b/paddle/fluid/operators/detection/density_prior_box_op.h @@ -12,6 +12,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/operators/detection/prior_box_op.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc index 4e514e62f4081..e382586ec666c 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 7ad25e003b491..5adf1469ec2f9 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -21,6 +21,7 @@ namespace cub = hipcub; #endif #include + #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index 5479e08c2a5ef..85db2437ee550 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index c9cc4e722071c..da86502f78c35 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -10,9 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index cbf17048400bf..bc528060355f0 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -10,9 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index d6130823271f0..a6d2d8a2a0172 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 5fb7973fd89e4..20efb1fa6ca92 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -14,8 +14,10 @@ limitations under the License. */ #include #include + #include #include + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 1f1802574c5b8..b8b6118058fa2 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index 005309e8ee577..deb7f3a41df1f 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -14,8 +14,10 @@ limitations under the License. */ #include #include + #include #include + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc new file mode 100644 index 0000000000000..28c94668ba7c5 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc @@ -0,0 +1,370 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include +#include +#include +#include +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +namespace { +template +static void SortDescending(const platform::XPUDeviceContext &dev_ctx, + const Tensor &value, Tensor *index_out, + int pre_nms_top_n) { + auto *value_data = value.data(); + auto place = dev_ctx.GetPlace(); + auto cpu_place = platform::CPUPlace(); + + Tensor scores_slice_cpu; + scores_slice_cpu.Resize({value.numel()}); + auto *scores_slice_cpu_data = scores_slice_cpu.mutable_data(cpu_place); + + memory::Copy(cpu_place, scores_slice_cpu_data, place, value_data, + sizeof(T) * value.numel()); + + // Sort index + Tensor index_t; + int *index = index_t.mutable_data({value.numel()}, cpu_place); + for (int i = 0; i < value.numel(); ++i) { + index[i] = i; + } + auto compare = [scores_slice_cpu_data](const int64_t &i, const int64_t &j) { + return scores_slice_cpu_data[i] > scores_slice_cpu_data[j]; + }; + + if (pre_nms_top_n <= 0 || pre_nms_top_n >= value.numel()) { + std::sort(index, index + value.numel(), compare); + } else { + std::nth_element(index, index + pre_nms_top_n, index + value.numel(), + compare); + std::sort(index, index + pre_nms_top_n, compare); + index_t.Resize({pre_nms_top_n}); + } + + int *idx_out = + index_out->mutable_data({index_t.numel()}, dev_ctx.GetPlace()); + memory::Copy(place, idx_out, cpu_place, index, sizeof(T) * index_t.numel()); +} + +template +static std::pair ProposalForOneImage( + const platform::XPUDeviceContext &dev_ctx, const Tensor &im_shape, + const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas, // [M, 4] + const Tensor &scores, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta, bool pixel_offset) { + // 1. pre nms + Tensor index_sort; + SortDescending(dev_ctx, scores, &index_sort, pre_nms_top_n); + + Tensor scores_sel, bbox_sel, anchor_sel, var_sel; + scores_sel.mutable_data({index_sort.numel(), 1}, dev_ctx.GetPlace()); + bbox_sel.mutable_data({index_sort.numel(), 4}, dev_ctx.GetPlace()); + anchor_sel.mutable_data({index_sort.numel(), 4}, dev_ctx.GetPlace()); + var_sel.mutable_data({index_sort.numel(), 4}, dev_ctx.GetPlace()); + + int r = xpu::gather(dev_ctx.x_context(), scores.data(), + index_sort.data(), scores_sel.data(), + {static_cast(scores.numel()), 1}, + index_sort.numel(), 0); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(gather) return " + "wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + + r = xpu::gather(dev_ctx.x_context(), bbox_deltas.data(), + index_sort.data(), bbox_sel.data(), + {static_cast(bbox_deltas.numel()) / 4, 4}, + index_sort.numel(), 0); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(gather) return " + "wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + + r = xpu::gather(dev_ctx.x_context(), anchors.data(), + index_sort.data(), anchor_sel.data(), + {static_cast(anchors.numel()) / 4, 4}, + index_sort.numel(), 0); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(gather) return " + "wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + + r = xpu::gather(dev_ctx.x_context(), variances.data(), + index_sort.data(), var_sel.data(), + {static_cast(variances.numel()) / 4, 4}, + index_sort.numel(), 0); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(gather) return " + "wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + + int num = scores.numel(); + int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() + : pre_nms_top_n; + scores_sel.Resize({pre_nms_num, 1}); + index_sort.Resize({pre_nms_num, 1}); + + // 2. box decode and clipping + Tensor proposals; + proposals.mutable_data({pre_nms_num, 4}, dev_ctx.GetPlace()); + + r = xpu::box_decoder(dev_ctx.x_context(), anchor_sel.data(), + var_sel.data(), bbox_sel.data(), + proposals.data(), pre_nms_num, !pixel_offset, true, + im_shape.data()); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(box_decoder) return " + "wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + + // 3. filter + Tensor keep_index, keep_num_t; + keep_index.mutable_data({pre_nms_num}, dev_ctx.GetPlace()); + keep_num_t.mutable_data({1}, dev_ctx.GetPlace()); + min_size = std::max(min_size, 1.0f); + r = xpu::remove_small_boxes(dev_ctx.x_context(), proposals.data(), + im_shape.data(), keep_index.data(), + keep_num_t.data(), pre_nms_num, min_size, + false, pixel_offset); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(remove_small_boxes) return " + "wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + int keep_num; + const auto xpu_place = dev_ctx.GetPlace(); + memory::Copy(platform::CPUPlace(), &keep_num, xpu_place, + keep_num_t.data(), sizeof(int)); + keep_index.Resize({keep_num}); + + Tensor scores_filter, proposals_filter; + // Handle the case when there is no keep index left + if (keep_num == 0) { + phi::funcs::SetConstant set_zero; + proposals_filter.mutable_data({1, 4}, dev_ctx.GetPlace()); + scores_filter.mutable_data({1, 1}, dev_ctx.GetPlace()); + set_zero(dev_ctx, &proposals_filter, static_cast(0)); + set_zero(dev_ctx, &scores_filter, static_cast(0)); + return std::make_pair(proposals_filter, scores_filter); + } + proposals_filter.mutable_data({keep_num, 4}, dev_ctx.GetPlace()); + scores_filter.mutable_data({keep_num, 1}, dev_ctx.GetPlace()); + r = xpu::gather(dev_ctx.x_context(), proposals.data(), + keep_index.data(), proposals_filter.data(), + {pre_nms_num, 4}, keep_num, 0); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(gather) return " + "wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + + r = xpu::gather(dev_ctx.x_context(), scores_sel.data(), + keep_index.data(), scores_filter.data(), + {pre_nms_num, 1}, keep_num, 0); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(gather) return " + "wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + + if (nms_thresh <= 0) { + if (dev_ctx.x_context()->xpu_stream) { + dev_ctx.Wait(); + } + return std::make_pair(proposals_filter, scores_filter); + } + + // 4. nms + int nms_keep_num = 0; + r = xpu::nms(dev_ctx.x_context(), proposals_filter.data(), nullptr, + keep_index.data(), 1, 1, keep_num, -1, nms_thresh, -1, 0, + &nms_keep_num, pixel_offset); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(nms) return the" + "wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + if (post_nms_top_n > 0 && post_nms_top_n < nms_keep_num) { + keep_index.Resize({post_nms_top_n}); + } else { + keep_index.Resize({nms_keep_num}); + } + + Tensor scores_nms, proposals_nms; + proposals_nms.mutable_data({keep_index.numel(), 4}, dev_ctx.GetPlace()); + scores_nms.mutable_data({keep_index.numel(), 1}, dev_ctx.GetPlace()); + r = xpu::gather(dev_ctx.x_context(), proposals_filter.data(), + keep_index.data(), proposals_nms.data(), + {keep_num, 4}, keep_index.numel(), 0); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(gather) return " + "wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::gather(dev_ctx.x_context(), scores_filter.data(), + keep_index.data(), scores_nms.data(), + {keep_num, 1}, keep_index.numel(), 0); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(gather) return " + "wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + if (dev_ctx.x_context()->xpu_stream) { + dev_ctx.Wait(); + } + return std::make_pair(proposals_nms, scores_nms); +} +} // namespace + +template +class XPUGenerateProposalsV2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_shape = context.Input("ImShape"); + auto anchors = GET_DATA_SAFELY(context.Input("Anchors"), "Input", + "Anchors", "GenerateProposals"); + auto variances = GET_DATA_SAFELY(context.Input("Variances"), + "Input", "Variances", "GenerateProposals"); + + auto *rpn_rois = context.Output("RpnRois"); + auto *rpn_roi_probs = context.Output("RpnRoiProbs"); + + int pre_nms_top_n = context.Attr("pre_nms_topN"); + int post_nms_top_n = context.Attr("post_nms_topN"); + float nms_thresh = context.Attr("nms_thresh"); + float min_size = context.Attr("min_size"); + float eta = context.Attr("eta"); + bool pixel_offset = context.Attr("pixel_offset"); + PADDLE_ENFORCE_GE(eta, 1., + platform::errors::InvalidArgument( + "Not support adaptive NMS. The attribute 'eta' " + "should not less than 1. But received eta=[%d]", + eta)); + + auto &dev_ctx = context.template device_context(); + + auto scores_dim = scores->dims(); + // the shape of bbox score + int num = scores_dim[0]; + int c_score = scores_dim[1]; + int h_score = scores_dim[2]; + int w_score = scores_dim[3]; + + auto bbox_dim = bbox_deltas->dims(); + int c_bbox = bbox_dim[1]; + int h_bbox = bbox_dim[2]; + int w_bbox = bbox_dim[3]; + + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, + dev_ctx.GetPlace()); + scores_swap.mutable_data({num, h_score, w_score, c_score}, + dev_ctx.GetPlace()); + + std::vector axis = {0, 2, 3, 1}; + int r = xpu::transpose(dev_ctx.x_context(), bbox_deltas->data(), + bbox_deltas_swap.data(), + {num, c_bbox, h_bbox, w_bbox}, axis); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(transpose) return " + "wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::transpose(dev_ctx.x_context(), scores->data(), + scores_swap.data(), + {num, c_score, h_score, w_score}, axis); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(transpose) return " + "wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + + anchors.Resize({anchors.numel() / 4, 4}); + variances.Resize({variances.numel() / 4, 4}); + + // output + rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, + context.GetPlace()); + rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); + + T *rpn_rois_data = rpn_rois->data(); + T *rpn_roi_probs_data = rpn_roi_probs->data(); + + auto place = dev_ctx.GetPlace(); + auto cpu_place = platform::CPUPlace(); + + int num_proposals = 0; + std::vector offset(1, 0); + std::vector tmp_num; + + for (int64_t i = 0; i < num; ++i) { + Tensor im_shape_slice = im_shape->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair box_score_pair = ProposalForOneImage( + dev_ctx, im_shape_slice, anchors, variances, bbox_deltas_slice, + scores_slice, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, + eta, pixel_offset); + + Tensor &proposals = box_score_pair.first; + Tensor &scores = box_score_pair.second; + + memory::Copy(place, rpn_rois_data + num_proposals * 4, place, + proposals.data(), sizeof(T) * proposals.numel()); + memory::Copy(place, rpn_roi_probs_data + num_proposals, place, + scores.data(), sizeof(T) * scores.numel()); + if (dev_ctx.x_context()->xpu_stream) { + dev_ctx.Wait(); + } + num_proposals += proposals.dims()[0]; + offset.emplace_back(num_proposals); + tmp_num.push_back(proposals.dims()[0]); + } + if (context.HasOutput("RpnRoisNum")) { + auto *rpn_rois_num = context.Output("RpnRoisNum"); + rpn_rois_num->mutable_data({num}, context.GetPlace()); + int *num_data = rpn_rois_num->data(); + memory::Copy(place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num); + rpn_rois_num->Resize({num}); + } + framework::LoD lod; + lod.emplace_back(offset); + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(generate_proposals_v2, + ops::XPUGenerateProposalsV2Kernel< + paddle::platform::XPUDeviceContext, float>); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc index 6b1b0cd8b3578..4dea559d8e466 100644 --- a/paddle/fluid/operators/detection/gpc.cc +++ b/paddle/fluid/operators/detection/gpc.cc @@ -24,6 +24,7 @@ **/ #include "paddle/fluid/operators/detection/gpc.h" + #include "paddle/fluid/platform/enforce.h" namespace gpc { diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc index 8cc0ebcab61f7..3f8bc8674186d 100644 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/nms_util.h" @@ -51,16 +52,17 @@ class LocalityAwareNMSOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( box_dims[2] == 4 || box_dims[2] == 8 || box_dims[2] == 16 || box_dims[2] == 24 || box_dims[2] == 32, - true, platform::errors::InvalidArgument( - "The last dimension of Input(BBoxes) must be 4 or 8, " - "represents the layout of coordinate " - "[xmin, ymin, xmax, ymax] or " - "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " - "8 points: [xi, yi] i= 1,2,...,8 or " - "12 points: [xi, yi] i= 1,2,...,12 or " - "16 points: [xi, yi] i= 1,2,...,16. " - "But received %d.", - box_dims[2])); + true, + platform::errors::InvalidArgument( + "The last dimension of Input(BBoxes) must be 4 or 8, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax] or " + "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " + "8 points: [xi, yi] i= 1,2,...,8 or " + "12 points: [xi, yi] i= 1,2,...,12 or " + "16 points: [xi, yi] i= 1,2,...,16. " + "But received %d.", + box_dims[2])); PADDLE_ENFORCE_EQ( box_dims[1], score_dims[2], platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/detection/mask_util.cc b/paddle/fluid/operators/detection/mask_util.cc index e06218cfe569f..41505ee84286a 100644 --- a/paddle/fluid/operators/detection/mask_util.cc +++ b/paddle/fluid/operators/detection/mask_util.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/mask_util.h" + #include #include + #include "paddle/fluid/memory/memory.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/mask_util.h b/paddle/fluid/operators/detection/mask_util.h index 4e0ea54f6d89f..25b03a11f7db0 100644 --- a/paddle/fluid/operators/detection/mask_util.h +++ b/paddle/fluid/operators/detection/mask_util.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include namespace paddle { diff --git a/paddle/fluid/operators/detection/mask_util_test.cc b/paddle/fluid/operators/detection/mask_util_test.cc index de904e9474639..68f7a6db6488e 100644 --- a/paddle/fluid/operators/detection/mask_util_test.cc +++ b/paddle/fluid/operators/detection/mask_util_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/mask_util.h" + #include + #include "paddle/fluid/memory/memory.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/matrix_nms_op.cc b/paddle/fluid/operators/detection/matrix_nms_op.cc index 3353739b01bf6..5eee52dfbc704 100644 --- a/paddle/fluid/operators/detection/matrix_nms_op.cc +++ b/paddle/fluid/operators/detection/matrix_nms_op.cc @@ -405,7 +405,6 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(matrix_nms, ops::MatrixNMSKernel, ops::MatrixNMSKernel); REGISTER_OP_VERSION(matrix_nms) - .AddCheckpoint( - R"ROC(Upgrade matrix_nms: add a new output [RoisNum].)ROC", - paddle::framework::compatible::OpVersionDesc().NewOutput( - "RoisNum", "The number of RoIs in each image.")); + .AddCheckpoint(R"ROC(Upgrade matrix_nms: add a new output [RoisNum].)ROC", + paddle::framework::compatible::OpVersionDesc().NewOutput( + "RoisNum", "The number of RoIs in each image.")); diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 83cf6e5fd30f6..f603a501f4b78 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/nms_util.h" @@ -55,18 +56,19 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { ". But received rank = %d", box_dims.size())); if (score_size == 3) { - PADDLE_ENFORCE_EQ( - box_dims[2] == 4 || box_dims[2] == 8 || box_dims[2] == 16 || - box_dims[2] == 24 || box_dims[2] == 32, - true, platform::errors::InvalidArgument( - "The last dimension of Input" - "(BBoxes) must be 4 or 8, " - "represents the layout of coordinate " - "[xmin, ymin, xmax, ymax] or " - "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " - "8 points: [xi, yi] i= 1,2,...,8 or " - "12 points: [xi, yi] i= 1,2,...,12 or " - "16 points: [xi, yi] i= 1,2,...,16")); + PADDLE_ENFORCE_EQ(box_dims[2] == 4 || box_dims[2] == 8 || + box_dims[2] == 16 || box_dims[2] == 24 || + box_dims[2] == 32, + true, + platform::errors::InvalidArgument( + "The last dimension of Input" + "(BBoxes) must be 4 or 8, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax] or " + "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " + "8 points: [xi, yi] i= 1,2,...,8 or " + "12 points: [xi, yi] i= 1,2,...,12 or " + "16 points: [xi, yi] i= 1,2,...,16")); PADDLE_ENFORCE_EQ( box_dims[1], score_dims[2], platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/detection/nms_op.cc b/paddle/fluid/operators/detection/nms_op.cc index f6dc44eb5fc2d..34a92efa68a63 100644 --- a/paddle/fluid/operators/detection/nms_op.cc +++ b/paddle/fluid/operators/detection/nms_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/nms_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/detection/nms_op.cu b/paddle/fluid/operators/detection/nms_op.cu index b6027e67d6ced..4f62c735c265a 100644 --- a/paddle/fluid/operators/detection/nms_op.cu +++ b/paddle/fluid/operators/detection/nms_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/operators/detection/nms_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/detection/nms_util.h b/paddle/fluid/operators/detection/nms_util.h index 0e448d42fc2ed..7a6565ac760f1 100644 --- a/paddle/fluid/operators/detection/nms_util.h +++ b/paddle/fluid/operators/detection/nms_util.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/operators/detection/poly_util.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/poly_util.cc b/paddle/fluid/operators/detection/poly_util.cc index 1af2c95c6cf52..6aa81bf1b39f7 100644 --- a/paddle/fluid/operators/detection/poly_util.cc +++ b/paddle/fluid/operators/detection/poly_util.cc @@ -16,13 +16,14 @@ limitations under the License. */ #define POLY_UTIL_CC_ #include "paddle/fluid/operators/detection/poly_util.h" + #include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { -using gpc::gpc_polygon_clip; using gpc::gpc_free_polygon; +using gpc::gpc_polygon_clip; template void Array2PointVec(const T*& box, const size_t box_size, diff --git a/paddle/fluid/operators/detection/poly_util.h b/paddle/fluid/operators/detection/poly_util.h index f07baf72d9ff0..cc37f00008d33 100644 --- a/paddle/fluid/operators/detection/poly_util.h +++ b/paddle/fluid/operators/detection/poly_util.h @@ -16,6 +16,7 @@ limitations under the License. */ #define POLY_UTIL_H_ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/gpc.h" diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h index 4000994beb541..889bc8354bc41 100644 --- a/paddle/fluid/operators/detection/prior_box_op.h +++ b/paddle/fluid/operators/detection/prior_box_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/transform.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc index bc46ec0b65639..4e49a6ed8521e 100644 --- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc +++ b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -66,23 +67,26 @@ class RetinanetDetectionOutputOp : public framework::OperatorWithKernel { auto im_info_dims = ctx->GetInputDim("ImInfo"); const size_t b_n = bboxes_dims.size(); - PADDLE_ENFORCE_GT(b_n, 0, platform::errors::InvalidArgument( - "The number of Variables in Input(BBoxes) " - "should be greater than 0, " - "but received number is:%d.", - b_n)); + PADDLE_ENFORCE_GT(b_n, 0, + platform::errors::InvalidArgument( + "The number of Variables in Input(BBoxes) " + "should be greater than 0, " + "but received number is:%d.", + b_n)); const size_t s_n = scores_dims.size(); - PADDLE_ENFORCE_GT(s_n, 0, platform::errors::InvalidArgument( - "The number of Variables in Input(Scores) " - "should be greater than 0, " - "but received number is:%d.", - s_n)); + PADDLE_ENFORCE_GT(s_n, 0, + platform::errors::InvalidArgument( + "The number of Variables in Input(Scores) " + "should be greater than 0, " + "but received number is:%d.", + s_n)); const size_t a_n = anchors_dims.size(); - PADDLE_ENFORCE_GT(a_n, 0, platform::errors::InvalidArgument( - "The number of Variables in Input(Anchors) " - "should be greater than 0, " - "but received number is:%d.", - a_n)); + PADDLE_ENFORCE_GT(a_n, 0, + platform::errors::InvalidArgument( + "The number of Variables in Input(Anchors) " + "should be greater than 0, " + "but received number is:%d.", + a_n)); auto bbox_dims = bboxes_dims[0]; auto score_dims = scores_dims[0]; auto anchor_dims = anchors_dims[0]; diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index 353d17a6e09f2..eb6d6c6db9284 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -40,8 +41,8 @@ bool GT(T a, T b) { } /* -*check if (x, y) is in the boundary of roi -*/ + *check if (x, y) is in the boundary of roi + */ template bool in_quad(T x, T y, T roi_x[], T roi_y[]) { for (int i = 0; i < 4; i++) { @@ -431,10 +432,9 @@ class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel { T matrix[9]; get_transform_matrix(transformed_width, transformed_height, roi_x, roi_y, matrix); - const T* out_grad_ptr = out_grad_data + - (roi_idx * channels + c) * - transformed_height * - transformed_width; + const T* out_grad_ptr = out_grad_data + (roi_idx * channels + c) * + transformed_height * + transformed_width; for (int out_h = 0; out_h < transformed_height; ++out_h) { for (int out_w = 0; out_w < transformed_width; ++out_w) { T src_w; diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 515a4bbac59c2..1bff79606d44b 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -13,13 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/math_function.h" -using paddle::platform::PADDLE_CUDA_NUM_THREADS; using paddle::platform::float16; +using paddle::platform::PADDLE_CUDA_NUM_THREADS; namespace paddle { namespace operators { @@ -56,8 +57,8 @@ __device__ T min(T a, T b) { } /* -* check if (x, y) is in the boundary of roi -*/ + * check if (x, y) is in the boundary of roi + */ template __device__ bool in_quad(T x, T y, T roi_x[], T roi_y[]) { for (int i = 0; i < 4; i++) { diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index e96c0bbc27290..b636decdfbff3 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc index 8526f1762cdc9..31f3dab81fef6 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" + #include #include #include diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h index 518295958630c..fcb7ec1fbfee0 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 35e389090175f..ae7dfe0dd6685 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -36,10 +36,11 @@ class YoloBoxOp : public framework::OperatorWithKernel { auto iou_aware = ctx->Attrs().Get("iou_aware"); auto iou_aware_factor = ctx->Attrs().Get("iou_aware_factor"); - PADDLE_ENFORCE_EQ(dim_x.size(), 4, platform::errors::InvalidArgument( - "Input(X) should be a 4-D tensor." - "But received X dimension(%s)", - dim_x.size())); + PADDLE_ENFORCE_EQ( + dim_x.size(), 4, + platform::errors::InvalidArgument("Input(X) should be a 4-D tensor." + "But received X dimension(%s)", + dim_x.size())); if (iou_aware) { PADDLE_ENFORCE_EQ( dim_x[1], anchor_num * (6 + class_num), @@ -245,11 +246,10 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, YoloBoxInferShapeFunctor); -REGISTER_OP_VERSION(yolo_box) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(yolo_box).AddCheckpoint( + R"ROC( Upgrade yolo box to add new attribute [iou_aware, iou_aware_factor]. )ROC", - paddle::framework::compatible::OpVersionDesc() - .NewAttr("iou_aware", "Whether use iou aware", false) - .NewAttr("iou_aware_factor", "iou aware factor", 0.5f)); + paddle::framework::compatible::OpVersionDesc() + .NewAttr("iou_aware", "Whether use iou aware", false) + .NewAttr("iou_aware_factor", "iou aware factor", 0.5f)); diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index 21044734ca801..2170fd0639fcb 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -10,6 +10,7 @@ limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/type_defs.h" diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index 588967f0832a9..aa4695cc97556 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection_map_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h index 4dd41837f060e..a034572a0c481 100644 --- a/paddle/fluid/operators/detection_map_op.h +++ b/paddle/fluid/operators/detection_map_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index 6959b5cf81106..ec5a51bbffa59 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/determinant_op.h" + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/backward.h" diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index 702ff3bfd87b0..d4c05b631e3bb 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cc b/paddle/fluid/operators/dgc_clip_by_norm_op.cc index 85a29271b13b5..f60380f047591 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.cc +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cc @@ -10,10 +10,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "paddle/fluid/operators/dgc_clip_by_norm_op.h" +#include + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dgc_op.cc b/paddle/fluid/operators/dgc_op.cc index 5fe66fa38a83b..95d3f75de9a02 100644 --- a/paddle/fluid/operators/dgc_op.cc +++ b/paddle/fluid/operators/dgc_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/dgc_op.h" + #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index b1bf5e2778167..91093f67e0536 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once #include -#include "dgc/dgc.h" +#include "dgc/dgc.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" @@ -118,10 +118,12 @@ class DGCOpKernel : public framework::OpKernel { 1 - get_period_sparcity( sparsity, static_cast(*current_step - rampup_begin_step), rampup_step); - PADDLE_ENFORCE_GE(ratio, 0.0, platform::errors::InvalidArgument( - "DGC sparsity ratio must >= 0")); - PADDLE_ENFORCE_LT(ratio, 1.0, platform::errors::InvalidArgument( - "DGC sparsity ratio must < 1")); + PADDLE_ENFORCE_GE( + ratio, 0.0, + platform::errors::InvalidArgument("DGC sparsity ratio must >= 0")); + PADDLE_ENFORCE_LT( + ratio, 1.0, + platform::errors::InvalidArgument("DGC sparsity ratio must < 1")); int k = static_cast(g->numel() * ratio); VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov diff --git a/paddle/fluid/operators/diag_embed_op.cu b/paddle/fluid/operators/diag_embed_op.cu index 7e3ab6be664cb..a9d92fdf634a7 100644 --- a/paddle/fluid/operators/diag_embed_op.cu +++ b/paddle/fluid/operators/diag_embed_op.cu @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/diag_embed_op.h" diff --git a/paddle/fluid/operators/diag_embed_op.h b/paddle/fluid/operators/diag_embed_op.h index a5621be3baa27..b07047996d513 100644 --- a/paddle/fluid/operators/diag_embed_op.h +++ b/paddle/fluid/operators/diag_embed_op.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/for_range.h" diff --git a/paddle/fluid/operators/dirichlet_op.h b/paddle/fluid/operators/dirichlet_op.h index 540acad423aa3..658688816eb8f 100644 --- a/paddle/fluid/operators/dirichlet_op.h +++ b/paddle/fluid/operators/dirichlet_op.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/for_range.h" diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc index 55b2484941293..6f897bff75c24 100644 --- a/paddle/fluid/operators/dist_op.cc +++ b/paddle/fluid/operators/dist_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/dlnne/CMakeLists.txt b/paddle/fluid/operators/dlnne/CMakeLists.txt index 4fe9cf214eaa7..11347f0f94e5c 100644 --- a/paddle/fluid/operators/dlnne/CMakeLists.txt +++ b/paddle/fluid/operators/dlnne/CMakeLists.txt @@ -1,39 +1,30 @@ # compile flags -set(DLNNE_FLAGS - -Wno-error=non-virtual-dtor - -Wno-error=unused-variable - -Wno-error=attributes - ${fsanitize} -) +set(DLNNE_FLAGS -Wno-error=non-virtual-dtor -Wno-error=unused-variable + -Wno-error=attributes ${fsanitize}) foreach(flag ${DLNNE_FLAGS}) safe_set_cflag(CMAKE_C_FLAGS ${flag}) safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) endforeach() - # add nne -find_path(DLNNE_INCLUDE_DIR dlnne.h - PATHS - $ENV{SOFTWARE_SOURCE_DIR} $ENV{SOFTWARE_SOURCE_DIR}/driver/nne/include - NO_DEFAULT_PATH -) - -find_library(DLNNE_LIB libdlnne.so - PATHS - $ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/driver/nne - NO_DEFAULT_PATH -) +find_path( + DLNNE_INCLUDE_DIR dlnne.h + PATHS $ENV{SOFTWARE_SOURCE_DIR} $ENV{SOFTWARE_SOURCE_DIR}/driver/nne/include + NO_DEFAULT_PATH) -find_path(CUDA_INCLUDE_DIR cuda.h - $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/include -) +find_library( + DLNNE_LIB libdlnne.so + PATHS $ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/driver/nne + NO_DEFAULT_PATH) -find_library(CURT_LIB libcurt.so - PATHS - $ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/lib - NO_DEFAULT_PATH -) +find_path(CUDA_INCLUDE_DIR cuda.h + $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/include) +find_library( + CURT_LIB libcurt.so + PATHS $ENV{SOFTWARE_BUILD_DIR} + $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/lib + NO_DEFAULT_PATH) message("DLNNE_INCLUDE_DIR: "${DLNNE_INCLUDE_DIR}) message("DLNNE_LIB: "${DLNNE_LIB}) @@ -43,7 +34,15 @@ message("CURT_LIB: "${CURT_LIB}) include_directories("${DLNNE_INCLUDE_DIR}") include_directories("${CUDA_INCLUDE_DIR}") -op_library(dlnne_engine_op DEPS ${GLOB_OPERATOR_DEPS} framework_proto boost device_context op_registry scope) +op_library( + dlnne_engine_op + DEPS + ${GLOB_OPERATOR_DEPS} + framework_proto + boost + device_context + op_registry + scope) #message("PYBIND_FILE:${pybind_file}") #file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(dlnne_engine);\n") @@ -51,4 +50,7 @@ op_library(dlnne_engine_op DEPS ${GLOB_OPERATOR_DEPS} framework_proto boost devi target_link_libraries(dlnne_engine_op ${DLNNE_LIB} ${CURT_LIB}) -cc_test(test_dlnne_engine_op SRCS dlnne_engine_op_test.cc DEPS dlnne_engine_op analysis) +cc_test( + test_dlnne_engine_op + SRCS dlnne_engine_op_test.cc + DEPS dlnne_engine_op analysis) diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op.h b/paddle/fluid/operators/dlnne/dlnne_engine_op.h index 6b2622366fedc..857f295326b94 100644 --- a/paddle/fluid/operators/dlnne/dlnne_engine_op.h +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op.h @@ -13,11 +13,11 @@ // limitations under the License. #pragma once +#include #include // NOTLINT #include // NOTLINT #include // NOTLINT -#include #include #include #include @@ -128,11 +128,13 @@ class DlnneEngineOp : public framework::OperatorBase { << ".onnx"; builder = dl::nne::CreateInferBuilder(); - PADDLE_ENFORCE_NE(builder, nullptr, platform::errors::Unavailable( - "nne create builder failed")); + PADDLE_ENFORCE_NE( + builder, nullptr, + platform::errors::Unavailable("nne create builder failed")); parser = dl::nne::CreateParser(); - PADDLE_ENFORCE_NE(parser, nullptr, platform::errors::Unavailable( - "nne create parser failed")); + PADDLE_ENFORCE_NE( + parser, nullptr, + platform::errors::Unavailable("nne create parser failed")); network = builder->CreateNetwork(); diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc index 611366f6c5b8a..8e1d7fe5d815a 100644 --- a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/operators/dlnne/dlnne_engine_op.h" + #include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_desc.h" diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 6af8c925ff580..c40f6c0bbaea0 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -19,11 +19,13 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #include + #include "paddle/fluid/platform/dynload/curand.h" #endif #ifdef PADDLE_WITH_HIP #include #include + #include "paddle/fluid/platform/dynload/hiprand.h" #endif @@ -34,6 +36,7 @@ limitations under the License. */ #include "paddle/fluid/operators/dropout_impl_util.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/functors.h" @@ -195,9 +198,11 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, size_t main_offset = size / (block_size * kVecSize) * (block_size * kVecSize); - VectorizedRandomGenerator<<>>( - size, seed_data, dropout_prob, x_data, mask_data, y_data, - upscale_in_train, increment, main_offset); + PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL( + !is_fix_seed, (VectorizedRandomGenerator), grid_size, + block_size, 0, stream, offset, KERNEL_PARAMS.As(1), + KERNEL_PARAMS.As(7), size, seed_data, dropout_prob, x_data, + mask_data, y_data, upscale_in_train, increment, main_offset); } else { if (upscale_in_train) { // todo: can y share with data with x directly? diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 8d033ea3194b9..9426efa494208 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/binary.h" diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index 851f26ee0e717..24de99d6d8f85 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -11,6 +11,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/edit_distance_op.cc b/paddle/fluid/operators/edit_distance_op.cc index db8a107290eb6..8127895569f6f 100644 --- a/paddle/fluid/operators/edit_distance_op.cc +++ b/paddle/fluid/operators/edit_distance_op.cc @@ -37,12 +37,13 @@ class EditDistanceOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( hyp_dims.size() == 2 && ref_dims.size() == 2 && hyp_dims[0] == ref_dims[0], - true, platform::errors::InvalidArgument( - "Input(Hyps) and Input(Refs) must be 2-D Tensors with " - "identical first dimension. But received Input(Hyps): " - "input rank %u, input shape [%s]; received Input(Refs): " - "input rank %u, input shape [%s]", - hyp_dims.size(), hyp_dims, ref_dims.size(), ref_dims)); + true, + platform::errors::InvalidArgument( + "Input(Hyps) and Input(Refs) must be 2-D Tensors with " + "identical first dimension. But received Input(Hyps): " + "input rank %u, input shape [%s]; received Input(Refs): " + "input rank %u, input shape [%s]", + hyp_dims.size(), hyp_dims, ref_dims.size(), ref_dims)); PADDLE_ENFORCE_EQ( hyp_length_dims[0] == ref_length_dims[0] && hyp_length_dims[0] == hyp_dims[0], diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index 49ac7183ff3b0..eb208c559cef6 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/edit_distance_op.h" diff --git a/paddle/fluid/operators/edit_distance_op.h b/paddle/fluid/operators/edit_distance_op.h index ef290c2eff2be..101e3a90b80d3 100644 --- a/paddle/fluid/operators/edit_distance_op.h +++ b/paddle/fluid/operators/edit_distance_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/eig_op.cc b/paddle/fluid/operators/eig_op.cc index 6f1737dba819c..5239248d82f1f 100644 --- a/paddle/fluid/operators/eig_op.cc +++ b/paddle/fluid/operators/eig_op.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/operators/eig_op.h" + #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -32,10 +34,11 @@ class EigOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); int rank = x_dims.size(); - PADDLE_ENFORCE_GE(rank, 2, platform::errors::InvalidArgument( - "Expects input tensor x to be not less than " - "2 dimentions, but got dimention %d", - rank)); + PADDLE_ENFORCE_GE(rank, 2, + platform::errors::InvalidArgument( + "Expects input tensor x to be not less than " + "2 dimentions, but got dimention %d", + rank)); PADDLE_ENFORCE_EQ(x_dims[rank - 2], x_dims[rank - 1], platform::errors::InvalidArgument( "The input matrix must be a square matrix, " diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index fe898a6c41c2a..0f9afae8267bf 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -15,8 +15,10 @@ #pragma once #include + #include #include + #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" diff --git a/paddle/fluid/operators/eigvals_op.cc b/paddle/fluid/operators/eigvals_op.cc index 2ef591dd26a06..177dc684662f5 100644 --- a/paddle/fluid/operators/eigvals_op.cc +++ b/paddle/fluid/operators/eigvals_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/eigvals_op.h" + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/eigvals_op.h b/paddle/fluid/operators/eigvals_op.h index 4627acc0d07de..d75b33e0857bc 100644 --- a/paddle/fluid/operators/eigvals_op.h +++ b/paddle/fluid/operators/eigvals_op.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/allocation/allocator.h" @@ -71,14 +72,16 @@ static void SpiltBatchSquareMatrix(const Tensor& input, } static void CheckLapackEigResult(const int info, const std::string& name) { - PADDLE_ENFORCE_LE(info, 0, platform::errors::PreconditionNotMet( - "The QR algorithm failed to compute all the " - "eigenvalues in function %s.", - name.c_str())); + PADDLE_ENFORCE_LE(info, 0, + platform::errors::PreconditionNotMet( + "The QR algorithm failed to compute all the " + "eigenvalues in function %s.", + name.c_str())); PADDLE_ENFORCE_GE( - info, 0, platform::errors::InvalidArgument( - "The %d-th argument has an illegal value in function %s.", - -info, name.c_str())); + info, 0, + platform::errors::InvalidArgument( + "The %d-th argument has an illegal value in function %s.", -info, + name.c_str())); } template diff --git a/paddle/fluid/operators/einsum_op.cc b/paddle/fluid/operators/einsum_op.cc index 6da0045443ccc..7fc19d6913f83 100644 --- a/paddle/fluid/operators/einsum_op.cc +++ b/paddle/fluid/operators/einsum_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/elementwise/CMakeLists.txt b/paddle/fluid/operators/elementwise/CMakeLists.txt index 216a3f79d6f92..25b34a2c0a2c3 100644 --- a/paddle/fluid/operators/elementwise/CMakeLists.txt +++ b/paddle/fluid/operators/elementwise/CMakeLists.txt @@ -1,14 +1,32 @@ include(operators) if(WITH_UNITY_BUILD) - # Load Unity Build rules for operators in paddle/fluid/operators/elementwise. - include(unity_build_rule.cmake) + # Load Unity Build rules for operators in paddle/fluid/operators/elementwise. + include(unity_build_rule.cmake) endif() register_operators(DEPS op_version_registry) -cc_test(test_elementwise_add_op_inplace SRCS test_elementwise_add_op_inplace.cc DEPS op_registry elementwise_add_op scope device_context enforce executor) -cc_test(test_elementwise_div_grad_grad SRCS test_elementwise_div_grad_grad.cc DEPS op_registry elementwise_div_op scope device_context enforce executor) -cc_test(test_elementwise_add_grad_grad SRCS test_elementwise_add_grad_grad.cc DEPS op_registry elementwise_add_op scope device_context enforce executor) +cc_test( + test_elementwise_add_op_inplace + SRCS test_elementwise_add_op_inplace.cc + DEPS op_registry elementwise_add_op scope device_context enforce executor) +cc_test( + test_elementwise_div_grad_grad + SRCS test_elementwise_div_grad_grad.cc + DEPS op_registry elementwise_div_op scope device_context enforce executor) +cc_test( + test_elementwise_add_grad_grad + SRCS test_elementwise_add_grad_grad.cc + DEPS op_registry elementwise_add_op scope device_context enforce executor) if(WITH_ASCEND_CL) -cc_test(elementwise_op_npu_test SRCS elementwise_op_npu_test.cc DEPS op_registry elementwise_add_op elementwise_sub_op scope device_context enforce executor) + cc_test( + elementwise_op_npu_test + SRCS elementwise_op_npu_test.cc + DEPS op_registry + elementwise_add_op + elementwise_sub_op + scope + device_context + enforce + executor) endif() diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc index 22a5de4c60941..9c1a84ba8b67f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc @@ -15,8 +15,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include #include -#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" #include "paddle/fluid/platform/device/device_wrapper.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 13fd9b81a8765..e0523a26ee3ce 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" + #include #include diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index e9adb9abdb528..b3363862d5f97 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc b/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc index e003a43b5c56b..ebdebb2f4852a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h index ff1e12103be91..8c230c5f47bf6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mlu.h +++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h @@ -16,6 +16,7 @@ #ifdef PADDLE_WITH_MLU #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc index 156589384c0dd..19d28301ffb83 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc @@ -15,11 +15,11 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" - #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" #include "paddle/fluid/platform/device/device_wrapper.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 45b6f7cb39194..253014a79817a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" + #include #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/complex.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index e2dd0e36d400a..39045bf0d5904 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -15,9 +15,9 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/cpu_info.h" - #include "paddle/phi/kernels/elementwise_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 80b07721f0b4d..476b891bb419d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -28,7 +28,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_functor.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/transform.h" - #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/elementwise_grad.h" @@ -60,14 +59,14 @@ namespace paddle { namespace operators { /* -* Pack input and output tensors into respective vectors with -* consideration of varible X`s class type. -* Input variable X is supported to be whether LoDTensor or -* SelectedRows class type in this package function, once X -* was SelectedRows type, a valid pointer x_for_selectedrows -* is excepted to be passed in from op kernel for acquisition -* of the valid address of LoDTensor created ahead in the function. -*/ + * Pack input and output tensors into respective vectors with + * consideration of varible X`s class type. + * Input variable X is supported to be whether LoDTensor or + * SelectedRows class type in this package function, once X + * was SelectedRows type, a valid pointer x_for_selectedrows + * is excepted to be passed in from op kernel for acquisition + * of the valid address of LoDTensor created ahead in the function. + */ template int PackTensorsIntoVector(const framework::ExecutionContext &ctx, std::vector *ins, @@ -327,10 +326,11 @@ static void FusedElemwiseAndActBroadcast1CUDA(gpuStream_t stream, const T *x, T *intermediate_out) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, w); int gird_size = h; - FusedElemwiseAndActBroadcast1CUDAKernel< - T, CompoundFunctor, BcastY, KeepIntermediateOut, - SameShapeOfIntermediateOutAndOut><<>>( - x, y, h, w, compound_functor, out, intermediate_out); + FusedElemwiseAndActBroadcast1CUDAKernel + <<>>(x, y, h, w, compound_functor, out, + intermediate_out); } template <<>>( - x, y, compound_functor, pre, n, post, out, intermediate_out); + FusedElemwiseAndActBroadcast2CUDAKernel + <<>>(x, y, compound_functor, pre, n, + post, out, intermediate_out); } #endif @@ -544,8 +545,9 @@ void FusedElemwiseAndActGradComputeNoBroadcast( out->data(), dout->data(), dx_op, dy_op, dintermediate_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), - dintermediate == nullptr ? nullptr : dintermediate->mutable_data( - ctx.GetPlace())}); + dintermediate == nullptr + ? nullptr + : dintermediate->mutable_data(ctx.GetPlace())}); } template <<>>( - x, y, intermediate_out, out, dout, h, w, dx_op, dy_op, dintermediate_op, - dx, dy, d_intermediate); + FusedElemwiseAndActGradBroadcast1CUDAKernel + <<>>(x, y, intermediate_out, out, dout, h, w, + dx_op, dy_op, dintermediate_op, dx, dy, + d_intermediate); } template <<>>( - x, y, intermediate_out, out, dout, pre, n, post, dx_op, dy_op, - dintermediate_op, dx, dy, dintermediate); + FusedElemwiseAndActGradBroadcast2CUDAKernel + <<>>( + x, y, intermediate_out, out, dout, pre, n, post, dx_op, dy_op, + dintermediate_op, dx, dy, dintermediate); } #endif @@ -995,8 +996,9 @@ void FusedElemwiseAndActGradComputeWithBroadcast( out->data(), dout->data(), h, w, dx_op, dy_op, dintermediate_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), - dintermediate == nullptr ? nullptr : dintermediate->mutable_data( - ctx.GetPlace())); + dintermediate == nullptr + ? nullptr + : dintermediate->mutable_data(ctx.GetPlace())); #endif } else { FusedElemwiseAndActGradBroadcast1CPUdata(), dout->data(), h, w, dx_op, dy_op, dintermediate_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), - dintermediate == nullptr ? nullptr : dintermediate->mutable_data( - ctx.GetPlace())); + dintermediate == nullptr + ? nullptr + : dintermediate->mutable_data(ctx.GetPlace())); } } else { if (platform::is_gpu_place(ctx.GetPlace())) { @@ -1022,8 +1025,9 @@ void FusedElemwiseAndActGradComputeWithBroadcast( dintermediate_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), - dintermediate == nullptr ? nullptr : dintermediate->mutable_data( - ctx.GetPlace())); + dintermediate == nullptr + ? nullptr + : dintermediate->mutable_data(ctx.GetPlace())); #endif } else { FusedElemwiseAndActGradBroadcast2CPUmutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), - dintermediate == nullptr ? nullptr : dintermediate->mutable_data( - ctx.GetPlace())); + dintermediate == nullptr + ? nullptr + : dintermediate->mutable_data(ctx.GetPlace())); } } } diff --git a/paddle/fluid/operators/elementwise/elementwise_xpu.h b/paddle/fluid/operators/elementwise/elementwise_xpu.h index db5c94b9d1a6e..3f38450581ec8 100644 --- a/paddle/fluid/operators/elementwise/elementwise_xpu.h +++ b/paddle/fluid/operators/elementwise/elementwise_xpu.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/place.h" #include "xpu/refactor/math.h" @@ -32,8 +33,9 @@ void XPUElementwise( const std::vector&, const std::vector&)> func) { auto x_var = ctx.InputVar("X"); - PADDLE_ENFORCE_NE(x_var, nullptr, platform::errors::InvalidArgument( - "Cannot get input Variable X")); + PADDLE_ENFORCE_NE( + x_var, nullptr, + platform::errors::InvalidArgument("Cannot get input Variable X")); PADDLE_ENFORCE_EQ( x_var->IsType(), true, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc index 3cecc52a3c481..f647bd91d5f3d 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc @@ -18,6 +18,7 @@ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -72,11 +73,12 @@ class TestElementwiseDivGradGradWithoutDout std::unique_ptr CreateTestOp() override { auto op = framework::OpRegistry::CreateOp( - this->op_type_, {{"Y", {"Y"}}, - {"Out", {"Out"}}, - {"DDX", {"DDX"}}, - {"DDY", {"DDY"}}, - {"DX", {"DX"}}}, + this->op_type_, + {{"Y", {"Y"}}, + {"Out", {"Out"}}, + {"DDX", {"DDX"}}, + {"DDY", {"DDY"}}, + {"DX", {"DX"}}}, {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}}, {{"use_mkldnn", false}, {"axis", 0}}); return op; diff --git a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h index 05f87e5465abe..7defe4e5793ab 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h +++ b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h @@ -21,6 +21,7 @@ #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/elementwise/unity_build_rule.cmake b/paddle/fluid/operators/elementwise/unity_build_rule.cmake index ea001fe438545..060c990ea8712 100644 --- a/paddle/fluid/operators/elementwise/unity_build_rule.cmake +++ b/paddle/fluid/operators/elementwise/unity_build_rule.cmake @@ -4,25 +4,27 @@ # Generally, the combination rules in this file do not need to be modified. # If there are some redefined error in compiling with the source file which # in combination rule, you can remove the source file from the following rules. -register_unity_group(cc - elementwise_add_op.cc - mkldnn/elementwise_add_mkldnn_op.cc - elementwise_div_op.cc - elementwise_floordiv_op.cc - elementwise_max_op.cc - elementwise_min_op.cc - elementwise_mod_op.cc - elementwise_mul_op.cc - mkldnn/elementwise_mul_mkldnn_op.cc - elementwise_pow_op.cc - elementwise_sub_op.cc) -register_unity_group(cu - elementwise_add_op.cu - elementwise_div_op.cu - elementwise_floordiv_op.cu - elementwise_max_op.cu - elementwise_min_op.cu - elementwise_mod_op.cu - elementwise_mul_op.cu - elementwise_pow_op.cu - elementwise_sub_op.cu) +register_unity_group( + cc + elementwise_add_op.cc + mkldnn/elementwise_add_mkldnn_op.cc + elementwise_div_op.cc + elementwise_floordiv_op.cc + elementwise_max_op.cc + elementwise_min_op.cc + elementwise_mod_op.cc + elementwise_mul_op.cc + mkldnn/elementwise_mul_mkldnn_op.cc + elementwise_pow_op.cc + elementwise_sub_op.cc) +register_unity_group( + cu + elementwise_add_op.cu + elementwise_div_op.cu + elementwise_floordiv_op.cu + elementwise_max_op.cu + elementwise_min_op.cu + elementwise_mod_op.cu + elementwise_mul_op.cu + elementwise_pow_op.cu + elementwise_sub_op.cu) diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc index 9e0e4e7fe1c6d..0f6c308b211bf 100644 --- a/paddle/fluid/operators/empty_op.cc +++ b/paddle/fluid/operators/empty_op.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/op_registry.h" - #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/nullary.h" namespace paddle { diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 093c4d8f7930e..cace8b5fdffa7 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/expand_as_op.h" + #include #include diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc old mode 100755 new mode 100644 index 9361edd43bf15..8cdab4c5e1a41 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -10,8 +10,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/expand_as_v2_op.h" + #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/infermeta/binary.h" @@ -107,7 +109,6 @@ REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp, ops::ExpandAsV2GradNoNeedBufVarsInferer); REGISTER_OP_VERSION(expand_as_v2) - .AddCheckpoint( - R"ROC(fix expand_as_v2 and add new input [Y])ROC", - paddle::framework::compatible::OpVersionDesc().NewInput( - "Y", "Expand X according to the shape of Y")); + .AddCheckpoint(R"ROC(fix expand_as_v2 and add new input [Y])ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "Y", "Expand X according to the shape of Y")); diff --git a/paddle/fluid/operators/expand_as_v2_op_npu.cc b/paddle/fluid/operators/expand_as_v2_op_npu.cc index 67d95e1240022..28fd922d77b81 100644 --- a/paddle/fluid/operators/expand_as_v2_op_npu.cc +++ b/paddle/fluid/operators/expand_as_v2_op_npu.cc @@ -30,10 +30,11 @@ class ExpandAsV2NPUKernel : public framework::OpKernel { "expand_as_v2 op must be greater than or equal to " "the rank (%d) of the input 'x'.", target_rank, rank)); - PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' for " - "expand_as_v2 op must be positive.", - rank)); + PADDLE_ENFORCE_GE( + rank, 1, + platform::errors::InvalidArgument("The rank (%d) of the input 'x' for " + "expand_as_v2 op must be positive.", + rank)); PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED, platform::errors::InvalidArgument( "The rank (%d) of the input 'target_tensor' for " diff --git a/paddle/fluid/operators/expand_as_v2_op_xpu.cc b/paddle/fluid/operators/expand_as_v2_op_xpu.cc index 0912b280aa6c7..fc3d77f3cc82c 100644 --- a/paddle/fluid/operators/expand_as_v2_op_xpu.cc +++ b/paddle/fluid/operators/expand_as_v2_op_xpu.cc @@ -33,10 +33,11 @@ class ExpandAsV2XPUKernel : public framework::OpKernel { "expand_as_v2 op must be greater than or equal to " "the rank (%d) of the input 'x'.", target_rank, rank)); - PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' for " - "expand_as_v2 op must be positive.", - rank)); + PADDLE_ENFORCE_GE( + rank, 1, + platform::errors::InvalidArgument("The rank (%d) of the input 'x' for " + "expand_as_v2 op must be positive.", + rank)); PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED, platform::errors::InvalidArgument( "The rank (%d) of the input 'target_tensor' for " diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index e45761112d4bd..04cdbd5a60615 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/expand_op.h" + #include #include #include diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 05cd893b057af..880adad743fa3 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -213,12 +213,13 @@ class ExpandGradKernel : public framework::OpKernel { framework::TensorCopy(*in0, context.GetPlace(), context.device_context(), out0); } else { - PADDLE_ENFORCE_GE(dims, 1, platform::errors::InvalidArgument( - "The number of dimensions of the input " - "'Out@GRAD' for Op(expand_grad)" - " must be greater than or equal to 1, but " - "the value received is %d.", - dims)); + PADDLE_ENFORCE_GE(dims, 1, + platform::errors::InvalidArgument( + "The number of dimensions of the input " + "'Out@GRAD' for Op(expand_grad)" + " must be greater than or equal to 1, but " + "the value received is %d.", + dims)); PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, platform::errors::InvalidArgument( "The number of dimensions of the input 'Out@GRAD' " diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index 292f706cb186b..6aeea745911aa 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/expand_v2_op.h" + #include #include #include diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc index c9fe19fd091da..c64bdabf59964 100644 --- a/paddle/fluid/operators/expand_v2_op_npu.cc +++ b/paddle/fluid/operators/expand_v2_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/expand_v2_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/expand_v2_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/expand_v2_op_xpu.cc b/paddle/fluid/operators/expand_v2_op_xpu.cc index cb2165c4e922e..3d010c964bcfd 100644 --- a/paddle/fluid/operators/expand_v2_op_xpu.cc +++ b/paddle/fluid/operators/expand_v2_op_xpu.cc @@ -13,8 +13,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/expand_v2_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/expand_v2_op.h" namespace paddle { namespace operators { @@ -110,10 +110,11 @@ class ExpandV2XPUKernel : public framework::OpKernel { r = xpu::broadcast(dev_ctx.x_context(), x_data, out_data, x_shape, out_shape); } - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(broadcast) return wrong " - "value[%d %s] in ExpandV2XPUKernel.", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(broadcast) return wrong " + "value[%d %s] in ExpandV2XPUKernel.", + r, XPUAPIErrorMsg[r])); } }; diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index 8172f441e64a4..5a3a1cf53deb1 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fake_dequantize_op.h" + #include #include + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fake_dequantize_op.cu.h b/paddle/fluid/operators/fake_dequantize_op.cu.h index 9859dd4607c15..50f772ec45de8 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu.h +++ b/paddle/fluid/operators/fake_dequantize_op.cu.h @@ -119,10 +119,10 @@ struct ChannelDequantizeFunctor { quant_stride *= in_dims[i]; } - DequantizeOneScaleQuantAxisN< - T><<>>( - in_data, scale_factor, max_range, num, in_dims[quant_axis], - quant_stride, out_data); + DequantizeOneScaleQuantAxisN + <<>>( + in_data, scale_factor, max_range, num, in_dims[quant_axis], + quant_stride, out_data); } else if (scale_num == 2) { // Not need to consider quant_axis int num = in->numel(); diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index aad2c2c7d985a..e623a638922d5 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index ac72f23d46ea8..855c78d2998bd 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fake_quantize_op.h" + #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/transform.h" @@ -832,7 +834,7 @@ REGISTER_OP_VERSION(moving_average_abs_max_scale) "Delete output in order to make the inference model not " "save moving_average_abs_max_scale operator. This will " "make the quantitative model be correctly applied in inference.")) - .AddCheckpoint( - R"ROC(Incompatible upgrade of output [Out])ROC", - paddle::framework::compatible::OpVersionDesc().NewOutput( - "Out", "In order to support dygraph qat, add output again.")); + .AddCheckpoint(R"ROC(Incompatible upgrade of output [Out])ROC", + paddle::framework::compatible::OpVersionDesc().NewOutput( + "Out", + "In order to support dygraph qat, add output again.")); diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index a6130c272d72b..580521183cbdc 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -17,6 +17,7 @@ limitations under the License. */ #endif // PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_ #include + #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/fake_quantize_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -80,10 +81,10 @@ struct FindAbsMaxFunctor { framework::Tensor max; T* max_data = max.mutable_data(phi::make_ddim({grid}), ctx.GetPlace()); - FindAbsMaxKernel<<>>( - in, num, max_data); - FindAbsMaxKernel<<<1, block, 1024 * sizeof(T), ctx.stream()>>>( - max_data, grid, out); + FindAbsMaxKernel + <<>>(in, num, max_data); + FindAbsMaxKernel + <<<1, block, 1024 * sizeof(T), ctx.stream()>>>(max_data, grid, out); } }; @@ -176,9 +177,9 @@ struct FindChannelAbsMaxFunctor { int cout = in_dims[0]; int grid = cout; int block = 1024; - FindChannelAbsMaxKernelQuantAxis0< - T><<>>( - in_data, num, cout, out_abs_max); + FindChannelAbsMaxKernelQuantAxis0 + <<>>(in_data, num, cout, + out_abs_max); } else if (quant_axis == 1) { int cin = in_dims[0]; int cout = in_dims[1]; @@ -193,17 +194,17 @@ struct FindChannelAbsMaxFunctor { for (int i = 0; i < cin / max_threads; i++) { int block = max_threads; - FindChannelAbsMaxKernelQuantAxis1< - T><<>>( - in_data, num, cin, cout, out_abs_max); + FindChannelAbsMaxKernelQuantAxis1 + <<>>( + in_data, num, cin, cout, out_abs_max); in_data += num / cin; } int block = cin % max_threads; if (block > 0) { - FindChannelAbsMaxKernelQuantAxis1< - T><<>>( - in_data, num, in_dims[0], in_dims[1], out_abs_max); + FindChannelAbsMaxKernelQuantAxis1 + <<>>( + in_data, num, in_dims[0], in_dims[1], out_abs_max); } } } @@ -549,16 +550,16 @@ struct ChannelClipFakeQuantDequantFunctor { if (quant_axis == 0) { int grid = in_dims[0]; int block = 1024; - ChannelClipAndQuantDequantKernelQuantAxis0< - T><<>>(in_data, scale_data, bin_cnt, - num, in_dims[0], out_data); + ChannelClipAndQuantDequantKernelQuantAxis0 + <<>>(in_data, scale_data, bin_cnt, num, + in_dims[0], out_data); } else if (quant_axis == 1) { int grid = in_dims[0] * in_dims[1]; int block = 1024; - ChannelClipAndQuantDequantKernelQuantAxis1< - T><<>>( - in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data); + ChannelClipAndQuantDequantKernelQuantAxis1 + <<>>(in_data, scale_data, bin_cnt, num, + in_dims[0], in_dims[1], out_data); } } }; diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index dc3f081cc9eab..182db11ed847d 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 6e646f0d4bf26..68ef8f3c2be11 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fc_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index 47c7128603587..1c76c2c36b84e 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/fc_functor.h" diff --git a/paddle/fluid/operators/fill_any_like_op_xpu.cc b/paddle/fluid/operators/fill_any_like_op_xpu.cc index ec4ba6e926c41..a07fbe5a7a550 100644 --- a/paddle/fluid/operators/fill_any_like_op_xpu.cc +++ b/paddle/fluid/operators/fill_any_like_op_xpu.cc @@ -15,7 +15,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/framework/op_registry.h" - #include "paddle/phi/kernels/full_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 07593a70f05b7..d6726b99813e6 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fill_constant_op.h" + #include + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index edd8613ba525d..a121eb8cc841b 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -84,9 +84,10 @@ class FillConstantNPUKernel : public framework::OpKernel { const auto &dev_ctx = ctx.template device_context(); auto op_func = [&shape, &value]( - const std::vector &inputs, const std::vector &outputs, - const NPUAttributeMap &attrs, - const platform::NPUDeviceContext &dev_ctx) { + const std::vector &inputs, + const std::vector &outputs, + const NPUAttributeMap &attrs, + const platform::NPUDeviceContext &dev_ctx) { Tensor tensor_value; tensor_value.mutable_data({1}, dev_ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_value, diff --git a/paddle/fluid/operators/fill_diagonal_tensor_op.h b/paddle/fluid/operators/fill_diagonal_tensor_op.h index ebb980b66af85..5bee72f526815 100644 --- a/paddle/fluid/operators/fill_diagonal_tensor_op.h +++ b/paddle/fluid/operators/fill_diagonal_tensor_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc index 521ddd4ec12b3..e934b794f8ba7 100644 --- a/paddle/fluid/operators/fill_op.cc +++ b/paddle/fluid/operators/fill_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fill_op.h" + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fill_op.h b/paddle/fluid/operators/fill_op.h index c5cbffbf5c695..7f7e0f2b31aa0 100644 --- a/paddle/fluid/operators/fill_op.h +++ b/paddle/fluid/operators/fill_op.h @@ -14,9 +14,9 @@ limitations under the License. */ #pragma once +#include #include -#include #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc index 2d340829332c8..518d8414c5092 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fill_zeros_like_op.h" + #include "paddle/fluid/platform/complex.h" namespace paddle { diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index 4cb0887c1f326..91809b8cd11bd 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fill_zeros_like_op.h" + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/filter_by_instag_op.cc b/paddle/fluid/operators/filter_by_instag_op.cc index 02ea2d59ae307..cb1e3083320b4 100644 --- a/paddle/fluid/operators/filter_by_instag_op.cc +++ b/paddle/fluid/operators/filter_by_instag_op.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/operators/filter_by_instag_op.h" #include + #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/var_type_inference.h" diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu index 7870efba4e7a1..75680a61b30eb 100644 --- a/paddle/fluid/operators/filter_by_instag_op.cu +++ b/paddle/fluid/operators/filter_by_instag_op.cu @@ -20,6 +20,7 @@ #include #include + #include #include #include @@ -30,11 +31,10 @@ #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/filter_by_instag_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/operators/filter_by_instag_op.h" - #if defined(PADDLE_WITH_CUDA) namespace cg = cooperative_groups; #endif @@ -277,7 +277,7 @@ __global__ void filter_copy_fuse_kernel( T* dst = out_data + output_start_idx * x1_embed_size; const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; - for (const T *j = src_start; j != src_end; dst++, j++) { + for (const T* j = src_start; j != src_end; dst++, j++) { *dst = *j; } } @@ -306,7 +306,7 @@ __global__ void copy_grad_kernel(const size_t N, const int ins_per_thread, const T* src_end = out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size; - for (const T *j = src_start; j != src_end; dst++, j++) { + for (const T* j = src_start; j != src_end; dst++, j++) { *dst = *j; } } diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index 3abc980ceaafc..6172fef9b4bba 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/mixed_vector.h" diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index d1ac573b84461..2e767c3705188 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/flatten_op.h" + #include #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index cacd30cad8a94..6a91cd8b9414a 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/flip_op.cc b/paddle/fluid/operators/flip_op.cc index e1ee1a86a2f96..b00cbf5c4fc26 100644 --- a/paddle/fluid/operators/flip_op.cc +++ b/paddle/fluid/operators/flip_op.cc @@ -93,10 +93,9 @@ REGISTER_OPERATOR(flip, ops::FlipOp, ops::FlipOpMaker, ops::FlipOpInferVarType, FlipInferShapeFunctor); /* ========================== register checkpoint ===========================*/ -REGISTER_OP_VERSION(flip) - .AddCheckpoint( - R"ROC(Upgrade flip, add new attr [axis] and delete attr [dims].)ROC", - paddle::framework::compatible::OpVersionDesc() - .NewAttr("axis", "The added attr 'axis' doesn't set default value.", - paddle::none) - .DeleteAttr("dims", "The attr 'dims' is deleted.")); +REGISTER_OP_VERSION(flip).AddCheckpoint( + R"ROC(Upgrade flip, add new attr [axis] and delete attr [dims].)ROC", + paddle::framework::compatible::OpVersionDesc() + .NewAttr("axis", "The added attr 'axis' doesn't set default value.", + paddle::none) + .DeleteAttr("dims", "The attr 'dims' is deleted.")); diff --git a/paddle/fluid/operators/fold_op.h b/paddle/fluid/operators/fold_op.h index c0aa47a0b4fcc..fd1a7558b7127 100644 --- a/paddle/fluid/operators/fold_op.h +++ b/paddle/fluid/operators/fold_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/frame_op.cc b/paddle/fluid/operators/frame_op.cc index 2ff9beb36f284..00c98cae10e1c 100644 --- a/paddle/fluid/operators/frame_op.cc +++ b/paddle/fluid/operators/frame_op.cc @@ -33,10 +33,11 @@ class FrameOp : public framework::OperatorWithKernel { const int x_rank = x_dims.size(); PADDLE_ENFORCE_GE( - x_rank, 1, platform::errors::InvalidArgument( - "Input(X) of FrameOp should be a tensor which contains " - "at least 1 dimension, but got rank %s.", - x_rank)); + x_rank, 1, + platform::errors::InvalidArgument( + "Input(X) of FrameOp should be a tensor which contains " + "at least 1 dimension, but got rank %s.", + x_rank)); PADDLE_ENFORCE_GT(hop_length, 0, platform::errors::InvalidArgument( "Attribute(hop_length) of FrameOp should be greater " diff --git a/paddle/fluid/operators/fsp_op.cc b/paddle/fluid/operators/fsp_op.cc index f00ec6a1e140c..16ce2b43bf4e1 100644 --- a/paddle/fluid/operators/fsp_op.cc +++ b/paddle/fluid/operators/fsp_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fsp_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index e23891d899de6..4ffb96d3c51bc 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -1,97 +1,149 @@ include(operators) if(WITH_UNITY_BUILD) - # Load Unity Build rules for operators in paddle/fluid/operators/fused. - include(unity_build_rule.cmake) + # Load Unity Build rules for operators in paddle/fluid/operators/fused. + include(unity_build_rule.cmake) endif() -register_operators(EXCLUDES - fused_bn_activation_op - conv_fusion_op - fusion_transpose_flatten_concat_op - fusion_conv_inception_op - fused_fc_elementwise_layernorm_op - multihead_matmul_op - skip_layernorm_op - yolo_box_head_op - yolo_box_post_op - fused_embedding_eltwise_layernorm_op - fusion_group_op - fusion_gru_op - fusion_lstm_op - fused_bn_add_activation_op - fused_attention_op - fused_transformer_op - fused_feedforward_op - fused_multi_transformer_op - fused_bias_dropout_residual_layer_norm_op - resnet_unit_op - fused_gemm_epilogue_op - fused_gate_attention_op) +register_operators( + EXCLUDES + fused_bn_activation_op + conv_fusion_op + fusion_transpose_flatten_concat_op + fusion_conv_inception_op + fused_fc_elementwise_layernorm_op + multihead_matmul_op + skip_layernorm_op + yolo_box_head_op + yolo_box_post_op + fused_embedding_eltwise_layernorm_op + fusion_group_op + fusion_gru_op + fusion_lstm_op + fused_bn_add_activation_op + fused_attention_op + fused_transformer_op + fused_feedforward_op + fused_multi_transformer_op + fused_bias_dropout_residual_layer_norm_op + resnet_unit_op + fused_gemm_epilogue_op + fused_gate_attention_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) op_library(fusion_lstm_op) +if(WITH_GPU OR WITH_ROCM) + # fused_bn_activation_op needs cudnn 7.4.1 above + # HIP not support bn act fuse in MIOPEN + if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) + op_library(fused_bn_activation_op) + endif() + # conv_fusion_op needs cudnn 7 above + if(NOT ${CUDNN_VERSION} VERSION_LESS 7100) + op_library(conv_fusion_op) + endif() + # fusion_transpose_flatten_concat_op + # HIP not support cudnnTransformTensor + if(NOT WITH_ROCM) + op_library(fusion_transpose_flatten_concat_op) + endif() + # fusion_conv_inception_op needs cudnn 7 above + # HIP not support cudnnConvolutionBiasActivationForward + if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100)) + op_library(fusion_conv_inception_op) + endif() + # fused_fc_elementwise_layernorm_op + op_library(fused_fc_elementwise_layernorm_op) + # multihead_matmul_op + op_library(multihead_matmul_op) + op_library(skip_layernorm_op) + op_library(yolo_box_head_op) + op_library(yolo_box_post_op) + op_library(fused_embedding_eltwise_layernorm_op) + op_library(fused_gate_attention_op) + # fusion_group + if(NOT APPLE AND NOT WIN32) + op_library(fusion_group_op DEPS device_code) + cc_test( + test_fusion_group_op + SRCS fusion_group_op_test.cc + DEPS fusion_group_op) + endif() + # fused_bn_add_activation + # HIP not support bn act fuse in MIOPEN + if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) + op_library(fused_bn_add_activation_op) + endif() + # fused_dropout + # only support CUDA + if(NOT WITH_ROCM) + nv_test( + test_fused_residual_dropout_bias + SRCS fused_residual_dropout_bias_test.cu + DEPS tensor + op_registry + dropout_op + layer_norm_op + device_context + generator + memory) + nv_test( + test_fused_dropout_act_bias + SRCS fused_dropout_act_bias_test.cu + DEPS tensor + op_registry + dropout_op + layer_norm_op + device_context + generator + memory) + nv_test( + test_fused_layernorm_residual_dropout_bias + SRCS fused_layernorm_residual_dropout_bias_test.cu + DEPS tensor + op_registry + dropout_op + layer_norm_op + device_context + generator + memory) -if (WITH_GPU OR WITH_ROCM) - # fused_bn_activation_op needs cudnn 7.4.1 above - # HIP not support bn act fuse in MIOPEN - if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) - op_library(fused_bn_activation_op) - endif() - # conv_fusion_op needs cudnn 7 above - if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) - op_library(conv_fusion_op) - endif() - # fusion_transpose_flatten_concat_op - # HIP not support cudnnTransformTensor - if(NOT WITH_ROCM) - op_library(fusion_transpose_flatten_concat_op) - endif() - # fusion_conv_inception_op needs cudnn 7 above - # HIP not support cudnnConvolutionBiasActivationForward - if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100)) - op_library(fusion_conv_inception_op) - endif() - # fused_fc_elementwise_layernorm_op - op_library(fused_fc_elementwise_layernorm_op) - # multihead_matmul_op - op_library(multihead_matmul_op) - op_library(skip_layernorm_op) - op_library(yolo_box_head_op) - op_library(yolo_box_post_op) - op_library(fused_embedding_eltwise_layernorm_op) - op_library(fused_gate_attention_op) - # fusion_group - if(NOT APPLE AND NOT WIN32) - op_library(fusion_group_op DEPS device_code) - cc_test(test_fusion_group_op SRCS fusion_group_op_test.cc DEPS fusion_group_op) - endif() - # fused_bn_add_activation - # HIP not support bn act fuse in MIOPEN - if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) - op_library(fused_bn_add_activation_op) - endif() - # fused_dropout - # only support CUDA - if(NOT WITH_ROCM) - nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) - nv_test(test_fused_dropout_act_bias SRCS fused_dropout_act_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) - nv_test(test_fused_layernorm_residual_dropout_bias SRCS fused_layernorm_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) + op_library(fused_feedforward_op) + # fused_attention_op + op_library(fused_attention_op) + op_library(fused_multi_transformer_op) + op_library(fused_bias_dropout_residual_layer_norm_op) + endif() + # resnet_unit needs cudnn 8.0 above + if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) + op_library(resnet_unit_op) + cc_test( + test_cudnn_norm_conv + SRCS cudnn_norm_conv_test.cc + DEPS conv_op + blas + im2col + vol2col + depthwise_conv + eigen_function + tensor + op_registry + device_context + generator + memory) + cc_test( + test_cudnn_bn_add_relu + SRCS cudnn_bn_add_relu_test.cc + DEPS batch_norm_op + fused_bn_add_activation_op + tensor + op_registry + device_context + generator + memory) + endif() - op_library(fused_feedforward_op) - # fused_attention_op - op_library(fused_attention_op) - op_library(fused_multi_transformer_op) - op_library(fused_bias_dropout_residual_layer_norm_op) - endif() - # resnet_unit needs cudnn 8.0 above - if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) - op_library(resnet_unit_op) - cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory) - cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory) - endif() - - if (CUDA_VERSION GREATER_EQUAL 11.6) - op_library(fused_gemm_epilogue_op) - endif() + if(CUDA_VERSION GREATER_EQUAL 11.6) + op_library(fused_gemm_epilogue_op) + endif() endif() diff --git a/paddle/fluid/operators/fused/attention_layer_norm.h b/paddle/fluid/operators/fused/attention_layer_norm.h index 43491a9faf18c..b960b83597973 100644 --- a/paddle/fluid/operators/fused/attention_layer_norm.h +++ b/paddle/fluid/operators/fused/attention_layer_norm.h @@ -38,11 +38,10 @@ class AttnLayerNorm { auto stream = dev_ctx_.stream(); switch (GetDesiredBlockDim(feature_size_)) { - FIXED_BLOCK_DIM_CASE( - LayerNormForward, - kBlockDim><<>>( - x_data, scale_data, bias_data, y_data, mean_data, var_data, - epsilon_, feature_size_)); + FIXED_BLOCK_DIM_CASE(LayerNormForward, kBlockDim> + <<>>( + x_data, scale_data, bias_data, y_data, mean_data, + var_data, epsilon_, feature_size_)); default: PADDLE_THROW(platform::errors::InvalidArgument( "Feature_size must be larger than 1")); diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index b059223eaf6e7..feac0f7953027 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -120,24 +120,24 @@ void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n, auto stream = ctx.stream(); switch (vec_size) { case 4: { - BroadcastKernelBinary<<>>( - in0, in1, out, use_broadcast, numel, configlists, main_tid, tail_tid, - func); + BroadcastKernelBinary + <<>>(in0, in1, out, use_broadcast, numel, + configlists, main_tid, tail_tid, + func); break; } case 2: { - BroadcastKernelBinary<<>>( - in0, in1, out, use_broadcast, numel, configlists, main_tid, tail_tid, - func); + BroadcastKernelBinary + <<>>(in0, in1, out, use_broadcast, numel, + configlists, main_tid, tail_tid, + func); break; } case 1: { - BroadcastKernelBinary<<>>( - in0, in1, out, use_broadcast, numel, configlists, main_tid, tail_tid, - func); + BroadcastKernelBinary + <<>>(in0, in1, out, use_broadcast, numel, + configlists, main_tid, tail_tid, + func); break; } default: { @@ -176,8 +176,8 @@ void Launch1DColumnReduce(gpuStream_t stream, const int max_threads, const int block = 256; const int max_blocks = std::max(max_threads / block, 1); const int grid = std::min(left_num, max_blocks); - Compute1DColumnReduceKernel<<>>( - reduce_num, left_num, d_out, d_bias); + Compute1DColumnReduceKernel + <<>>(reduce_num, left_num, d_out, d_bias); } void SetConfigForColumnReduce(const int max_threads, const int reduce_num, @@ -273,8 +273,8 @@ void Launch2DColumnReduce(const platform::CUDADeviceContext& dev_ctx, const auto& stream = dev_ctx.stream(); if (!should_reduce_again) { - BiasAddBwSinglePassKernel<<>>(d_out, reduce_num, - left_num, d_bias); + BiasAddBwSinglePassKernel + <<>>(d_out, reduce_num, left_num, d_bias); } else { framework::Tensor tmp_sum; tmp_sum.Resize({grid.y, left_num}); diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index 304aad16ad0c6..a85b2f99bb157 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -14,12 +14,10 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" - #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc index 671e94061cb5c..490d92880c9a8 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cc +++ b/paddle/fluid/operators/fused/conv_fusion_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 8191c85f2a120..9ca9f8aaf743f 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 516b10fa021c1..09fa3a247e64b 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -182,19 +182,20 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, std::string data_layout = "NHWC"; attrs.insert({"data_layout", data_layout}); - auto op = framework::OpRegistry::CreateOp( - "batch_norm", {{"X", {"X"}}, - {"Scale", {"Scale"}}, - {"Bias", {"Bias"}}, - {"Mean", {"Mean"}}, - {"Variance", {"Variance"}}}, - {{"Y", {"Y"}}, - {"MeanOut", {"Mean"}}, - {"VarianceOut", {"Variance"}}, - {"SavedMean", {"SavedMean"}}, - {"SavedVariance", {"SavedVariance"}}, - {"ReserveSpace", {"ReserveSpace"}}}, - attrs); + auto op = + framework::OpRegistry::CreateOp("batch_norm", + {{"X", {"X"}}, + {"Scale", {"Scale"}}, + {"Bias", {"Bias"}}, + {"Mean", {"Mean"}}, + {"Variance", {"Variance"}}}, + {{"Y", {"Y"}}, + {"MeanOut", {"Mean"}}, + {"VarianceOut", {"Variance"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + attrs); op->Run(scope, ctx.GetPlace()); paddle::framework::TensorCopySync(*y, platform::CPUPlace(), cpu_y); @@ -314,8 +315,9 @@ void ComputeFusedBNAddReluBackward( attrs.insert({"epsilon", epsilon}); attrs.insert({"act_type", act_type}); - auto op = framework::OpRegistry::CreateOp( - "fused_bn_add_activation_grad", {{"X", {"X"}}, + auto op = + framework::OpRegistry::CreateOp("fused_bn_add_activation_grad", + {{"X", {"X"}}, {"Y", {"Y"}}, {"Y@GRAD", {"Y@GRAD"}}, {"Scale", {"Scale"}}, @@ -323,11 +325,11 @@ void ComputeFusedBNAddReluBackward( {"SavedMean", {"SavedMean"}}, {"SavedVariance", {"SavedVariance"}}, {"ReserveSpace", {"ReserveSpace"}}}, - {{"X@GRAD", {"X@GRAD"}}, - {"Z@GRAD", {"Z@GRAD"}}, - {"Scale@GRAD", {"Scale@GRAD"}}, - {"Bias@GRAD", {"Bias@GRAD"}}}, - attrs); + {{"X@GRAD", {"X@GRAD"}}, + {"Z@GRAD", {"Z@GRAD"}}, + {"Scale@GRAD", {"Scale@GRAD"}}, + {"Bias@GRAD", {"Bias@GRAD"}}}, + attrs); op->Run(scope, ctx.GetPlace()); paddle::framework::TensorCopySync(*dx, platform::CPUPlace(), cpu_dx); diff --git a/paddle/fluid/operators/fused/cudnn_fusion_helper.h b/paddle/fluid/operators/fused/cudnn_fusion_helper.h index 13fad0b7cbb3d..a8f700c21199f 100644 --- a/paddle/fluid/operators/fused/cudnn_fusion_helper.h +++ b/paddle/fluid/operators/fused/cudnn_fusion_helper.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 5881322007add..f4443bba3fdb2 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -167,9 +167,10 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, attrs.insert({"workspace_size_MB", 512}); auto op = framework::OpRegistry::CreateOp( - "conv2d_grad", {{"Input", {"Input"}}, - {"Filter", {"Filter"}}, - {"Output@GRAD", {"Output@GRAD"}}}, + "conv2d_grad", + {{"Input", {"Input"}}, + {"Filter", {"Filter"}}, + {"Output@GRAD", {"Output@GRAD"}}}, {{"Input@GRAD", {"Input@GRAD"}}, {"Filter@GRAD", {"Filter@GRAD"}}}, attrs); op->Run(scope, ctx.GetPlace()); diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 38f9aff226ea9..ce95b0a320c66 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -186,8 +186,9 @@ class FMHARef { if (dropout_param_.dropout_prob_) { DropoutFwGPUKernelDriver( static_cast(dev_ctx_), - dropout_param_.is_test_, static_cast( - dropout_param_.dropout_implementation_), + dropout_param_.is_test_, + static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_, dropout_param_.is_fix_seed_, dropout_param_.seed_val_, static_cast(*softmax_out_tensor), dropout_param_.seed_, diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index a1adec9641a6e..06ede8e2c7bdd 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -88,12 +89,13 @@ class FusedAttentionOp : public framework::OperatorWithKernel { // y: qkv's weight: [3, num_head, dim_head, dim_embed] auto x_dim = ctx->GetInputDim("X"); auto y_dim = ctx->GetInputDim("QKVW"); - PADDLE_ENFORCE_EQ(x_dim.size(), 3, platform::errors::InvalidArgument( - "The dimensions of x must be 3" - "(batch_size, seq_len, dim_embed)," - "but received dimensions of" - "Input is [%d]", - x_dim.size())); + PADDLE_ENFORCE_EQ( + x_dim.size(), 3, + platform::errors::InvalidArgument("The dimensions of x must be 3" + "(batch_size, seq_len, dim_embed)," + "but received dimensions of" + "Input is [%d]", + x_dim.size())); PADDLE_ENFORCE_EQ(y_dim.size(), 4, platform::errors::InvalidArgument( "The dimensions of qkv_weight must be 4" diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index f25bd53992894..73fdd29fd62c3 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -13,21 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/fused/attention_layer_norm.h" +#include "paddle/fluid/operators/fused/attn_gemm.h" +#include "paddle/fluid/operators/fused/fmha_ref.h" +#include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/operators/fused/attention_layer_norm.h" -#include "paddle/fluid/operators/fused/attn_gemm.h" -#include "paddle/fluid/operators/fused/fmha_ref.h" -#include "paddle/fluid/operators/fused/fused_dropout_helper.h" - #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -463,11 +463,13 @@ class FusedAttentionGradKernel : public framework::OpKernel { auto *bias_dropout_residual_out_data = bias_dropout_residual_out->data(); auto *d_ln_2_scale_data = - (d_ln_2_scale == nullptr ? nullptr : d_ln_2_scale->mutable_data( - ctx.GetPlace())); + (d_ln_2_scale == nullptr + ? nullptr + : d_ln_2_scale->mutable_data(ctx.GetPlace())); auto *d_ln_2_bias_data = - (d_ln_2_bias == nullptr ? nullptr : d_ln_2_bias->mutable_data( - ctx.GetPlace())); + (d_ln_2_bias == nullptr + ? nullptr + : d_ln_2_bias->mutable_data(ctx.GetPlace())); auto *d_bias_dropout_residual_out_data = d_bias_dropout_residual_out->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc index 781f51d70ec66..56f9afdbe9090 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu index 71a2c9728cc6b..35a48611a74f1 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc index 1b3521f14962a..464856003f03f 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fused_bn_activation_op.h" + #include #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_registry.h" @@ -70,20 +72,22 @@ void FusedBatchNormActOp::InferShape(framework::InferShapeContext *ctx) const { const auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::PreconditionNotMet( - "ShapeError: the dimension of input " - "X must greater than or equal to 2." - "But received: the shape of input X " - "= [%s], the dimension of input X =" - "[%d]", - x_dims, x_dims.size())); - PADDLE_ENFORCE_LE(x_dims.size(), 5, platform::errors::PreconditionNotMet( - "ShapeError: the dimension of input " - "X must smaller than or equal to 5." - "But received: the shape of input X " - "= [%s], the dimension of input X =" - "[%d]", - x_dims, x_dims.size())); + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::PreconditionNotMet("ShapeError: the dimension of input " + "X must greater than or equal to 2." + "But received: the shape of input X " + "= [%s], the dimension of input X =" + "[%d]", + x_dims, x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), 5, + platform::errors::PreconditionNotMet("ShapeError: the dimension of input " + "X must smaller than or equal to 5." + "But received: the shape of input X " + "= [%s], the dimension of input X =" + "[%d]", + x_dims, x_dims.size())); const int64_t C = x_dims[x_dims.size() - 1]; @@ -140,22 +144,26 @@ framework::OpKernelType FusedBatchNormActOp::GetExpectedKernelType( if (input_data_type == framework::proto::VarType::FP64) { bn_param_type = framework::proto::VarType::FP64; } - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Scale")->dtype()), - platform::errors::PreconditionNotMet( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Bias")->dtype()), - platform::errors::PreconditionNotMet( - "Bias input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Mean")->dtype()), - platform::errors::PreconditionNotMet( - "Mean input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Variance")->dtype()), - platform::errors::PreconditionNotMet( - "Variance input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Scale")->dtype()), + platform::errors::PreconditionNotMet( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Bias")->dtype()), + platform::errors::PreconditionNotMet( + "Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Mean")->dtype()), + platform::errors::PreconditionNotMet( + "Mean input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Variance")->dtype()), + platform::errors::PreconditionNotMet( + "Variance input should be of float type")); framework::LibraryType library = framework::LibraryType::kPlain; framework::DataLayout layout = framework::DataLayout::kAnyLayout; diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index 9e709c9a01a1c..0ebe21dfc6059 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -16,6 +16,7 @@ #include #include #include + #include "cub/cub.cuh" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/activation_op.h" @@ -181,8 +182,9 @@ class FusedBatchNormActKernel ctx.GetPlace()), variance_out->template mutable_data>( ctx.GetPlace()), - epsilon, saved_mean->template mutable_data>( - ctx.GetPlace()), + epsilon, + saved_mean->template mutable_data>( + ctx.GetPlace()), saved_variance->template mutable_data>( ctx.GetPlace()), activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr, @@ -343,10 +345,12 @@ class FusedBatchNormActGradKernel /*dBnScaleBiasDesc=*/bn_param_desc_, /*bnScaleData=*/scale->template data>(), /*bnBiasData=*/bias->template data>(), - /*dBnScaleData=*/d_scale - ->template mutable_data>(ctx.GetPlace()), - /*dBnBiasData=*/d_bias - ->template mutable_data>(ctx.GetPlace()), + /*dBnScaleData=*/ + d_scale->template mutable_data>( + ctx.GetPlace()), + /*dBnBiasData=*/ + d_bias->template mutable_data>( + ctx.GetPlace()), /*epsilon=*/epsilon, /*savedMean=*/saved_mean_data, /*savedInvVariance=*/saved_var_data, diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.h b/paddle/fluid/operators/fused/fused_bn_activation_op.h index b8404e4c6553f..da9bca4fc22f7 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.h +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc index d667fafb83594..5d06ac19f9e1c 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" + #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -52,20 +54,22 @@ void FusedBatchNormAddActOp::InferShape( "of input X = [%s], and the shape of " "input Y = [%s]", x_dims, z_dims)); - PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::InvalidArgument( - "ShapeError: the dimensions of input " - "must greater than or equal to 2." - "But received: the shape of input " - "= [%s], the dimension of input = " - "[%d]", - x_dims, x_dims.size())); - PADDLE_ENFORCE_LE(x_dims.size(), 5, platform::errors::InvalidArgument( - "ShapeError: the dimensions of input " - "must smaller than or equal to 5." - "But received: the shape of input " - "= [%s], the dimension of input = " - "[%d]", - x_dims, x_dims.size())); + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::InvalidArgument("ShapeError: the dimensions of input " + "must greater than or equal to 2." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), 5, + platform::errors::InvalidArgument("ShapeError: the dimensions of input " + "must smaller than or equal to 5." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, x_dims.size())); const int64_t C = x_dims[x_dims.size() - 1]; diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index 421c1bacb6633..2f7fc6160122d 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" @@ -160,8 +161,9 @@ class FusedBatchNormAddActKernel ctx.GetPlace()), variance_out->template mutable_data>( ctx.GetPlace()), - epsilon, saved_mean->template mutable_data>( - ctx.GetPlace()), + epsilon, + saved_mean->template mutable_data>( + ctx.GetPlace()), saved_variance->template mutable_data>( ctx.GetPlace()), activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr, diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h index d5e5ae9bda642..07d2e4564b692 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h old mode 100755 new mode 100644 index 9f5a1bad047b4..f7af7deff5376 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h @@ -109,15 +109,15 @@ void LaunchDropoutActBias(Functor act_functor, const uint64_t seed, const int real_vec_size = cols % VecSize == 0 ? VecSize : 1; const auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size); if (cols % VecSize == 0) { - FusedDropoutActBias<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - act_functor, seed, rows, cols, increment, dropout_prob, - is_upscale_in_train, is_test, src, bias, dst, mask_data); + FusedDropoutActBias + <<>>( + act_functor, seed, rows, cols, increment, dropout_prob, + is_upscale_in_train, is_test, src, bias, dst, mask_data); } else { - FusedDropoutActBias<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - act_functor, seed, rows, cols, increment, dropout_prob, - is_upscale_in_train, is_test, src, bias, dst, mask_data); + FusedDropoutActBias + <<>>( + act_functor, seed, rows, cols, increment, dropout_prob, + is_upscale_in_train, is_test, src, bias, dst, mask_data); } } @@ -231,28 +231,28 @@ void LaunchDropoutActBiasGrad(Functor act_functor, const T *dout, dim3 block_dim(threads, 128, 1); dim3 grid_dim(blocks, 1, 1); if (cols % VecSize == 0) { - FusedDropoutActBiasGrad< - T, MaskType, 8, 128, VecSize, - Functor><<>>( - act_functor, dout, mask, src, bias, factor, rows, cols, dx, dbias); + FusedDropoutActBiasGrad + <<>>(act_functor, dout, mask, + src, bias, factor, rows, + cols, dx, dbias); } else { - FusedDropoutActBiasGrad< - T, MaskType, 8, 128, 1, - Functor><<>>( - act_functor, dout, mask, src, bias, factor, rows, cols, dx, dbias); + FusedDropoutActBiasGrad + <<>>(act_functor, dout, mask, + src, bias, factor, rows, + cols, dx, dbias); } } else { const uint64_t n = rows * cols; platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx, n / real_vec_size); if (n % VecSize == 0) { - FusedDropoutActGrad<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - act_functor, dout, mask, src, factor, n, dx); + FusedDropoutActGrad + <<>>( + act_functor, dout, mask, src, factor, n, dx); } else { - FusedDropoutActGrad<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - act_functor, dout, mask, src, factor, n, dx); + FusedDropoutActGrad + <<>>( + act_functor, dout, mask, src, factor, n, dx); } } } diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index c352f08ec2ba7..6dc1c446bd7d5 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -30,7 +30,7 @@ namespace operators { * The DropoutParam will be used in the fused_dropout_act_bias, * fused_residual_dropout_bias(pre_layer_norm=ture) or * fused_layernorm_residual_dropout_bias(pre_layer_norm=false). -*/ + */ struct DropoutParam { uint64_t seed; float dropout_prob; @@ -232,8 +232,8 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { using U = LayerNormParamType; switch (GetDesiredBlockDim(this->cols_)) { FIXED_BLOCK_DIM_CASE( - LayerNormForward< - T, U, kBlockDim><<rows_, kBlockDim, 0, ctx.stream()>>>( + LayerNormForward + <<rows_, kBlockDim, 0, ctx.stream()>>>( src, gamma, beta, out, mean, variance, epsilon_, this->cols_)); } } diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc index 3e69bf0806756..a43562b297228 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fused_elemwise_activation_op.h" + #include #include diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h index 5404cdeab01e0..3ce54968355a5 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" @@ -412,8 +413,9 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto in_y = ctx.Input("Y"); - PADDLE_ENFORCE_NE(in_y, nullptr, platform::errors::InvalidArgument( - "Input(Y) should not be nullptr.")); + PADDLE_ENFORCE_NE( + in_y, nullptr, + platform::errors::InvalidArgument("Input(Y) should not be nullptr.")); auto in_out = ctx.Input("Out"); PADDLE_ENFORCE_NE( in_out, nullptr, @@ -449,15 +451,17 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel { " so the number of 'Out' should be two.")); } else { if (!InputXCanBeAbsent(functor_list)) { - PADDLE_ENFORCE_NE(in_x, nullptr, platform::errors::InvalidArgument( - "Input(X) should not be null.")); + PADDLE_ENFORCE_NE( + in_x, nullptr, + platform::errors::InvalidArgument("Input(X) should not be null.")); } } // Get in_x if (ctx.HasInput("X")) { - PADDLE_ENFORCE_NE(in_x, nullptr, platform::errors::InvalidArgument( - "Input(X) should not be null.")); + PADDLE_ENFORCE_NE( + in_x, nullptr, + platform::errors::InvalidArgument("Input(X) should not be null.")); } else { // If functor_list contains elementwise_add, the backward doesn't use // in_x, in_y and in_out. diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc index 6746b3b8e8489..951189269c748 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 13f1c6808aef2..f0cb2edb670ec 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -13,7 +13,9 @@ // limitations under the License. #include + #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 7308f30779248..625bfe36e3864 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h" + #include + #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/cpu_vec.h" @@ -100,10 +102,11 @@ void FusedEmbeddingFCLSTMOp::InferShape( platform::errors::InvalidArgument( "The rank of Input(Bias) should be 2, but received value is:%d.", b_dims.size())); - PADDLE_ENFORCE_EQ(b_dims[0], 1, platform::errors::InvalidArgument( - "The first dimension of Input(Bias) " - "should be 1, but received value is:%d.", - b_dims[0])); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + platform::errors::InvalidArgument( + "The first dimension of Input(Bias) " + "should be 1, but received value is:%d.", + b_dims[0])); PADDLE_ENFORCE_EQ( b_dims[1], (ctx->Attrs().Get("use_peepholes") ? 7 : 4) * frame_size, platform::errors::InvalidArgument( @@ -237,21 +240,21 @@ This operator fuse the X into LSTM, more details can refer to LSTM op. template class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { public: -#define INIT_VEC_FUNC \ - std::function act_gate, act_cell, act_cand; \ - auto& act_gate_str = ctx.Attr("gate_activation"); \ - auto& act_cell_str = ctx.Attr("cell_activation"); \ - auto& act_cand_str = ctx.Attr("candidate_activation"); \ - if (platform::MayIUse(platform::avx)) { \ - phi::funcs::VecActivations act_functor; \ - act_gate = act_functor(act_gate_str); \ - act_cell = act_functor(act_cell_str); \ - act_cand = act_functor(act_cand_str); \ - } else { \ - phi::funcs::VecActivations act_functor; \ - act_gate = act_functor(act_gate_str); \ - act_cell = act_functor(act_cell_str); \ - act_cand = act_functor(act_cand_str); \ +#define INIT_VEC_FUNC \ + std::function act_gate, act_cell, act_cand; \ + auto& act_gate_str = ctx.Attr("gate_activation"); \ + auto& act_cell_str = ctx.Attr("cell_activation"); \ + auto& act_cand_str = ctx.Attr("candidate_activation"); \ + if (platform::MayIUse(platform::avx)) { \ + phi::funcs::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_cell = act_functor(act_cell_str); \ + act_cand = act_functor(act_cand_str); \ + } else { \ + phi::funcs::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_cell = act_functor(act_cell_str); \ + act_cand = act_functor(act_cand_str); \ } #define INIT_BASE_INPUT_OUTPUT \ diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc index ec3a76e316ecd..cb3bf5857750f 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h" + #include + #include "paddle/fluid/framework/var_type_inference.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index 04d3730a77d4d..2c0184fea463e 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -179,22 +179,20 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel { if (with_relu) { switch (platform::RoundToPowerOfTwo(N)) { CUDA_LAUNCH_KERNEL_HELPER( - InplaceAddReluAddLayerNormKernel< - T, true, - kPowerOfTwoDim><<>>( - y_data, bias_0_data, bias_1_data, scale_data, out_data, - mean_data, variance_data, M, N, epsilon)); + InplaceAddReluAddLayerNormKernel + <<>>(y_data, bias_0_data, bias_1_data, scale_data, + out_data, mean_data, variance_data, M, N, + epsilon)); } } else { switch (platform::RoundToPowerOfTwo(N)) { CUDA_LAUNCH_KERNEL_HELPER( - InplaceAddReluAddLayerNormKernel< - T, false, - kPowerOfTwoDim><<>>( - y_data, bias_0_data, bias_1_data, scale_data, out_data, - mean_data, variance_data, M, N, epsilon)); + InplaceAddReluAddLayerNormKernel + <<>>(y_data, bias_0_data, bias_1_data, scale_data, + out_data, mean_data, variance_data, M, N, + epsilon)); } } } diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index 8e15232acda90..d3cc1b9127670 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/matmul_v2_op.h" diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 2eb9885286dab..675ec29da67c8 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -14,11 +14,10 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/matmul_v2_op.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#include "paddle/fluid/operators/matmul_v2_op.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" @@ -387,20 +386,19 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { !pre_layer_norm ? context.Input("Ln2Bias") : nullptr; auto* d_x = context.Output(framework::GradVarName("X")); - auto* d_ln1_scale = pre_layer_norm - ? context.Output( - framework::GradVarName("Ln1Scale")) - : nullptr; - auto* d_ln1_bias = pre_layer_norm - ? context.Output( - framework::GradVarName("Ln1Bias")) - : nullptr; - auto* d_ln2_scale = - pre_layer_norm ? nullptr : context.Output( - framework::GradVarName("Ln2Scale")); - auto* d_ln2_bias = - pre_layer_norm ? nullptr : context.Output( - framework::GradVarName("Ln2Bias")); + auto* d_ln1_scale = pre_layer_norm ? context.Output( + framework::GradVarName("Ln1Scale")) + : nullptr; + auto* d_ln1_bias = pre_layer_norm ? context.Output( + framework::GradVarName("Ln1Bias")) + : nullptr; + auto* d_ln2_scale = pre_layer_norm + ? nullptr + : context.Output( + framework::GradVarName("Ln2Scale")); + auto* d_ln2_bias = pre_layer_norm ? nullptr + : context.Output( + framework::GradVarName("Ln2Bias")); auto* d_linear1_weight = context.Output( framework::GradVarName("Linear1Weight")); auto* d_linear1_bias = context.Output( diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc index ba9dbd82e3dcc..0bbeabd5fc9cb 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index b1badf72557ae..8f375a22cc023 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -374,9 +374,9 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { v_transpose_out, qkv_transpose_out, softmax_out, fmha_out, &config); // 3. Gating Linear - Tensor *fmha_or_gate_out = - !has_gating ? fmha_out : ComputeGatingLinearForward(ctx, config, - query, fmha_out); + Tensor *fmha_or_gate_out = !has_gating ? fmha_out + : ComputeGatingLinearForward( + ctx, config, query, fmha_out); // 4. Output Linear ComputeOutputLinearForward(ctx, config, fmha_or_gate_out); diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc index 4c4e3661e6d6e..978daa3be85e9 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/fused/fused_gemm_epilogue_op.h" + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -208,6 +210,9 @@ class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); + auto trans_x = ctx->Attrs().Get("trans_x"); + auto trans_y = ctx->Attrs().Get("trans_y"); + PADDLE_ENFORCE_GE( dout_dims.size(), 2, platform::errors::InvalidArgument( @@ -242,14 +247,14 @@ class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel { auto x_mat_dims = phi::flatten_to_2d(x_dims, x_dims.size() - 1); PADDLE_ENFORCE_EQ( - dout_mat_dims[1], y_dims[1], + dout_mat_dims[1], trans_y ? y_dims[0] : y_dims[1], platform::errors::InvalidArgument( "The last dimension of DOut should be equal with Y's last" "dimension. But received DOut[-1] = [%d], Y[1] = [%d].", dout_mat_dims[1], y_dims[1])); PADDLE_ENFORCE_EQ( - dout_mat_dims[0], x_mat_dims[0], + dout_mat_dims[0], trans_x ? x_mat_dims[1] : x_mat_dims[0], platform::errors::InvalidArgument( "The first dimension of DOut should be equal with X's first" "dimension. But received DOut[0] = [%d], Y[0] = [%d].", @@ -288,7 +293,7 @@ class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel { if (ctx->HasOutput("DBias")) { std::vector dbias_dims; - dbias_dims.push_back(y_dims[1]); + dbias_dims.push_back(trans_y ? y_dims[0] : y_dims[1]); ctx->SetOutputDim("DBias", phi::make_ddim(dbias_dims)); } } @@ -323,6 +328,20 @@ class FusedGemmEpilogueGradOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("DBias", "The output grad tensor to bias of Out = (Act(X) * Y) + bias.") .AsDispensable(); + AddAttr( + "trans_x", + R"DOC((bool, default false), Whether to transpose input tensor X + or not. The input tensor X coulbe be more than two dimension. When + set trans_x=true, it would fully reverse X. For instant: X with shpae + [d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC") + .SetDefault(false); + AddAttr( + "trans_y", + R"DOC((bool, default false), Whether to transpose input tensor Y + or not. The input tensor Y should be two dimension. When + set trans_y=true, it would transpose Y. For instant: Y with shpae + [d0, d1] -> [d1, d0].)DOC") + .SetDefault(false); AddAttr( "activation_grad", @@ -343,11 +362,39 @@ X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3] } }; +template +class FusedGemmEpilogueOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + const auto& act_type = this->template Attr("activation"); + PADDLE_ENFORCE_EQ( + act_type, "none", + phi::errors::InvalidArgument("The activation should be none.")); + + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Y", this->Input("Y")); + op->SetInput("DOut", this->OutputGrad("Out")); + + op->SetOutput("DX", this->InputGrad("X")); + op->SetOutput("DY", this->InputGrad("Y")); + op->SetOutput("DBias", this->InputGrad("Bias")); + + op->SetAttrMap(this->Attrs()); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(fused_gemm_epilogue, ops::FusedGemmEpilogueOp, - ops::FusedGemmEpilogueOpMaker) +REGISTER_OPERATOR( + fused_gemm_epilogue, ops::FusedGemmEpilogueOp, + ops::FusedGemmEpilogueOpMaker, + ops::FusedGemmEpilogueOpGradMaker, + ops::FusedGemmEpilogueOpGradMaker); REGISTER_OPERATOR(fused_gemm_epilogue_grad, ops::FusedGemmEpilogueGradOp, - ops::FusedGemmEpilogueGradOpMaker) + ops::FusedGemmEpilogueGradOpMaker); diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu index 9bf3d1a485efc..407cd2b974def 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/scope_guard.h" #include "paddle/fluid/operators/fused/fused_gemm_epilogue_op.h" #include "paddle/fluid/platform/dynload/cublasLt.h" #include "paddle/fluid/platform/float16.h" @@ -41,6 +42,8 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { bool trans_y = ctx.Attr("trans_y"); std::string activation = ctx.Attr("activation"); + VLOG(10) << "trans_x = " << trans_x << " , trans_y = " << trans_y + << " , activation = " << activation; bool enable_auxiliary = reserve_space == nullptr ? false : true; out->mutable_data(ctx.GetPlace()); @@ -48,6 +51,7 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { auto x_mat_dims = phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + // (M * K) * (K * N) int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0]; int64_t K = trans_y ? y->dims()[1] : y->dims()[0]; int64_t N = trans_y ? y->dims()[0] : y->dims()[1]; @@ -106,10 +110,11 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { platform::dynload::cublasLtMatmulDescSetAttribute( operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &aux_data, sizeof(aux_data))); + int64_t aux_ld = N; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( - operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N, - sizeof(N))); + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &aux_ld, + sizeof(aux_ld))); } cublasLtMatrixLayout_t x_desc = NULL, y_desc = NULL, out_desc = NULL; @@ -129,8 +134,7 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { &out_desc, mat_type, N, M, N)); cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); - size_t workspace_size = 4 * 1024 * 1024; - + size_t workspace_size = static_cast(4) * 1024 * 1024 * 1024; cudaStream_t stream = dev_ctx.stream(); memory::allocation::AllocationPtr workspace = memory::Alloc(dev_ctx, workspace_size); @@ -149,13 +153,13 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { const auto* y_data = y->data(); const auto* x_data = x->data(); - cublasLtMatmulAlgo_t algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo( + auto algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo( lt_handle, operation_desc, y_desc, x_desc, out_desc, alpha, beta, y_data, x_data, out_data, stream, workspace->ptr(), workspace_size); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( lt_handle, operation_desc, alpha, y_data, y_desc, x_data, x_desc, beta, - out_data, out_desc, out_data, out_desc, &algo, workspace->ptr(), + out_data, out_desc, out_data, out_desc, algo, workspace->ptr(), workspace_size, stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -191,12 +195,94 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { } }; +enum FusedGEMMGradInType { kDX = 0, kDY = 1, kDZ = 2 }; + +template +struct FusedGEMMGradTrait; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradB = FusedGEMMGradInType::kDY; + static constexpr auto kXGradATrans = false; + static constexpr auto kXGradBTrans = true; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDX; + static constexpr auto kYGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradATrans = true; + static constexpr auto kYGradBTrans = false; +}; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDY; + static constexpr auto kXGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradATrans = false; + static constexpr auto kXGradBTrans = true; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDX; + static constexpr auto kYGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradATrans = false; + static constexpr auto kYGradBTrans = false; +}; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradB = FusedGEMMGradInType::kDY; + static constexpr auto kXGradATrans = false; + static constexpr auto kXGradBTrans = false; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradB = FusedGEMMGradInType::kDX; + static constexpr auto kYGradATrans = true; + static constexpr auto kYGradBTrans = false; +}; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDY; + static constexpr auto kXGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradATrans = true; + static constexpr auto kXGradBTrans = true; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradB = FusedGEMMGradInType::kDX; + static constexpr auto kYGradATrans = true; + static constexpr auto kYGradBTrans = true; +}; + +static constexpr auto BoolToCuBlasEnum(bool transpose) { + return transpose ? CUBLAS_OP_T : CUBLAS_OP_N; +} + template class FusedGemmEpilogueGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + bool transpose_x = ctx.Attr("trans_x"); + bool transpose_y = ctx.Attr("trans_y"); + if (transpose_x) { + if (transpose_y) { + ComputeImpl(ctx); + } else { + ComputeImpl(ctx); + } + } else { + if (transpose_y) { + ComputeImpl(ctx); + } else { + ComputeImpl(ctx); + } + } + } + + private: + template + static void ComputeImpl(const framework::ExecutionContext& ctx) { + using Trait = FusedGEMMGradTrait; + auto& dev_ctx = ctx.template device_context(); const Tensor* dout = ctx.Input("DOut"); const Tensor* x = ctx.Input("X"); const Tensor* y = ctx.Input("Y"); @@ -208,13 +294,18 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { std::string activation_grad = ctx.Attr("activation_grad"); - auto dout_mat_dims = - phi::flatten_to_2d(dout->dims(), dout->dims().size() - 1); - auto x_mat_dims = phi::flatten_to_2d(x->dims(), x->dims().size() - 1); + VLOG(10) << "trans_x = " << TransX << " , trans_y = " << TransY + << " , activation_grad = " << activation_grad; + + auto x_mat_dims = + phi::flatten_to_2d(x->dims(), TransX ? 1 : x->dims().size() - 1); + + // (M * K) * (K * N) + int64_t M = TransX ? x_mat_dims[1] : x_mat_dims[0]; + int64_t K = TransY ? y->dims()[1] : y->dims()[0]; + int64_t N = TransY ? y->dims()[0] : y->dims()[1]; - int64_t M = x_mat_dims[0]; - int64_t K = y->dims()[0]; - int64_t N = y->dims()[1]; + VLOG(10) << "M = " << M << " , K = " << K << " , N = " << N; cudaDataType_t mat_type = CUDA_R_32F; cudaDataType_t scale_type = CUDA_R_32F; @@ -229,7 +320,8 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { } cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); - size_t workspace_size = 4 * 1024 * 1024; + size_t workspace_size = static_cast(4) * 1024 * 1024 * 1024; + const cublasLtMatmulAlgo_t* algo = nullptr; cudaStream_t stream = dev_ctx.stream(); double alpha64 = 1.0, beta64 = 0.0; @@ -243,24 +335,81 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { beta = &beta32; } - cublasOperation_t trans_dout = CUBLAS_OP_N; - cublasLtMatrixLayout_t dout_desc = NULL; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( - &dout_desc, mat_type, N, M, N)); + cublasLtMatrixLayout_t dout_desc = nullptr, dout_trans_desc = nullptr; + cublasLtMatrixLayout_t x_desc = nullptr, x_trans_desc = nullptr; + cublasLtMatrixLayout_t y_desc = nullptr, y_trans_desc = nullptr; + cublasLtMatrixLayout_t dx_desc = nullptr, dy_desc = nullptr; + cublasLtMatmulDesc_t dx_operation_desc = nullptr, + dy_operation_desc = nullptr; + + DEFINE_PADDLE_SCOPE_GUARD([&] { + auto descs = {dout_desc, dout_trans_desc, x_desc, x_trans_desc, + y_desc, y_trans_desc, dx_desc, dy_desc}; + for (auto desc : descs) { + if (desc) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutDestroy(desc)); + } + } + if (dx_operation_desc) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescDestroy(dx_operation_desc)); + } + + if (dy_operation_desc) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescDestroy(dy_operation_desc)); + } + }); + + auto x_row = TransX ? K : M; + auto x_col = TransX ? M : K; + auto y_row = TransY ? N : K; + auto y_col = TransY ? K : N; + auto z_row = TransX ? N : M; + auto z_col = TransX ? M : N; + + // dx = func(dout, y) if (dx) { - cublasLtMatmulDesc_t dx_operation_desc = NULL; + constexpr auto kXGradAIsDZ = (Trait::kXGradA == FusedGEMMGradInType::kDZ); + cublasLtMatrixLayout_t *dx_dout_desc, *dx_y_desc; + + if (TransX) { + dx_dout_desc = &dout_trans_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutCreate( + dx_dout_desc, mat_type, z_row, z_col, z_row)); + } else { + dx_dout_desc = &dout_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutCreate( + dx_dout_desc, mat_type, z_col, z_row, z_col)); + } + + dx_y_desc = &y_trans_desc; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + dx_y_desc, mat_type, y_col, y_row, y_col)); + + auto& a_desc = kXGradAIsDZ ? (*dx_dout_desc) : (*dx_y_desc); + auto& b_desc = kXGradAIsDZ ? (*dx_y_desc) : (*dx_dout_desc); + auto a_trans = BoolToCuBlasEnum(Trait::kXGradATrans); + auto b_trans = BoolToCuBlasEnum(Trait::kXGradBTrans); + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dx_desc, mat_type, x_col, x_row, x_col)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( &dx_operation_desc, compute_type, scale_type)); - cublasOperation_t trans_y = CUBLAS_OP_T; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( - dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_dout, - sizeof(trans_dout))); + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &a_trans, + sizeof(a_trans))); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( - dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_y, - sizeof(trans_y))); + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &b_trans, + sizeof(b_trans))); + cublasLtEpilogue_t epiloque_func_for_dx = get_epilogue_type_(activation_grad); PADDLE_ENFORCE_GPU_SUCCESS( @@ -274,105 +423,116 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { platform::dynload::cublasLtMatmulDescSetAttribute( dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &aux_data, sizeof(aux_data))); + int64_t aux_ld = TransX ? M : K; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( - dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &K, - sizeof(K))); + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, + &aux_ld, sizeof(aux_ld))); } - cublasLtMatrixLayout_t y_desc = NULL, dx_desc = NULL; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( - &y_desc, mat_type, N, K, N)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( - &dx_desc, mat_type, K, M, K)); - - memory::allocation::AllocationPtr dx_workspace = - memory::Alloc(dev_ctx, workspace_size); + auto dx_workspace = memory::Alloc(dev_ctx, workspace_size); - dx->mutable_data(ctx.GetPlace()); - auto* dx_data = dx->data(); + auto* dx_data = dx->mutable_data(ctx.GetPlace()); const auto* y_data = y->data(); const auto* dout_data = dout->data(); + const auto* a_data = kXGradAIsDZ ? dout_data : y_data; + const auto* b_data = kXGradAIsDZ ? y_data : dout_data; - cublasLtMatmulAlgo_t algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo( - lt_handle, dx_operation_desc, y_desc, dout_desc, dx_desc, alpha, beta, - y_data, dout_data, dx_data, stream, dx_workspace->ptr(), - workspace_size); + auto algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo( + lt_handle, dx_operation_desc, b_desc, a_desc, dx_desc, alpha, beta, + b_data, a_data, dx_data, stream, dx_workspace->ptr(), workspace_size); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( - lt_handle, dx_operation_desc, alpha, y->data(), y_desc, - dout->data(), dout_desc, beta, dx_data, dx_desc, dx_data, dx_desc, - &algo, dx_workspace->ptr(), workspace_size, stream)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatmulDescDestroy(dx_operation_desc)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatrixLayoutDestroy(y_desc)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatrixLayoutDestroy(dx_desc)); + lt_handle, dx_operation_desc, alpha, b_data, b_desc, a_data, a_desc, + beta, dx_data, dx_desc, dx_data, dx_desc, algo, dx_workspace->ptr(), + workspace_size, stream)); } + // dy = func(dout, x) if (dy) { - cublasLtMatmulDesc_t dy_operation_desc = NULL; + constexpr auto kYGradAIsDZ = (Trait::kYGradA == FusedGEMMGradInType::kDZ); + + cublasLtMatrixLayout_t *dy_dout_desc = nullptr, *dy_x_desc = nullptr; + if (TransX) { + dy_dout_desc = &dout_trans_desc; + if (dout_trans_desc == nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutCreate( + dy_dout_desc, mat_type, z_row, z_col, z_row)); + } + } else { + dy_dout_desc = &dout_desc; + if (dout_desc == nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutCreate( + dy_dout_desc, mat_type, z_col, z_row, z_col)); + } + } + + dy_x_desc = &x_trans_desc; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + dy_x_desc, mat_type, x_col, x_row, x_col)); + + auto& a_desc = kYGradAIsDZ ? (*dy_dout_desc) : (*dy_x_desc); + auto& b_desc = kYGradAIsDZ ? (*dy_x_desc) : (*dy_dout_desc); + auto a_trans = BoolToCuBlasEnum(Trait::kYGradATrans); + auto b_trans = BoolToCuBlasEnum(Trait::kYGradBTrans); + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dy_desc, mat_type, y_col, y_row, y_col)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( &dy_operation_desc, compute_type, scale_type)); - cublasOperation_t trans_x = CUBLAS_OP_T; + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( - dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_dout, - sizeof(trans_dout))); + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &a_trans, + sizeof(a_trans))); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( - dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_x, - sizeof(trans_x))); - cublasLtEpilogue_t epiloque_func_for_dy = dbias == nullptr - ? CUBLASLT_EPILOGUE_DEFAULT - : CUBLASLT_EPILOGUE_BGRADA; + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &b_trans, + sizeof(b_trans))); + + cublasLtEpilogue_t epiloque_func_for_dy; + if (dbias == nullptr) { + epiloque_func_for_dy = CUBLASLT_EPILOGUE_DEFAULT; + } else { + if (TransY) { + epiloque_func_for_dy = CUBLASLT_EPILOGUE_BGRADB; + } else { + epiloque_func_for_dy = CUBLASLT_EPILOGUE_BGRADA; + } + } + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( dy_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epiloque_func_for_dy, sizeof(epiloque_func_for_dy))); if (dbias) { - dbias->mutable_data(ctx.GetPlace()); - auto* dbias_data = dbias->data(); + auto* dbias_data = dbias->mutable_data(ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( dy_operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &dbias_data, sizeof(dbias_data))); } - cublasLtMatrixLayout_t x_desc = NULL, dy_desc = NULL; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( - &x_desc, mat_type, K, M, K)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( - &dy_desc, mat_type, N, K, N)); - - memory::allocation::AllocationPtr dy_workspace = - memory::Alloc(dev_ctx, workspace_size); - - dy->mutable_data(ctx.GetPlace()); - auto* dy_data = dy->data(); + auto dy_workspace = memory::Alloc(dev_ctx, workspace_size); + auto* dy_data = dy->mutable_data(ctx.GetPlace()); const auto* dout_data = dout->data(); const auto* x_data = x->data(); + const auto* a_data = kYGradAIsDZ ? dout_data : x_data; + const auto* b_data = kYGradAIsDZ ? x_data : dout_data; - cublasLtMatmulAlgo_t algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo( - lt_handle, dy_operation_desc, dout_desc, x_desc, dy_desc, alpha, beta, - dout_data, x_data, dy_data, stream, dy_workspace->ptr(), - workspace_size); + auto algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo( + lt_handle, dy_operation_desc, b_desc, a_desc, dy_desc, alpha, beta, + b_data, a_data, dy_data, stream, dy_workspace->ptr(), workspace_size); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( - lt_handle, dy_operation_desc, alpha, dout_data, dout_desc, x_data, - x_desc, beta, dy_data, dy_desc, dy_data, dy_desc, &algo, - dy_workspace->ptr(), workspace_size, stream)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatmulDescDestroy(dy_operation_desc)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatrixLayoutDestroy(x_desc)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatrixLayoutDestroy(dy_desc)); + lt_handle, dy_operation_desc, alpha, b_data, b_desc, a_data, a_desc, + beta, dy_data, dy_desc, dy_data, dy_desc, algo, dy_workspace->ptr(), + workspace_size, stream)); } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatrixLayoutDestroy(dout_desc)); } private: diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h index c90a6966fe0a8..b00bdfe5660a9 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h @@ -16,12 +16,16 @@ limitations under the License. */ #pragma once #include + #include #include #include + #include "gflags/gflags.h" #include "paddle/fluid/platform/dynload/cublasLt.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/utils/optional.h" DECLARE_int64(cublaslt_exhaustive_search_times); @@ -39,12 +43,14 @@ class GemmEpilogueAlgoCache { GemmEpilogueAlgoCache(GemmEpilogueAlgoCache const &) = delete; void operator=(GemmEpilogueAlgoCache const &) = delete; - cublasLtMatmulAlgo_t GetGemmAlgo( + cublasLtMatmulAlgo_t *GetGemmAlgo( cublasLtHandle_t lt_handle, cublasLtMatmulDesc_t op_desc, cublasLtMatrixLayout_t a_desc, cublasLtMatrixLayout_t b_desc, cublasLtMatrixLayout_t c_desc, const void *alpha, const void *beta, const void *a, const void *b, void *c, cudaStream_t stream, void *workspace, size_t workspace_size) { + if (search_times_ <= 0) return nullptr; + int64_t seed = 0; std::hash hash_fn; @@ -54,132 +60,108 @@ class GemmEpilogueAlgoCache { HashMatrixLayoutDesc_(c_desc, &seed, hash_fn); cublasLtMatmulAlgo_t ret; - auto it = map_.end(); - bool have_found = false; { std::lock_guard lock(cache_mutex_); - it = map_.find(seed); - + auto it = map_.find(seed); if (it != map_.end()) { - ret = it->second; - have_found = true; + return &(it->second); } } - if (!have_found) { - cublasLtMatmulPreference_t preference; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatmulPreferenceCreate(&preference)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatmulPreferenceSetAttribute( - preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - &workspace_size, sizeof(workspace_size))); - - int returned_results = 0; - cublasLtMatmulHeuristicResult_t heuristic_results[requested_algo_count_] = - {0}; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatmulAlgoGetHeuristic( - lt_handle, op_desc, a_desc, b_desc, c_desc, c_desc, preference, - requested_algo_count_, heuristic_results, &returned_results)); - - PADDLE_ENFORCE_GT( - returned_results, 0, - platform::errors::Unavailable("No GEMM epilogue algorithm support!")); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cublasLtMatmulPreferenceDestroy(preference)); - - if (search_times_ > 0) { - int best_algo_idx = -1; - float best_algo_time = 0; - - // Run 100 times for warmup - int warmup_algo_idx = 0; - for (int t = 0; t < 100; t++) { - cublasStatus_t status = platform::dynload::cublasLtMatmul( - lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c, c_desc, - c, c_desc, &heuristic_results[warmup_algo_idx].algo, workspace, - workspace_size, stream); - if (status != CUBLAS_STATUS_SUCCESS) { - t = -1; - warmup_algo_idx += 1; - if (warmup_algo_idx == requested_algo_count_) { - PADDLE_THROW(platform::errors::Unavailable( - "No GEMM epilogue algorithm support!")); - } - } - } + cublasLtMatmulPreference_t preference; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulPreferenceCreate(&preference)); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulPreferenceSetAttribute( + preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &workspace_size, sizeof(workspace_size))); - cudaEvent_t start_event, stop_event; - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&start_event)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&stop_event)); - - for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { - float curr_time = 0; - for (int check_idx = 0; check_idx < search_times_; check_idx++) { - float time = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event, stream)); - - cublasStatus_t status = platform::dynload::cublasLtMatmul( - lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c, - c_desc, c, c_desc, &heuristic_results[algo_idx].algo, workspace, - workspace_size, stream); - - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(stop_event, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(stop_event)); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaEventElapsedTime(&time, start_event, stop_event)); - curr_time += time; - if (status != CUBLAS_STATUS_SUCCESS) { - curr_time = 3.40282e+038; // Max Value of float - break; - } - } - - curr_time = curr_time / search_times_; - if (curr_time < best_algo_time || algo_idx == 0) { - best_algo_idx = algo_idx; - best_algo_time = curr_time; - } - } + int returned_results = 0; + std::vector heuristic_results( + requested_algo_count_); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulAlgoGetHeuristic( + lt_handle, op_desc, a_desc, b_desc, c_desc, c_desc, preference, + requested_algo_count_, heuristic_results.data(), + &returned_results)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(start_event)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(stop_event)); + PADDLE_ENFORCE_GT( + returned_results, 0, + platform::errors::Unavailable("No GEMM epilogue algorithm support!")); - if (best_algo_idx == -1) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulPreferenceDestroy(preference)); + + int best_algo_idx = -1; + float best_algo_time = 0; + + // Run 100 times for warmup + int warmup_algo_idx = 0; + for (int t = 0; t < 100; t++) { + cublasStatus_t status = platform::dynload::cublasLtMatmul( + lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c, c_desc, c, + c_desc, &heuristic_results[warmup_algo_idx].algo, workspace, + workspace_size, stream); + if (status != CUBLAS_STATUS_SUCCESS) { + t = -1; + warmup_algo_idx += 1; + if (warmup_algo_idx == requested_algo_count_) { PADDLE_THROW(platform::errors::Unavailable( "No GEMM epilogue algorithm support!")); } + } + } - ret = heuristic_results[best_algo_idx].algo; - } else { - int decided_algo_idx = -1; - for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { - cublasStatus_t status = platform::dynload::cublasLtMatmul( - lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c, c_desc, - c, c_desc, &heuristic_results[algo_idx].algo, workspace, - workspace_size, stream); - if (status == CUBLAS_STATUS_SUCCESS) { - decided_algo_idx = algo_idx; - break; - } - } - if (decided_algo_idx == -1) { - PADDLE_THROW(platform::errors::Unavailable( - "No GEMM epilogue algorithm support!")); + cudaEvent_t start_event, stop_event; + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&start_event)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&stop_event)); + + for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { + float curr_time = 0; + for (int check_idx = 0; check_idx < search_times_; check_idx++) { + float time = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event, stream)); + + cublasStatus_t status = platform::dynload::cublasLtMatmul( + lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c, c_desc, c, + c_desc, &heuristic_results[algo_idx].algo, workspace, + workspace_size, stream); + + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(stop_event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(stop_event)); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaEventElapsedTime(&time, start_event, stop_event)); + curr_time += time; + if (status != CUBLAS_STATUS_SUCCESS) { + curr_time = 3.40282e+038; // Max Value of float + break; } - ret = heuristic_results[decided_algo_idx].algo; } - std::lock_guard lock(cache_mutex_); - map_[seed] = ret; + curr_time = curr_time / search_times_; + if (curr_time < best_algo_time || algo_idx == 0) { + best_algo_idx = algo_idx; + best_algo_time = curr_time; + } + } + + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(start_event)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(stop_event)); + + if (best_algo_idx == -1) { + PADDLE_THROW( + platform::errors::Unavailable("No GEMM epilogue algorithm support!")); } - VLOG(4) << "Search time:" << search_times_ << ", Is hash-key (" << seed - << ") found in GemmEpilogueAlgoCache? " << have_found; + ret = heuristic_results[best_algo_idx].algo; + + VLOG(4) << "Search time:" << search_times_ << ", hash-key (" << seed + << ") not found in GemmEpilogueAlgoCache"; - return ret; + std::lock_guard lock(cache_mutex_); + auto &algo_in_map = map_[seed]; + algo_in_map = ret; + return &algo_in_map; } private: diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index 866de8e04a9bc..f72f73438c0a2 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -441,11 +441,10 @@ void LaunchLayernormResidualDropoutBias( // call layernorm forward switch (GetDesiredBlockDim(cols)) { FIXED_BLOCK_DIM_CASE( - LayerNormForward< - T, U, kBlockDim, - ScaleBiasWithSameTypeX><<>>( - dst, scale, layernorm_bias, layernorm_dst, mean, var, epsilon, - cols)); + LayerNormForward + <<>>(dst, scale, layernorm_bias, + layernorm_dst, mean, var, + epsilon, cols)); default: PADDLE_THROW(platform::errors::InvalidArgument( "Product from begin_norm_axis to end must be larger than 1")); @@ -468,21 +467,25 @@ void LaunchLayernormResidualDropoutBias( static_cast(std::ceil(rows / static_cast(ROWS_PER_CTA))); \ fused_fast_ln_fwd_kernel< \ T, U, LayerNormScaleBiasT, uint8_t, \ - VecSize, WARPS_M, WARPS_N, BYTES_PER_LDG, \ - cols><<>>( \ - rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, \ - increment, epsilon, src, residual, bias, scale, layernorm_bias, \ - mask_data, mean, var, dst, layernorm_dst); \ + VecSize, WARPS_M, WARPS_N, BYTES_PER_LDG, cols> \ + <<>>( \ + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, \ + increment, epsilon, src, residual, bias, scale, layernorm_bias, \ + mask_data, mean, var, dst, layernorm_dst); \ } break #define LAUNCH_FUSED_FAST_LN_KERNEL \ LAUNCH_FUSED_FAST_LN_KERNEL_BASE(768); \ LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1024); \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1280); \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1536); \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1792); \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(2048); \ LAUNCH_FUSED_FAST_LN_KERNEL_BASE(4096) bool can_call_fast_ln_kernel = false; - if ((cols == 768 || cols == 1024 || cols == 4096) && scale != nullptr && - layernorm_bias != nullptr) { + if (((cols >= 768 && cols <= 2048 && cols % 256 == 0) || cols == 4096) && + scale != nullptr && layernorm_bias != nullptr) { can_call_fast_ln_kernel = true; } VLOG(6) << "can_call_fast_ln_kernel = " << can_call_fast_ln_kernel; @@ -490,12 +493,11 @@ void LaunchLayernormResidualDropoutBias( const int VecSize = MAX_CACHE_BYTES / sizeof(T); if (cols % VecSize != 0) { int blockDim = GetDesiredBlockDim(cols); - FusedLayernormResidualDropoutBias< - T, uint8_t, 1, U, - ScaleBiasWithSameTypeX><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, increment, - epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, - layernorm_dst, mean, var); + FusedLayernormResidualDropoutBias + <<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, + increment, epsilon, src, residual, bias, scale, layernorm_bias, + mask_data, dst, layernorm_dst, mean, var); } else { if (can_call_fast_ln_kernel) { switch (cols) { @@ -508,12 +510,12 @@ void LaunchLayernormResidualDropoutBias( } } else { int blockDim = GetDesiredBlockDim(cols / VecSize); - FusedLayernormResidualDropoutBias< - T, uint8_t, VecSize, U, - ScaleBiasWithSameTypeX><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, - increment, epsilon, src, residual, bias, scale, layernorm_bias, - mask_data, dst, layernorm_dst, mean, var); + FusedLayernormResidualDropoutBias + <<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, + increment, epsilon, src, residual, bias, scale, layernorm_bias, + mask_data, dst, layernorm_dst, mean, var); } } } diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc index 98602e4edd0a2..63627db49d6fa 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -62,12 +63,13 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel { // y: qkv's weight: [3, num_head, dim_head, dim_embed] auto x_dim = ctx->GetInputDim("X"); auto y_dim = ctx->GetInputsDim("QKVW")[0]; - PADDLE_ENFORCE_EQ(x_dim.size(), 3, platform::errors::InvalidArgument( - "The dimensions of x must be 3" - "(batch_size, seq_len, dim_embed)," - "but received dimensions of" - "Input is [%d]", - x_dim.size())); + PADDLE_ENFORCE_EQ( + x_dim.size(), 3, + platform::errors::InvalidArgument("The dimensions of x must be 3" + "(batch_size, seq_len, dim_embed)," + "but received dimensions of" + "Input is [%d]", + x_dim.size())); PADDLE_ENFORCE_EQ(y_dim.size(), 4, platform::errors::InvalidArgument( "The dimensions of qkv_weight must be 4" diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index fe93d323c59bc..814827d95b6bd 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -16,6 +16,9 @@ limitations under the License. */ // https://github.com/NVIDIA/FasterTransformer/blob/v4.0/fastertransformer/cuda/masked_multihead_attention.cu // We add License in the head. +// headers sort by clang-format may cause compiling error or test faiure, +// see https://github.com/PaddlePaddle/Paddle/pull/42840/ +// clang-format off #include #include #include @@ -35,6 +38,7 @@ limitations under the License. */ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif +// clang-format on namespace paddle { namespace operators { @@ -529,10 +533,10 @@ inline __device__ void zero(T &dst) { // NOLINT dst = tmp.raw; } -template +template __global__ void masked_multihead_attention_kernel( - Masked_multihead_attention_params params) { + Masked_multihead_attention_params params, int pad_active_groups) { #if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) static_assert(Dh % THREADS_PER_KEY == 0, ""); @@ -560,11 +564,12 @@ __global__ void masked_multihead_attention_kernel( const int tid = threadIdx.x; float qk_max = -FLT_MAX; + float qk = 0; // qkv [B, S=1, 3, num_head, head_dim] int qkv_base_offset = bi * 3 * params.num_head * Dh + hi * Dh; - using Qk_vec = typename Qk_vec_::Type; + using Qk_vec = typename Qk_vec_::Type; constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); static_assert(Dh % QK_VEC_SIZE == 0 && Dh / QK_VEC_SIZE <= WARP_SIZE, ""); constexpr int QK_VECS_PER_WARP = Dh / QK_VEC_SIZE; @@ -605,18 +610,18 @@ __global__ void masked_multihead_attention_kernel( params.timestep * QK_ELTS_IN_16B + ci; *reinterpret_cast(¶ms.cache_kv[offset]) = k; - float qk = dot(q, k); -#pragma unroll - for (int mask = QK_VECS_PER_WARP / 2; mask >= 1; mask /= 2) { - qk += __shfl_xor_sync(shfl_mask(QK_VECS_PER_WARP), qk, mask); + qk = dot(q, k); + } + if (tid < WARP_SIZE) { + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + qk += __shfl_xor_sync(uint32_t(-1), qk, mask); } - - qk *= params.inv_sqrt_dh; if (tid == 0) { // NOTE(wangxi): mask must be 0.0 // T mask = params.attn_mask[ // bi * (params.timestep + 1) + params.timestep]; // qk += static_cast(mask); + qk *= params.inv_sqrt_dh; qk_max = qk; qk_smem[params.timestep] = qk; } @@ -746,16 +751,18 @@ __global__ void masked_multihead_attention_kernel( zero(out); constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - for (int ti = vo; ti < params.timestep; ti += V_PER_ITER) { - V_vec v = *reinterpret_cast(&v_cache[ti * Dh]); + if (vo < V_PER_ITER) { + for (int ti = vo; ti < params.timestep; ti += V_PER_ITER) { + V_vec v = *reinterpret_cast(&v_cache[ti * Dh]); #if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS) - float logit = logits_smem[ti]; - out = fma(logit, cast_to_float(v), out); + float logit = logits_smem[ti]; + out = fma(logit, cast_to_float(v), out); #else - T logit = logits_smem[ti]; - // Update the partial sums. - out = fma(logit, v, out); + T logit = logits_smem[ti]; + // Update the partial sums. + out = fma(logit, v, out); #endif + } } #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER @@ -784,8 +791,12 @@ __global__ void masked_multihead_attention_kernel( __syncthreads(); + if (vo < pad_active_groups / 2) { + zero(*reinterpret_cast(&out_smem[vo * Dh + vi])); + } #pragma unroll - for (int active_groups = V_PER_ITER; active_groups >= 2; active_groups /= 2) { + for (int active_groups = pad_active_groups; active_groups >= 2; + active_groups /= 2) { int midpoint = active_groups / 2; if (vo >= midpoint && vo < active_groups) { @@ -830,7 +841,7 @@ __global__ void masked_multihead_attention_kernel( template inline size_t smem_size_in_bytes( const Masked_multihead_attention_params ¶ms, int dim_head, - int threads_per_value, int threads_per_block) { + int threads_per_value, int threads_per_block, int pad_active_groups) { size_t qk_sz = div_up(params.timestep + 1, 4) * 16; size_t logits_sz = 0; @@ -841,31 +852,33 @@ inline size_t smem_size_in_bytes( #endif size_t softmax_sz = qk_sz + logits_sz; - int rows_per_red = threads_per_block / threads_per_value; + int rows_per_red = pad_active_groups; size_t red_sz = rows_per_red * dim_head * sizeof(T) / 2; return max(softmax_sz, red_sz); } -#define MMHA_LAUNCH_KERNEL(T, Dh, THDS_PER_KEY, THDS_PER_VALUE, \ - THDS_PER_BLOCK, stream) \ - size_t smem_sz = \ - smem_size_in_bytes(params, Dh, THDS_PER_VALUE, THDS_PER_BLOCK); \ - dim3 grid(params.num_head, params.batch_size); \ - masked_multihead_attention_kernel< \ - T, Dh, THDS_PER_KEY, THDS_PER_VALUE, \ - THDS_PER_BLOCK><<>>(params) - -template +#define MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, \ + THDS_PER_BLOCK, stream) \ + int pad_active_groups = \ + 1 << static_cast(ceil(std::log2(THDS_PER_BLOCK / THDS_PER_VALUE))); \ + size_t smem_sz = smem_size_in_bytes(params, Dh, THDS_PER_VALUE, \ + THDS_PER_BLOCK, pad_active_groups); \ + dim3 grid(params.num_head, params.batch_size); \ + masked_multihead_attention_kernel \ + <<>>(params, pad_active_groups) + +template void fmha_launch_kernel(const Masked_multihead_attention_params ¶ms, const cudaStream_t &stream) { constexpr int THREADS_PER_VALUE = Dh * sizeof(T) / 16; if (params.timestep < 32) { - MMHA_LAUNCH_KERNEL(T, Dh, 4, THREADS_PER_VALUE, 64, stream); + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 64, stream); } else if (params.timestep < 2048) { - MMHA_LAUNCH_KERNEL(T, Dh, 2, THREADS_PER_VALUE, 128, stream); + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 2, THREADS_PER_VALUE, 128, stream); } else { - MMHA_LAUNCH_KERNEL(T, Dh, 1, THREADS_PER_VALUE, 256, stream); + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 1, THREADS_PER_VALUE, 256, stream); } } @@ -890,18 +903,21 @@ void fmha(const platform::CUDADeviceContext &dev_ctx, const Tensor &qkv_tensor, switch (dim_head) { case 32: - fmha_launch_kernel(params, dev_ctx.stream()); + fmha_launch_kernel(params, dev_ctx.stream()); break; case 64: - fmha_launch_kernel(params, dev_ctx.stream()); + fmha_launch_kernel(params, dev_ctx.stream()); + break; + case 96: + fmha_launch_kernel(params, dev_ctx.stream()); break; case 128: - fmha_launch_kernel(params, dev_ctx.stream()); + fmha_launch_kernel(params, dev_ctx.stream()); break; default: PADDLE_THROW(platform::errors::Unimplemented( "dim_head = %d is unsupport, only support " - "dim_head = 32, 64 or 128 for now.", + "dim_head = 32, 64, 96 or 128 for now.", dim_head)); } } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 1d3085a013f81..0cc31e6fc3255 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -153,16 +153,15 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, const int real_vec_size = cols % VecSize == 0 ? VecSize : 1; auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size); if (cols % VecSize == 0) { - FusedResidualDropoutBias<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, - bias, mask_data, dst, increment, is_test); + FusedResidualDropoutBias + <<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment, is_test); } else { - FusedResidualDropoutBias< - T, uint8_t, - 1><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, - bias, mask_data, dst, increment, is_test); + FusedResidualDropoutBias + <<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment, is_test); } } @@ -263,27 +262,26 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, dim3 block_dim(threads, 128, 1); dim3 grid_dim(blocks, 1, 1); if (cols % VecSize == 0) { - FusedResidualDropoutBiasGrad< - T, MaskType, 8, 128, - VecSize><<>>( - dout, mask, factor, rows, cols, dx, dbias); + FusedResidualDropoutBiasGrad + <<>>(dout, mask, factor, rows, + cols, dx, dbias); } else { - FusedResidualDropoutBiasGrad<<>>( - dout, mask, factor, rows, cols, dx, dbias); + FusedResidualDropoutBiasGrad + <<>>(dout, mask, factor, rows, + cols, dx, dbias); } } else { const uint64_t n = rows * cols; platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx, n / real_vec_size); if (n % VecSize == 0) { - FusedResidualDropoutGrad<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - dout, mask, factor, n, dx); + FusedResidualDropoutGrad + <<>>( + dout, mask, factor, n, dx); } else { - FusedResidualDropoutGrad<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - dout, mask, factor, n, dx); + FusedResidualDropoutGrad + <<>>( + dout, mask, factor, n, dx); } } } diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc index 23b82ac5d966f..e316f58b3f759 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fused_seqpool_cvm_op.h" + #include namespace paddle { namespace operators { @@ -34,9 +35,10 @@ class FusedSeqpoolCVMOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( cvm_dims.size(), 2UL, platform::errors::InvalidArgument("Input(CVM)'s rank should be 2.")); - PADDLE_ENFORCE_EQ(cvm_dims[1], 2UL, platform::errors::InvalidArgument( - "The 2nd dimension of " - "Input(CVM) should be 2.")); + PADDLE_ENFORCE_EQ( + cvm_dims[1], 2UL, + platform::errors::InvalidArgument("The 2nd dimension of " + "Input(CVM) should be 2.")); auto ins_dims = ctx->GetInputsDim("X"); const int cvm_offset = ctx->Attrs().Get("cvm_offset"); diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index 3770a536a8fcf..2b6b7d4934539 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/operators/fused/fused_seqpool_cvm_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h index 6042772adb054..e3bc424f25910 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h index 11f1011dec3a2..4c00f778ced3f 100644 --- a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h +++ b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h @@ -114,10 +114,9 @@ __global__ void FusedSoftmaxMaskVecKernel(T* dst, const T* src, const T* mask, } } -#define SOFTMAX_MASK_KERNEL(VEC_SIZE, ELEMENTS) \ - FusedSoftmaxMaskVecKernel<<>>( \ - dst, src, mask, seq_len) +#define SOFTMAX_MASK_KERNEL(VEC_SIZE, ELEMENTS) \ + FusedSoftmaxMaskVecKernel \ + <<>>(dst, src, mask, seq_len) // FIXME(wangxi): It is found that the performance of VEC_SIZE=2 is better // than that of =4 and =8. Further analysis of the kernel is needed later. diff --git a/paddle/fluid/operators/fused/fused_transformer_op.cc b/paddle/fluid/operators/fused/fused_transformer_op.cc index 9e5fc42fc76dd..d11171eb2d086 100644 --- a/paddle/fluid/operators/fused/fused_transformer_op.cc +++ b/paddle/fluid/operators/fused/fused_transformer_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fused_transformer_op.h" + #include namespace paddle { @@ -157,5 +158,5 @@ void FusedMHA::ComputeForward(T* output, T* softmax_mask) {} template void FusedMHA::ComputeBackward(const T* grad_output, T* softmax_mask, T* grad_x) {} -} -} \ No newline at end of file +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_transformer_op.h b/paddle/fluid/operators/fused/fused_transformer_op.h index 2d2d390d243e5..a2d5862abf06a 100644 --- a/paddle/fluid/operators/fused/fused_transformer_op.h +++ b/paddle/fluid/operators/fused/fused_transformer_op.h @@ -151,5 +151,5 @@ class FusedTransformerEncoderLayer { std::string act_method; }; -} -} +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc index eeeb004003c9c..802cd18e1db24 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" @@ -35,8 +36,9 @@ class ConvInceptionFusionOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( in_dims.size(), 4, platform::errors::InvalidArgument("Conv intput should be 4-D tensor.")); - PADDLE_ENFORCE_EQ(w_dims.size(), 4, platform::errors::InvalidArgument( - "There should be 4 filters.")); + PADDLE_ENFORCE_EQ( + w_dims.size(), 4, + platform::errors::InvalidArgument("There should be 4 filters.")); PADDLE_ENFORCE_EQ(w_dims[0][1], in_dims[1], platform::errors::InvalidArgument( "Invalid fileter channel number %d, which should be " diff --git a/paddle/fluid/operators/fused/fusion_group_op.cu.cc b/paddle/fluid/operators/fused/fusion_group_op.cu.cc index 94949f5633116..c592bbe7d3e9a 100644 --- a/paddle/fluid/operators/fused/fusion_group_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_group_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_group_op.h" + #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/fused/fusion_group_op.h b/paddle/fluid/operators/fused/fusion_group_op.h index 5e5f2c60ffbd4..f71355b85d96a 100644 --- a/paddle/fluid/operators/fused/fusion_group_op.h +++ b/paddle/fluid/operators/fused/fusion_group_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_code.h" diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index afbd5380a8301..fd05155bc2cef 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_gru_op.h" + #include // for memcpy #include #include + #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 3dada660aeffe..f2e6f099b4b58 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_lstm_op.h" + #include + #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index bed5125b99583..c9d6d42efac24 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -13,8 +13,10 @@ * limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h" + #include #include + #include "paddle/fluid/operators/jit/kernels.h" namespace paddle { @@ -24,10 +26,11 @@ void FusionRepeatedFCReluOp::InferShape( framework::InferShapeContext* ctx) const { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusionRepeatedFCRelu"); auto sz = ctx->Inputs("W").size(); - PADDLE_ENFORCE_GT(sz, 1UL, platform::errors::InvalidArgument( - "Inputs(W) of FusionRepeatedFCReluOp should " - "be greater than 1, but received value is %d.", - sz)); + PADDLE_ENFORCE_GT(sz, 1UL, + platform::errors::InvalidArgument( + "Inputs(W) of FusionRepeatedFCReluOp should " + "be greater than 1, but received value is %d.", + sz)); PADDLE_ENFORCE_EQ( ctx->Inputs("Bias").size(), sz, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc index ee28a54805653..b99b53de9c4d6 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h" + #include // for min, max #include + #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 58613173ad212..7341d1f864d93 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h" + #include + #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/cpu_vec.h" @@ -48,8 +50,9 @@ void FusionSeqExpandConcatFCOp::InferShape( for (size_t i = 1; i < ins_dims.size(); ++i) { sum += ins_dims[i][1]; } - PADDLE_ENFORCE_EQ(sum, w_dims[0], platform::errors::InvalidArgument( - "FC height should be sum of all inputs " + PADDLE_ENFORCE_EQ( + sum, w_dims[0], + platform::errors::InvalidArgument("FC height should be sum of all inputs " "width, but received FC height is: %d, " "sum of all inputs width is: %d.", w_dims[0], sum)); diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index e574d67e3982c..1d487ef3dabc1 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -13,8 +13,10 @@ * limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_seqpool_concat_op.h" + #include #include + #include "paddle/fluid/operators/jit/kernels.h" namespace paddle { @@ -29,17 +31,19 @@ void FusionSeqPoolConcatOp::InferShape( ctx->Inputs("X").size())); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FusionSeqPoolConcat"); int axis = ctx->Attrs().Get("axis"); - PADDLE_ENFORCE_EQ(axis, 1, platform::errors::InvalidArgument( - "FusionSeqPoolConcatOp only supports concat " - "axis=1 yet, but received axis value is %d", - axis)); + PADDLE_ENFORCE_EQ(axis, 1, + platform::errors::InvalidArgument( + "FusionSeqPoolConcatOp only supports concat " + "axis=1 yet, but received axis value is %d", + axis)); auto ins_dims = ctx->GetInputsDim("X"); const size_t n = ins_dims.size(); - PADDLE_ENFORCE_GT(n, 0UL, platform::errors::InvalidArgument( - "Input tensors count should be greater than 0, " - "but received value is %d.", - n)); + PADDLE_ENFORCE_GT(n, 0UL, + platform::errors::InvalidArgument( + "Input tensors count should be greater than 0, " + "but received value is %d.", + n)); if (n == 1) { LOG(WARNING) << "Only have one input, may waste memory"; } diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc index c74cc504840d3..d29bc00b5459e 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc @@ -13,8 +13,10 @@ * limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h" + #include #include + #include "paddle/fluid/operators/jit/kernels.h" namespace paddle { @@ -31,20 +33,23 @@ void FusionSeqPoolCVMConcatOp::InferShape( paddle::platform::errors::InvalidArgument( "Output(Out) of FusionSeqPoolCVMConcatOp should not be null.")); int axis = ctx->Attrs().Get("axis"); - PADDLE_ENFORCE_EQ(axis, 1, paddle::platform::errors::InvalidArgument( - "FusionSeqPoolCVMConcatOp only supports " - "concat axis=1 yet, but received %d.", - axis)); + PADDLE_ENFORCE_EQ(axis, 1, + paddle::platform::errors::InvalidArgument( + "FusionSeqPoolCVMConcatOp only supports " + "concat axis=1 yet, but received %d.", + axis)); bool use_cvm = ctx->Attrs().Get("use_cvm"); - PADDLE_ENFORCE_EQ(use_cvm, true, paddle::platform::errors::InvalidArgument( - "FusionSeqPoolCVMConcatOp only supports " - "use_cvm is true yet, but received %d.", - use_cvm)); + PADDLE_ENFORCE_EQ(use_cvm, true, + paddle::platform::errors::InvalidArgument( + "FusionSeqPoolCVMConcatOp only supports " + "use_cvm is true yet, but received %d.", + use_cvm)); auto ins_dims = ctx->GetInputsDim("X"); const size_t n = ins_dims.size(); - PADDLE_ENFORCE_GT(n, 0UL, paddle::platform::errors::InvalidArgument( - "Input tensors count should > 0.")); + PADDLE_ENFORCE_GT(n, 0UL, + paddle::platform::errors::InvalidArgument( + "Input tensors count should > 0.")); if (n == 1) { LOG(WARNING) << "Only have one input, may waste memory"; } diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 870f72b8c7f0d..047fefc1eeb07 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -13,8 +13,10 @@ * limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h" + #include #include + #include "paddle/fluid/operators/jit/kernels.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc index 954cd7cc7a40b..bf8e9818e545f 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h" + #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 786f5b4e07798..eb29859d8d15b 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h" + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h index 66e6c00da2db8..52140c0ca46ee 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/phi/core/ddim.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc index 0ffc4c91b851c..c9956dcdd2010 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "dnnl.hpp" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/operator.h" @@ -31,8 +32,8 @@ using paddle::platform::CPUDeviceContext; using paddle::platform::CreateKey; using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; -using platform::to_void_cast; using phi::vectorize; +using platform::to_void_cast; using Direction = dnnl::rnn_direction; namespace { diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index e7d697767fcac..ad0cc0bd1cf86 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include // for memcpy #include #include + #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" diff --git a/paddle/fluid/operators/fused/multi_gru_op.h b/paddle/fluid/operators/fused/multi_gru_op.h index ebd3faf44a84b..8b064c8754f5e 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.h +++ b/paddle/fluid/operators/fused/multi_gru_op.h @@ -19,9 +19,9 @@ limitations under the License. */ namespace paddle { namespace operators { +using framework::ExecutionContext; using framework::LoDTensor; using framework::Tensor; -using framework::ExecutionContext; class MultiGRUOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cc b/paddle/fluid/operators/fused/multihead_matmul_op.cc index 8f2c04d5afe12..79b886c37297c 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cc +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index f0e05659c9294..301553467165a 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -13,7 +13,9 @@ // limitations under the License. #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" @@ -105,8 +107,8 @@ void TransQKVWithBias(const int batch, const int seq_len, const int head_size, platform::errors::InvalidArgument( "head_num (%d) * head_size (%d) should <= %d", head_num, head_size, 1024 * 4)); - TransposeQkvKernel<<>>(h, input4, bias4, - output4); + TransposeQkvKernel + <<>>(h, input4, bias4, output4); } else if (head_size % 2 == 0 && scratch_size % 2 == 0) { const int h = head_size / 2; const float2 *input2 = reinterpret_cast(input); @@ -118,8 +120,8 @@ void TransQKVWithBias(const int batch, const int seq_len, const int head_size, platform::errors::InvalidArgument( "head_num (%d) * head_size (%d) should <= %d", head_num, head_size, 1024 * 2)); - TransposeQkvKernel<<>>(h, input2, bias2, - output2); + TransposeQkvKernel + <<>>(h, input2, bias2, output2); } else { const dim3 block(head_size, head_num, 1); // limit head_size * head_num to max block size(1024). @@ -127,8 +129,8 @@ void TransQKVWithBias(const int batch, const int seq_len, const int head_size, platform::errors::InvalidArgument( "head_num (%d) * head_size (%d) should <= %d", head_num, head_size, 1024)); - TransposeQkvKernel<<>>(head_size, input, - bias, output); + TransposeQkvKernel + <<>>(head_size, input, bias, output); } } diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc index 6f4246aadd903..d5860fe9cf12b 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -115,13 +115,14 @@ class ResNetUnitOp : public framework::OperatorWithKernel { bn_param_shape = {1, 1, 1, bn_param_shape[0]}; } framework::DDim bn_param_dims = phi::make_ddim(bn_param_shape); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, platform::errors::InvalidArgument( - "The dimensions of input " - "must equal to 4." - "But received: the shape of input " - "= [%s], the dimension of input = " - "[%d]", - x_dims, x_dims.size())); + PADDLE_ENFORCE_EQ( + x_dims.size(), 4, + platform::errors::InvalidArgument("The dimensions of input " + "must equal to 4." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, x_dims.size())); PADDLE_ENFORCE_EQ(w_dims.size(), 4, platform::errors::InvalidArgument( "The dimensions of filter " @@ -180,14 +181,16 @@ class ResNetUnitOp : public framework::OperatorWithKernel { // and var tensors should be float when input tensor's dtype is float16. auto bn_param_type = framework::proto::VarType::FP32; - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("ScaleX")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("BiasX")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("ScaleX")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("BiasX")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); framework::LibraryType library = framework::LibraryType::kPlain; framework::DataLayout layout = framework::DataLayout::kAnyLayout; return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cc b/paddle/fluid/operators/fused/skip_layernorm_op.cc index 442f359c0dac5..6ac6f51e4ce47 100644 --- a/paddle/fluid/operators/fused/skip_layernorm_op.cc +++ b/paddle/fluid/operators/fused/skip_layernorm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cu b/paddle/fluid/operators/fused/skip_layernorm_op.cu index e755ea33755ca..66a164ff31bea 100644 --- a/paddle/fluid/operators/fused/skip_layernorm_op.cu +++ b/paddle/fluid/operators/fused/skip_layernorm_op.cu @@ -13,7 +13,9 @@ // limitations under the License. #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" diff --git a/paddle/fluid/operators/fused/unity_build_rule.cmake b/paddle/fluid/operators/fused/unity_build_rule.cmake index c428b7456bb20..8605cd3cdae85 100644 --- a/paddle/fluid/operators/fused/unity_build_rule.cmake +++ b/paddle/fluid/operators/fused/unity_build_rule.cmake @@ -4,16 +4,17 @@ # Generally, the combination rules in this file do not need to be modified. # If there are some redefined error in compiling with the source file which # in combination rule, you can remove the source file from the following rules. -register_unity_group(cc - fused_elemwise_activation_op.cc - fused_embedding_fc_lstm_op.cc - fused_embedding_seq_pool_op.cc - fusion_lstm_op.cc - fusion_repeated_fc_relu_op.cc - fusion_seqconv_eltadd_relu_op.cc - fusion_seqexpand_concat_fc_op.cc - fusion_seqpool_concat_op.cc - fusion_squared_mat_sub_op.cc - multi_gru_op.cc - mkldnn/multi_gru_mkldnn_op.cc - fusion_seqpool_cvm_concat_op.cc) +register_unity_group( + cc + fused_elemwise_activation_op.cc + fused_embedding_fc_lstm_op.cc + fused_embedding_seq_pool_op.cc + fusion_lstm_op.cc + fusion_repeated_fc_relu_op.cc + fusion_seqconv_eltadd_relu_op.cc + fusion_seqexpand_concat_fc_op.cc + fusion_seqpool_concat_op.cc + fusion_squared_mat_sub_op.cc + multi_gru_op.cc + mkldnn/multi_gru_mkldnn_op.cc + fusion_seqpool_cvm_concat_op.cc) diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cc b/paddle/fluid/operators/fused_softmax_mask_op.cc index a41380028338a..a33070d94b919 100644 --- a/paddle/fluid/operators/fused_softmax_mask_op.cc +++ b/paddle/fluid/operators/fused_softmax_mask_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused_softmax_mask_op.h" + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cu b/paddle/fluid/operators/fused_softmax_mask_op.cu index c4ab4de8a64cb..b68a6907d7a65 100644 --- a/paddle/fluid/operators/fused_softmax_mask_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_op.cu @@ -40,6 +40,7 @@ limitations under the License. */ #include #include #include + #include #include diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc index c737ba361e0f2..eefca7b6ab564 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc @@ -11,6 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.h" + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu index d4c5b8877056f..4ee90eb318496 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -39,6 +39,7 @@ limitations under the License. */ #include #include #include + #include #include @@ -395,49 +396,49 @@ class SoftmaxMaskFuseUpperTriangleKernel : public framework::OpKernel { switch (pow2_index) { case 5: // 32 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 5><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 6: // 64 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 6><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 7: // 128 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 7><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 8: // 256 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 8><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 9: // 512 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 9><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 10: // 1024 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 10><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 11: // 2048 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 11><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 12: // 4096 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 12><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 13: // 8192 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 13><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; default: break; @@ -483,58 +484,58 @@ class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel { switch (pow2_index) { case 5: // 32 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 5><<>>(grad_y_data, grad_x_data, - softmax_rst_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 6: // 64 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 6><<>>(grad_y_data, grad_x_data, - softmax_rst_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 7: // 128 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 7><<>>(grad_y_data, grad_x_data, - softmax_rst_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 8: // 256 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 8><<>>(grad_y_data, grad_x_data, - softmax_rst_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 9: // 512 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 9><<>>(grad_y_data, grad_x_data, - softmax_rst_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 10: // 1024 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 10><<>>(grad_y_data, grad_x_data, - softmax_rst_data, - batch_count, key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 11: // 2048 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 11><<>>(grad_y_data, grad_x_data, - softmax_rst_data, - batch_count, key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 12: // 4096 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 12><<>>(grad_y_data, grad_x_data, - softmax_rst_data, - batch_count, key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 13: // 8192 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 13><<>>(grad_y_data, grad_x_data, - softmax_rst_data, - batch_count, key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; default: break; diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 9f2b48a24b447..d44dd324d6ccb 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -153,7 +153,7 @@ REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, ops::GatherGradNoNeedBufferVarInferer, GatherGradInferShapeFunctor); -REGISTER_OP_VERSION(gather) - .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", - paddle::framework::compatible::OpVersionDesc().NewInput( - "Axis", "Specify the axis of gather operation.")); +REGISTER_OP_VERSION(gather).AddCheckpoint( + R"ROC(upgrad gather, add a new input [Axis])ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "Axis", "Specify the axis of gather operation.")); diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc index 6c691aa14ae77..327eec2a6ca74 100644 --- a/paddle/fluid/operators/gather_op_xpu.cc +++ b/paddle/fluid/operators/gather_op_xpu.cc @@ -38,9 +38,20 @@ class GatherOpXPUKernel : public framework::OpKernel { auto *x = ctx.Input("X"); auto *index = ctx.Input("Index"); auto *output = ctx.Output("Out"); + + int axis = ctx.Attr("axis"); if (ctx.HasInput("Axis")) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Now, it doesn't support XPU with Axis.")); + Tensor cpu_axis; + const Tensor *axis_tensor = ctx.Input("Axis"); + framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); + const auto &axis_type = axis_tensor->dtype(); + if (framework::TransToProtoVarType(axis_type) == + framework::proto::VarType::INT32) { + axis = static_cast(cpu_axis.data()[0]); + } else if (framework::TransToProtoVarType(axis_type) == + framework::proto::VarType::INT64) { + axis = static_cast(cpu_axis.data()[0]); + } } output->mutable_data(ctx.GetPlace()); @@ -72,13 +83,13 @@ class GatherOpXPUKernel : public framework::OpKernel { r = xpu::gather( dev_ctx.x_context(), reinterpret_cast(x->data()), index->data(), reinterpret_cast(output->data()), - xshape, index->dims()[0], 0); + xshape, index->dims()[0], axis); } else { r = xpu::gather( dev_ctx.x_context(), reinterpret_cast(x->data()), index->data(), reinterpret_cast(output->data()), xshape, - index->dims()[0], 0); + index->dims()[0], axis); } PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, platform::errors::External( @@ -102,9 +113,19 @@ class GatherGradOpXPUKernel : public framework::OpKernel { auto *dout = ctx.Input(framework::GradVarName("Out")); auto &dev_ctx = ctx.template device_context(); + int axis = ctx.Attr("axis"); if (ctx.HasInput("Axis")) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Now, it doesn't support XPU with Axis.")); + Tensor cpu_axis; + const Tensor *axis_tensor = ctx.Input("Axis"); + framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); + const auto &axis_type = axis_tensor->dtype(); + if (framework::TransToProtoVarType(axis_type) == + framework::proto::VarType::INT32) { + axis = static_cast(cpu_axis.data()[0]); + } else if (framework::TransToProtoVarType(axis_type) == + framework::proto::VarType::INT64) { + axis = static_cast(cpu_axis.data()[0]); + } } if (dout->numel() == 0) { return; @@ -139,7 +160,7 @@ class GatherGradOpXPUKernel : public framework::OpKernel { dev_ctx.x_context(), reinterpret_cast(dout->data()), index->data(), reinterpret_cast(dx->data()), - xshape, index->dims()[0], 0, overwrite); + xshape, index->dims()[0], axis, overwrite); } else { xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); int *index_int_ptr_l3 = @@ -147,16 +168,17 @@ class GatherGradOpXPUKernel : public framework::OpKernel { r = xpu::cast_v2(dev_ctx.x_context(), index->data(), index_int_ptr_l3, index->numel()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(cast_v2) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(cast_v2) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); r = xpu::gather_grad( dev_ctx.x_context(), reinterpret_cast(dout->data()), index_int_ptr_l3, reinterpret_cast(dx->data()), xshape, index->dims()[0], - 0, overwrite); + axis, overwrite); } PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, platform::errors::External( diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu index f97eb3d5e9d9a..6c4a7a01f3fbb 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cu +++ b/paddle/fluid/operators/gather_scatter_kernel.cu @@ -132,10 +132,11 @@ struct gpu_gather_scatter_functor { int64_t grid = (n + block - 1) / block; auto stream = reinterpret_cast(ctx).stream(); - GatherScatterGPUKernel<<>>( - self_data, dim, index_data, src_data, inner_dim_size, select_dim_size, - replaced_select_dim_size, outer_dim_size, index_size, reduce_op); + GatherScatterGPUKernel + <<>>(self_data, dim, index_data, src_data, + inner_dim_size, select_dim_size, + replaced_select_dim_size, outer_dim_size, + index_size, reduce_op); } }; // struct gpu_gather_scatter_functor diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index c962dd065234f..676143bf01145 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/phi/kernels/funcs/gather.h" + #include #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/kernels/funcs/gather.h" TEST(Gather, GatherData) { paddle::framework::Tensor* src = new paddle::framework::Tensor(); diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index deac932d59b80..1e89091b202de 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/gaussian_random_op_xpu.cc b/paddle/fluid/operators/gaussian_random_op_xpu.cc index 5a1ac46f615d2..2ffc90fbd8c20 100644 --- a/paddle/fluid/operators/gaussian_random_op_xpu.cc +++ b/paddle/fluid/operators/gaussian_random_op_xpu.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc index 3be2606bfc939..080ceaa45e343 100644 --- a/paddle/fluid/operators/gelu_op.cc +++ b/paddle/fluid/operators/gelu_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/gelu_op_xpu.cc b/paddle/fluid/operators/gelu_op_xpu.cc index 559d2448ad945..408638f7d2cfc 100644 --- a/paddle/fluid/operators/gelu_op_xpu.cc +++ b/paddle/fluid/operators/gelu_op_xpu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cc b/paddle/fluid/operators/graph_khop_sampler_op.cc index c83ee25840605..edf7d20c6d5c8 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cc +++ b/paddle/fluid/operators/graph_khop_sampler_op.cc @@ -19,10 +19,11 @@ namespace operators { void InputShapeCheck(const framework::DDim& dims, std::string tensor_name) { if (dims.size() == 2) { - PADDLE_ENFORCE_EQ(dims[1], 1, platform::errors::InvalidArgument( - "The last dim of %s should be 1 when it " - "is 2D, but we get %d", - tensor_name, dims[1])); + PADDLE_ENFORCE_EQ(dims[1], 1, + platform::errors::InvalidArgument( + "The last dim of %s should be 1 when it " + "is 2D, but we get %d", + tensor_name, dims[1])); } else { PADDLE_ENFORCE_EQ( dims.size(), 1, diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index df977b43512a0..a63fdc89e24b2 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -26,6 +26,7 @@ limitations under the License. */ #include #include #include + #include #ifdef PADDLE_WITH_HIP @@ -217,15 +218,16 @@ void SampleNeighbors(const framework::ExecutionContext& ctx, const T* src, constexpr int TILE_SIZE = BLOCK_WARPS * 16; const dim3 block(WARP_SIZE, BLOCK_WARPS); const dim3 grid((bs + TILE_SIZE - 1) / TILE_SIZE); - GraphSampleNeighborsCUDAKernel<<< - grid, block, 0, - reinterpret_cast(ctx.device_context()) - .stream()>>>( - 0, k, bs, thrust::raw_pointer_cast(inputs->data()), src, dst_count, - src_eids, thrust::raw_pointer_cast(outputs->data()), - thrust::raw_pointer_cast(outputs_eids->data()), - thrust::raw_pointer_cast(output_ptr.data()), - thrust::raw_pointer_cast(output_idxs.data()), return_eids); + GraphSampleNeighborsCUDAKernel + <<( + ctx.device_context()) + .stream()>>>( + 0, k, bs, thrust::raw_pointer_cast(inputs->data()), src, dst_count, + src_eids, thrust::raw_pointer_cast(outputs->data()), + thrust::raw_pointer_cast(outputs_eids->data()), + thrust::raw_pointer_cast(output_ptr.data()), + thrust::raw_pointer_cast(output_idxs.data()), return_eids); // 5. Get inputs = outputs - inputs: if (!is_last_layer) { @@ -264,19 +266,19 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input, int grid_tmp = (num_input + block - 1) / block; int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; // 1. Insert data into keys and values. - BuildHashTable< - T><<( - ctx.device_context()) - .stream()>>>( + BuildHashTable<<( + ctx.device_context()) + .stream()>>>( input, num_input, len_hashtable, thrust::raw_pointer_cast(keys->data()), thrust::raw_pointer_cast(key_index->data())); // 2. Get item index count. thrust::device_vector item_count(num_input + 1, 0); - GetItemIndexCount< - T><<( - ctx.device_context()) - .stream()>>>( + GetItemIndexCount<<( + ctx.device_context()) + .stream()>>>( input, thrust::raw_pointer_cast(item_count.data()), num_input, len_hashtable, thrust::raw_pointer_cast(keys->data()), thrust::raw_pointer_cast(key_index->data())); @@ -287,16 +289,16 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input, unique_items->resize(total_unique_items); // 3. Get unique items. - FillUniqueItems< - T><<( - ctx.device_context()) - .stream()>>>( - input, num_input, len_hashtable, - thrust::raw_pointer_cast(unique_items->data()), - thrust::raw_pointer_cast(item_count.data()), - thrust::raw_pointer_cast(keys->data()), - thrust::raw_pointer_cast(values->data()), - thrust::raw_pointer_cast(key_index->data())); + FillUniqueItems + <<( + ctx.device_context()) + .stream()>>>(input, num_input, len_hashtable, + thrust::raw_pointer_cast(unique_items->data()), + thrust::raw_pointer_cast(item_count.data()), + thrust::raw_pointer_cast(keys->data()), + thrust::raw_pointer_cast(values->data()), + thrust::raw_pointer_cast(key_index->data())); } template @@ -337,23 +339,23 @@ void ReindexFunc(const framework::ExecutionContext& ctx, int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (outputs->size() + block - 1) / block; int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - ReindexSrcOutput< - T><<( - ctx.device_context()) - .stream()>>>( + ReindexSrcOutput<<( + ctx.device_context()) + .stream()>>>( thrust::raw_pointer_cast(outputs->data()), outputs->size(), size, thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(values.data())); int grid_ = (bs + block - 1) / block; - ReindexInputNodes<<( - ctx.device_context()) - .stream()>>>( - thrust::raw_pointer_cast(orig_nodes->data()), bs, - thrust::raw_pointer_cast(reindex_nodes->data()), size, - thrust::raw_pointer_cast(keys.data()), - thrust::raw_pointer_cast(values.data())); + ReindexInputNodes + <<( + ctx.device_context()) + .stream()>>>(thrust::raw_pointer_cast(orig_nodes->data()), bs, + thrust::raw_pointer_cast(reindex_nodes->data()), size, + thrust::raw_pointer_cast(keys.data()), + thrust::raw_pointer_cast(values.data())); } template @@ -532,15 +534,16 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { const dim3 block(WARP_SIZE, BLOCK_WARPS); const dim3 grid((unique_dst_size + TILE_SIZE - 1) / TILE_SIZE); - GetDstEdgeCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>( - unique_dst_size, - thrust::raw_pointer_cast(unique_dst_merge_reindex.data()), - thrust::raw_pointer_cast(dst_sample_counts_merge.data()), - thrust::raw_pointer_cast(dst_ptr.data()), - thrust::raw_pointer_cast(dst_merge.data())); + GetDstEdgeCUDAKernel + <<( + ctx.device_context()) + .stream()>>>( + unique_dst_size, + thrust::raw_pointer_cast(unique_dst_merge_reindex.data()), + thrust::raw_pointer_cast(dst_sample_counts_merge.data()), + thrust::raw_pointer_cast(dst_ptr.data()), + thrust::raw_pointer_cast(dst_merge.data())); // 8. Give operator's outputs. auto* out_src = ctx.Output("Out_Src"); diff --git a/paddle/fluid/operators/graph_khop_sampler_op.h b/paddle/fluid/operators/graph_khop_sampler_op.h index d7121cb549370..1005a6ab11cc0 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.h +++ b/paddle/fluid/operators/graph_khop_sampler_op.h @@ -15,10 +15,12 @@ limitations under the License. */ #pragma once #include + #include #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 4331523d26edc..4d989ed1f2ec0 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/group_norm_op.h" + #include #include #include @@ -234,7 +235,6 @@ class GroupNormGradMaker : public framework::SingleGradOpMaker { } }; -DECLARE_INPLACE_OP_INFERER(GroupNormInplaceInferer, {"X", "Y"}); DECLARE_INPLACE_OP_INFERER(GroupNormGradInplaceInferer, {framework::GradVarName("Y"), framework::GradVarName("X")}); @@ -256,8 +256,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker, ops::GroupNormOpInferVarType, ops::GroupNormGradMaker, - ops::GroupNormGradMaker, - ops::GroupNormInplaceInferer); + ops::GroupNormGradMaker); REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp, ops::GroupNormGradInplaceInferer); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index bb8031b0cc4e6..84eb2fbc7d31f 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -322,9 +322,9 @@ class GroupNormKernel ScalarGetMeanAndVarNCHW<<>>( x_data, mean_data, temp_var_data, size); } else { - VectorizedGetMeanAndVarNCHW< - T, AccT, vec_size><<>>( - x_data, mean_data, temp_var_data, size); + VectorizedGetMeanAndVarNCHW + <<>>(x_data, mean_data, + temp_var_data, size); } } else { set_zero(dev_ctx, mean, static_cast(0)); @@ -613,16 +613,16 @@ class GroupNormGradKernel } block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); dim3 blocks(block_size_nchw); - ScalarGetDsDbCUDAKernel< - T><<>>( - imsize, x_data, dy_data, ds_data, db_data); + ScalarGetDsDbCUDAKernel + <<>>( + imsize, x_data, dy_data, ds_data, db_data); if (d_scale || d_bias) { const int block = 256; - GetScaleBiasGradientCUDAKernel< - T><<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>( - x_dims[0], C, groups, epsilon, mean_data, var_data, ds_data, - db_data, d_scale_data, d_bias_data); + GetScaleBiasGradientCUDAKernel + <<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>( + x_dims[0], C, groups, epsilon, mean_data, var_data, ds_data, + db_data, d_scale_data, d_bias_data); } if (d_x_data != nullptr) { @@ -639,10 +639,10 @@ class GroupNormGradKernel T* p2_data = p2.data(); T* p3_data = p3.data(); - GetBackwardParamsCUDAKernel<<< - dim3(x_dims[0], groups), block_dims, 0, dev_ctx.stream()>>>( - imsize, groups, group_size, epsilon, mean_data, var_data, - scale_data, ds_data, db_data, p1_data, p2_data, p3_data); + GetBackwardParamsCUDAKernel + <<>>( + imsize, groups, group_size, epsilon, mean_data, var_data, + scale_data, ds_data, db_data, p1_data, p2_data, p3_data); GetXGradientCUDAKernel<<>>( imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data, dy_data, d_x_data); diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h index 2d80ab89471fc..28a3ad2a8e1ee 100644 --- a/paddle/fluid/operators/group_norm_op.h +++ b/paddle/fluid/operators/group_norm_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc index 8de8647186ed3..dfc509941bc2d 100644 --- a/paddle/fluid/operators/group_norm_op_npu.cc +++ b/paddle/fluid/operators/group_norm_op_npu.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/group_norm_op.h" #include + +#include "paddle/fluid/operators/group_norm_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index 58cbdfda34799..21ad5914c5d4d 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/gru_op.h" + #include #include + #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index 852655034c8c2..4cc6c65983fe9 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc index 8998c51f0df62..b6d9ef50f83e8 100644 --- a/paddle/fluid/operators/gru_unit_op.cc +++ b/paddle/fluid/operators/gru_unit_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/gru_unit_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index 291f5f4ad2673..2dd1515919b3b 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -77,9 +77,9 @@ class GRUUnitKernel : public framework::OpKernel { // calculate unactivated gate outputs if (bias) { auto b = framework::EigenMatrix::From(*bias); - g.device(place) = x + - b.reshape(Eigen::array({{1, frame_size * 3}})) - .broadcast(Eigen::array({{batch_size, 1}})); + g.device(place) = + x + b.reshape(Eigen::array({{1, frame_size * 3}})) + .broadcast(Eigen::array({{batch_size, 1}})); } else { g.device(place) = x; } diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index cce80518354d7..f72fe9282abb6 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/hinge_loss_op.h" + #include #include #include diff --git a/paddle/fluid/operators/huber_loss_op_xpu.cc b/paddle/fluid/operators/huber_loss_op_xpu.cc index ccddec2779515..2fafd18621528 100644 --- a/paddle/fluid/operators/huber_loss_op_xpu.cc +++ b/paddle/fluid/operators/huber_loss_op_xpu.cc @@ -39,10 +39,11 @@ class HuberLossXPUKernel : public framework::OpKernel { ctx.template device_context(); int r = xpu::huber_loss(dev_ctx.x_context(), in0_data, in1_data, residual_data, out_data, in0->numel(), 1, delta); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(huber_loss) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(huber_loss) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); } }; diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index d248857b8f42f..107384742bbdd 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/im2sequence_op.h" + #include #include #include diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index b0c4b9b4a99a5..218161fd00aaa 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h index bb26e2f445e70..d8417e42e1bf7 100644 --- a/paddle/fluid/operators/index_impl.cu.h +++ b/paddle/fluid/operators/index_impl.cu.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -73,16 +74,16 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { size_t main_offset = (numel / (vec_size * block)) * vec_size * block; switch (vec_size) { case 4: - VectorizedIndexKernel<<>>( - out_data, numel, main_offset, func); + VectorizedIndexKernel + <<>>(out_data, numel, main_offset, func); break; case 2: - VectorizedIndexKernel<<>>( - out_data, numel, main_offset, func); + VectorizedIndexKernel + <<>>(out_data, numel, main_offset, func); break; case 1: - VectorizedIndexKernel<<>>( - out_data, numel, main_offset, func); + VectorizedIndexKernel + <<>>(out_data, numel, main_offset, func); break; default: { PADDLE_THROW(paddle::platform::errors::Unimplemented( diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc index d17c6368c7537..15fc0f6d14fe4 100644 --- a/paddle/fluid/operators/index_sample_op.cc +++ b/paddle/fluid/operators/index_sample_op.cc @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/binary.h" namespace paddle { diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index 684829be2697c..c82aaab0fe1c2 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index d420d0319bfe4..6cb8d664d8022 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/operators/inplace_abn_op.h" + #include #include #include + #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/phi/kernels/batch_norm_grad_kernel.h" #include "paddle/phi/kernels/batch_norm_kernel.h" @@ -38,18 +40,21 @@ class InplaceABNOp : public paddle::operators::BatchNormOp { if (input_data_type == framework::proto::VarType::FP64) { bn_param_type = framework::proto::VarType::FP64; } - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Scale")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Bias")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Mean")->dtype()), - platform::errors::InvalidArgument( - "Mean input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Scale")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Bias")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Mean")->dtype()), + platform::errors::InvalidArgument( + "Mean input should be of float type")); PADDLE_ENFORCE_EQ( bn_param_type, framework::TransToProtoVarType(ctx.Input("Variance")->dtype()), @@ -209,8 +214,9 @@ class InplaceABNKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* y = ctx.Output("Y"); - PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( - "X and Y not inplaced in inplace mode")); + PADDLE_ENFORCE_EQ(x, y, + platform::errors::InvalidArgument( + "X and Y not inplaced in inplace mode")); auto activation = GetInplaceABNActivationType(ctx.Attr("activation")); auto& place = *ctx.template device_context().eigen_device(); diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index 6476023fcd20e..7245629e565e9 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -28,8 +28,9 @@ class InplaceABNKernel void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Output("Y"); auto* x = ctx.Input("X"); - PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( - "X and Y not inplaced in inplace mode")); + PADDLE_ENFORCE_EQ(x, y, + platform::errors::InvalidArgument( + "X and Y not inplaced in inplace mode")); auto activation = GetInplaceABNActivationType(ctx.Attr("activation")); auto& place = *ctx.template device_context().eigen_device(); diff --git a/paddle/fluid/operators/inplace_abn_op.h b/paddle/fluid/operators/inplace_abn_op.h index 942404978584d..275209911d18e 100644 --- a/paddle/fluid/operators/inplace_abn_op.h +++ b/paddle/fluid/operators/inplace_abn_op.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index de92de453a354..21ccf777051c2 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/instance_norm_op.h" + #include #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -38,16 +40,18 @@ framework::OpKernelType InstanceNormOp::GetExpectedKernelType( in_param_type = framework::proto::VarType::FP64; } if (ctx.HasInput("Scale")) { - PADDLE_ENFORCE_EQ(in_param_type, framework::TransToProtoVarType( - ctx.Input("Scale")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + in_param_type, + framework::TransToProtoVarType(ctx.Input("Scale")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); } if (ctx.HasInput("Bias")) { - PADDLE_ENFORCE_EQ(in_param_type, framework::TransToProtoVarType( - ctx.Input("Bias")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + in_param_type, + framework::TransToProtoVarType(ctx.Input("Bias")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); } return framework::OpKernelType(input_data_type, ctx.GetPlace()); diff --git a/paddle/fluid/operators/instance_norm_op.h b/paddle/fluid/operators/instance_norm_op.h index 265e4acef0d7a..3f99cdf10c64b 100644 --- a/paddle/fluid/operators/instance_norm_op.h +++ b/paddle/fluid/operators/instance_norm_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index fda168c94e1e0..3c746d7c08a1a 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -10,9 +10,11 @@ limitations under the License. */ #include "paddle/fluid/operators/interpolate_op.h" + #include #include #include + #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -112,11 +114,12 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE_EQ("bilinear" == interp_method || "nearest" == interp_method || "bicubic" == interp_method, - true, platform::errors::InvalidArgument( - "Interpolation method can only be \"bilinear\" " - "or \"nearest\" or \"bicubic\" when " - "Input(X) dimension is 4, but got method is %s.", - interp_method)); + true, + platform::errors::InvalidArgument( + "Interpolation method can only be \"bilinear\" " + "or \"nearest\" or \"bicubic\" when " + "Input(X) dimension is 4, but got method is %s.", + interp_method)); const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 8a63c9a394638..729eba43d7264 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -11,6 +11,7 @@ #include #include + #include "paddle/fluid/operators/interpolate_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -860,9 +861,10 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, out_w = size_data[0]; } } - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, + platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { dim_out = {n, c, out_w}; @@ -942,12 +944,14 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, out_w = size_data[1]; } } - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_h, 0, + platform::errors::InvalidArgument( + "out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, + platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { @@ -984,21 +988,21 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("nearest" == interp_method) { - KeNearestNeighborInterpFw< - T><<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + KeNearestNeighborInterpFw + <<>>( + input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } else if ("bilinear" == interp_method) { KeBilinearInterpFw<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); } else if ("bicubic" == interp_method) { - KeBicubicInterpFw<<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + KeBicubicInterpFw + <<>>( + input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } } @@ -1051,15 +1055,18 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, out_w = size_data[2]; } } - PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument( - "out_d in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_d, 0, + platform::errors::InvalidArgument( + "out_d in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_h, 0, + platform::errors::InvalidArgument( + "out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, + platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { @@ -1271,11 +1278,11 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("nearest" == interp_method) { - KeNearestNeighborInterpBw< - T><<>>( - input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, - n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + KeNearestNeighborInterpBw + <<>>( + input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, + out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } else if ("bilinear" == interp_method) { KeBilinearInterpBw<<>>( @@ -1283,10 +1290,10 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); } else if ("bicubic" == interp_method) { - KeBicubicInterpBw<<>>( - input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, - n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + KeBicubicInterpBw + <<>>( + input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, + out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } } diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 57b5eb553cc4c..18caed22b4855 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -13,6 +13,7 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -808,9 +809,10 @@ static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, out_w = out_size_data[0]; } } - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, + platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { dim_out = {n, c, out_w}; @@ -876,12 +878,14 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, out_w = out_size_data[1]; } } - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_h, 0, + platform::errors::InvalidArgument( + "out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, + platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { dim_out = {n, c, out_h, out_w}; @@ -964,15 +968,18 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, out_w = out_size_data[2]; } } - PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument( - "out_d in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_d, 0, + platform::errors::InvalidArgument( + "out_d in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_h, 0, + platform::errors::InvalidArgument( + "out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, + platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { diff --git a/paddle/fluid/operators/interpolate_op_npu.cc b/paddle/fluid/operators/interpolate_op_npu.cc old mode 100755 new mode 100644 index f83f149b87c31..0cbac393af504 --- a/paddle/fluid/operators/interpolate_op_npu.cc +++ b/paddle/fluid/operators/interpolate_op_npu.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/interpolate_op.h" #include #include + +#include "paddle/fluid/operators/interpolate_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/interpolate_op_xpu.cc b/paddle/fluid/operators/interpolate_op_xpu.cc index 9576dc8452463..09780505ac2ce 100644 --- a/paddle/fluid/operators/interpolate_op_xpu.cc +++ b/paddle/fluid/operators/interpolate_op_xpu.cc @@ -111,14 +111,16 @@ class InterpolateXPUKernel : public framework::OpKernel { out_w = out_size_data[1]; } } - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of " - "Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of " - "Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_h, 0, + platform::errors::InvalidArgument("out_h in Attr(out_shape) of " + "Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_w, 0, + platform::errors::InvalidArgument("out_w in Attr(out_shape) of " + "Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { dim_out = {n, c, out_h, out_w}; diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index d0d7b7694fc3a..6bac35ee1d455 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -40,10 +40,11 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); for (int i = 0; i < dim_x.size(); ++i) { - PADDLE_ENFORCE_NE(dim_x[i], 0, platform::errors::InvalidArgument( - "The shape of input(x) should be larged " - "than 0, bug received shape[%d] is %d ", - i, dim_x[i])); + PADDLE_ENFORCE_NE(dim_x[i], 0, + platform::errors::InvalidArgument( + "The shape of input(x) should be larged " + "than 0, bug received shape[%d] is %d ", + i, dim_x[i])); } if (ctx->HasInputs("SizeTensor")) { // top prority size @@ -144,10 +145,11 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { ctx->Attrs().Get("data_layout")); for (int i = 0; i < dim_x.size(); ++i) { - PADDLE_ENFORCE_NE(dim_x[i], 0, platform::errors::InvalidArgument( - "The shape of input(x) should be larged " - "than 0, bug received shape[%d] is %d ", - i, dim_x[i])); + PADDLE_ENFORCE_NE(dim_x[i], 0, + platform::errors::InvalidArgument( + "The shape of input(x) should be larged " + "than 0, bug received shape[%d] is %d ", + i, dim_x[i])); } if (ctx->HasInputs("SizeTensor")) { @@ -263,10 +265,11 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { ctx->Attrs().Get("data_layout")); for (int i = 0; i < dim_x.size(); ++i) { - PADDLE_ENFORCE_NE(dim_x[i], 0, platform::errors::InvalidArgument( - "The shape of input(x) should be larged " - "than 0, bug received shape[%d] is %d ", - i, dim_x[i])); + PADDLE_ENFORCE_NE(dim_x[i], 0, + platform::errors::InvalidArgument( + "The shape of input(x) should be larged " + "than 0, bug received shape[%d] is %d ", + i, dim_x[i])); } if (ctx->HasInputs("SizeTensor")) { diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc index 615b5ea142b58..97f39aa490264 100644 --- a/paddle/fluid/operators/interpolate_v2_op_npu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/device/npu/npu_op_runner.h" - #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/funcs/interpolate_function.h" namespace paddle { diff --git a/paddle/fluid/operators/interpolate_v2_op_xpu.cc b/paddle/fluid/operators/interpolate_v2_op_xpu.cc index 9cbfc95158348..9d52c9a865ea7 100644 --- a/paddle/fluid/operators/interpolate_v2_op_xpu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_xpu.cc @@ -114,14 +114,16 @@ class InterpolateV2XPUKernel : public framework::OpKernel { out_w = out_size_data[1]; } } - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of " - "Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of " - "Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_h, 0, + platform::errors::InvalidArgument("out_h in Attr(out_shape) of " + "Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_w, 0, + platform::errors::InvalidArgument("out_w in Attr(out_shape) of " + "Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { dim_out = {n, c, out_h, out_w}; diff --git a/paddle/fluid/operators/inverse_op.cc b/paddle/fluid/operators/inverse_op.cc index f5b817a0e11fa..c4f3fbb2ca772 100644 --- a/paddle/fluid/operators/inverse_op.cc +++ b/paddle/fluid/operators/inverse_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/inverse_op.h" + #include #include diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 2e770f9852569..456c1c2d44f3e 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -120,15 +120,16 @@ namespace ops = paddle::operators; paddle::framework::EmptyGradOpMaker, \ paddle::framework::EmptyGradOpMaker) -#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor) \ - REGISTER_OP_CPU_KERNEL( \ - op_type, ops::OverflowKernel, \ - ops::OverflowKernel, \ - ops::OverflowKernel, \ - ops::OverflowKernel, \ + ops::OverflowKernel, \ + ops::OverflowKernel, \ + ops::OverflowKernel); REGISTER_OP_MAKER(isinf, "isinf(X)"); diff --git a/paddle/fluid/operators/isfinite_op.cu b/paddle/fluid/operators/isfinite_op.cu index e233e37136490..d1437d5b44d6f 100644 --- a/paddle/fluid/operators/isfinite_op.cu +++ b/paddle/fluid/operators/isfinite_op.cu @@ -18,8 +18,9 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( - isinf, ops::OverflowKernel, + isinf, + ops::OverflowKernel, ops::OverflowKernel, ops::OverflowKernel); REGISTER_OP_CUDA_KERNEL( - isfinite, ops::OverflowKernel, + isfinite, + ops::OverflowKernel, ops::OverflowKernel, ops::OverflowKernel(upper - lower), (n - 1))); PADDLE_ENFORCE_GT( - n, 0, paddle::platform::errors::InvalidArgument( - "The Sgd size should be larger than 0. But the n is %d.", n)); + n, 0, + paddle::platform::errors::InvalidArgument( + "The Sgd size should be larger than 0. But the n is %d.", n)); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index ab8829b7baf5f..60e29ea81d5eb 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -1,38 +1,45 @@ +file( + GLOB jitcode_cc_srcs + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "*.cc") -file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") - -cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak) -set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE) +cc_library( + jit_kernel_jitcode + SRCS ${jitcode_cc_srcs} + DEPS jit_kernel_base xbyak) +set(JIT_KERNEL_DEPS + ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode + PARENT_SCOPE) function(USE_JITKERNEL_GEN TARGET) - file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n") + file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n") endfunction() # use gen jitcode kernel by name -USE_JITKERNEL_GEN(kMatMul) -USE_JITKERNEL_GEN(kVMul) -USE_JITKERNEL_GEN(kVAdd) -USE_JITKERNEL_GEN(kVSub) -USE_JITKERNEL_GEN(kVAddRelu) -USE_JITKERNEL_GEN(kVScal) -USE_JITKERNEL_GEN(kVAddBias) -USE_JITKERNEL_GEN(kVRelu) -USE_JITKERNEL_GEN(kVSquare) -USE_JITKERNEL_GEN(kVIdentity) -USE_JITKERNEL_GEN(kVExp) -USE_JITKERNEL_GEN(kVSigmoid) -USE_JITKERNEL_GEN(kVTanh) -USE_JITKERNEL_GEN(kLSTMCtHt) -USE_JITKERNEL_GEN(kLSTMC1H1) -USE_JITKERNEL_GEN(kGRUH1) -USE_JITKERNEL_GEN(kGRUHtPart1) -USE_JITKERNEL_GEN(kGRUHtPart2) -USE_JITKERNEL_GEN(kNCHW16CMulNC) -USE_JITKERNEL_GEN(kSeqPool) -USE_JITKERNEL_GEN(kHMax) -USE_JITKERNEL_GEN(kHSum) -USE_JITKERNEL_GEN(kEmbSeqPool) -USE_JITKERNEL_GEN(kAdam) -USE_JITKERNEL_GEN(kAdamW) -USE_JITKERNEL_GEN(kSgd) -USE_JITKERNEL_GEN(kVBroadcast) +use_jitkernel_gen(kMatMul) +use_jitkernel_gen(kVMul) +use_jitkernel_gen(kVAdd) +use_jitkernel_gen(kVSub) +use_jitkernel_gen(kVAddRelu) +use_jitkernel_gen(kVScal) +use_jitkernel_gen(kVAddBias) +use_jitkernel_gen(kVRelu) +use_jitkernel_gen(kVSquare) +use_jitkernel_gen(kVIdentity) +use_jitkernel_gen(kVExp) +use_jitkernel_gen(kVSigmoid) +use_jitkernel_gen(kVTanh) +use_jitkernel_gen(kLSTMCtHt) +use_jitkernel_gen(kLSTMC1H1) +use_jitkernel_gen(kGRUH1) +use_jitkernel_gen(kGRUHtPart1) +use_jitkernel_gen(kGRUHtPart2) +use_jitkernel_gen(kNCHW16CMulNC) +use_jitkernel_gen(kSeqPool) +use_jitkernel_gen(kHMax) +use_jitkernel_gen(kHSum) +use_jitkernel_gen(kEmbSeqPool) +use_jitkernel_gen(kAdam) +use_jitkernel_gen(kAdamW) +use_jitkernel_gen(kSgd) +use_jitkernel_gen(kVBroadcast) diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc index 677e9979399c5..5a73e3c56d511 100644 --- a/paddle/fluid/operators/jit/gen/act.cc +++ b/paddle/fluid/operators/jit/gen/act.cc @@ -122,9 +122,8 @@ bool VTanhCreator::CanBeUsed(const int& d) const { } size_t VReluCreator::CodeSize(const int& d) const { - return 96 /* init size */ + - (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ * - 8 /* average bytes for each instruction */; + return 96 /* init size */ + (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ * + 8 /* average bytes for each instruction */; } size_t VSquareCreator::CodeSize(const int& d) const { diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index bd84368a57388..24434c5993bbb 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc index 3b2139c9ed025..9c859229c5a88 100644 --- a/paddle/fluid/operators/jit/gen/matmul.cc +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -122,20 +122,23 @@ class MatMulCreator : public JitCodeCreator { std::unique_ptr CreateJitCode( const matmul_attr_t& attr) const override { PADDLE_ENFORCE_GT( - attr.m, 0, platform::errors::InvalidArgument( - "The attribute m (first matrix's row) of MatMul should " - "be larger than 0. But it is %d.", - attr.m)); + attr.m, 0, + platform::errors::InvalidArgument( + "The attribute m (first matrix's row) of MatMul should " + "be larger than 0. But it is %d.", + attr.m)); PADDLE_ENFORCE_GT( - attr.n, 0, platform::errors::InvalidArgument( - "The attribute n (first matrix's col) of MatMul should " - "be larger than 0. But it is %d.", - attr.n)); + attr.n, 0, + platform::errors::InvalidArgument( + "The attribute n (first matrix's col) of MatMul should " + "be larger than 0. But it is %d.", + attr.n)); PADDLE_ENFORCE_GT( - attr.k, 0, platform::errors::InvalidArgument( - "The attribute k (second matrix's col) of MatMul should " - "be larger than 0. But it is %d.", - attr.k)); + attr.k, 0, + platform::errors::InvalidArgument( + "The attribute k (second matrix's col) of MatMul should " + "be larger than 0. But it is %d.", + attr.k)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h index eb7328d7e069c..af62632634024 100644 --- a/paddle/fluid/operators/jit/gen/matmul.h +++ b/paddle/fluid/operators/jit/gen/matmul.h @@ -15,6 +15,7 @@ #pragma once #include // for malloc and free + #include #include @@ -33,10 +34,11 @@ class MatMulJitCode : public JitCode { size_t code_size = 256 * 1024, void* code_ptr = nullptr) : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { - PADDLE_ENFORCE_EQ(m_, 1, platform::errors::Unimplemented( - "Jitcode of matmul only support m==1 (first " - "matrix's row) now. But m is %d.", - m_)); + PADDLE_ENFORCE_EQ(m_, 1, + platform::errors::Unimplemented( + "Jitcode of matmul only support m==1 (first " + "matrix's row) now. But m is %d.", + m_)); this->genCode(); } diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index 52fdf04f3f677..4788050a14cd7 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -62,22 +62,23 @@ class SeqPoolCreator : public JitCodeCreator { return platform::MayIUse(platform::avx); } size_t CodeSize(const seq_pool_attr_t& attr) const override { - return 96 + - ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) * - 4 /* load, mul and save */ + - 256) * - 16; + return 96 + ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) * + 4 /* load, mul and save */ + + 256) * + 16; } std::unique_ptr CreateJitCode( const seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.w, 0, platform::errors::InvalidArgument( - "The attribute width of SeqPool should " - "be larger than 0. But it is %d.", - attr.w)); - PADDLE_ENFORCE_GT(attr.h, 0, platform::errors::InvalidArgument( - "The attribute height of SeqPool should " - "be larger than 0. But it is %d.", - attr.h)); + PADDLE_ENFORCE_GT(attr.w, 0, + platform::errors::InvalidArgument( + "The attribute width of SeqPool should " + "be larger than 0. But it is %d.", + attr.w)); + PADDLE_ENFORCE_GT(attr.h, 0, + platform::errors::InvalidArgument( + "The attribute height of SeqPool should " + "be larger than 0. But it is %d.", + attr.h)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index 5baafa11cfea0..2a3c347c16a25 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/operators/jit/gen_base.h" #include + #include "paddle/fluid/memory/allocation/cpu_allocator.h" // for posix_memalign #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index c22a7f3ec9292..761c52b7d7c79 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -17,8 +17,8 @@ #include // for unique_ptr #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/operators/jit/kernel_base.h" DECLARE_bool(dump_jitcode); diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 46da6fba2e98a..07d69658632a6 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -13,7 +13,9 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/helper.h" + #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -112,10 +114,11 @@ void pack_weights(const float* src, float* dst, int n, int k) { int block, rest; const auto groups = packed_groups(n, k, &block, &rest); std::for_each(groups.begin(), groups.end(), [&](int i) { - PADDLE_ENFORCE_GT(i, 0, platform::errors::InvalidArgument( - "Each element of groups should be larger than " - "0. However the element: %d doesn't satify.", - i)); + PADDLE_ENFORCE_GT(i, 0, + platform::errors::InvalidArgument( + "Each element of groups should be larger than " + "0. However the element: %d doesn't satify.", + i)); }); int sum = std::accumulate(groups.begin(), groups.end(), 0); std::memset(dst, 0, k * sum * block * sizeof(float)); diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 9a48d9c3c8d6c..0389828b49537 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/operators/jit/macro.h" #include "paddle/fluid/platform/macros.h" diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 4f652002bc745..528aec9ace1d3 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/kernel_key.h" + #include // XXH64: 13.8 GB/s namespace paddle { diff --git a/paddle/fluid/operators/jit/more/CMakeLists.txt b/paddle/fluid/operators/jit/more/CMakeLists.txt index fa503356baa73..0851ca065b53d 100644 --- a/paddle/fluid/operators/jit/more/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/CMakeLists.txt @@ -1,17 +1,18 @@ - function(USE_JITKERNEL_MORE TARGET TYPE) - file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n") + file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n") endfunction() if(WITH_MKLML) - add_subdirectory(mkl) + add_subdirectory(mkl) endif() if(WITH_AVX) - add_subdirectory(intrinsic) + add_subdirectory(intrinsic) endif() # mix should be last add_subdirectory(mix) -set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} PARENT_SCOPE) +set(JIT_KERNEL_DEPS + ${JIT_KERNEL_DEPS} + PARENT_SCOPE) diff --git a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt index 468937a4f6b27..c6222c9b29b3b 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt @@ -1,9 +1,16 @@ +file( + GLOB jit_kernel_cc_intrinsic + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "*.cc") +cc_library( + jit_kernel_intrinsic + SRCS ${jit_kernel_cc_intrinsic} + DEPS jit_kernel_base) -file(GLOB jit_kernel_cc_intrinsic RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -cc_library(jit_kernel_intrinsic SRCS ${jit_kernel_cc_intrinsic} DEPS jit_kernel_base) - -set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE) +set(JIT_KERNEL_DEPS + ${JIT_KERNEL_DEPS} jit_kernel_intrinsic + PARENT_SCOPE) # use mkl kernels by name and type -USE_JITKERNEL_MORE(kCRFDecoding, intrinsic) -USE_JITKERNEL_MORE(kLayerNorm, intrinsic) +use_jitkernel_more(kCRFDecoding, intrinsic) +use_jitkernel_more(kLayerNorm, intrinsic) diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc index 7e1f7ab8bf8b0..f11a690523bf8 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc @@ -13,7 +13,9 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h" + #include + #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc index 61d8c50c56825..ef8fe6963c045 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc +++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc @@ -13,7 +13,9 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/more/intrinsic/layer_norm.h" + #include + #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt index dd039d2915296..b5bc6c8457577 100644 --- a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt @@ -1,15 +1,21 @@ +file( + GLOB jit_kernel_mix_cc + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "*.cc") +cc_library( + jit_kernel_mix + SRCS ${jit_kernel_mix_cc} + DEPS jit_kernel_base) +set(JIT_KERNEL_DEPS + ${JIT_KERNEL_DEPS} jit_kernel_mix + PARENT_SCOPE) -file(GLOB jit_kernel_mix_cc RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -cc_library(jit_kernel_mix SRCS ${jit_kernel_mix_cc} DEPS jit_kernel_base) - -set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_mix PARENT_SCOPE) - -USE_JITKERNEL_MORE(kVSigmoid, mix) -USE_JITKERNEL_MORE(kVTanh, mix) -USE_JITKERNEL_MORE(kLSTMCtHt, mix) -USE_JITKERNEL_MORE(kLSTMC1H1, mix) -USE_JITKERNEL_MORE(kGRUH1, mix) -USE_JITKERNEL_MORE(kGRUHtPart1, mix) -USE_JITKERNEL_MORE(kGRUHtPart2, mix) -USE_JITKERNEL_MORE(kSoftmax, mix) +use_jitkernel_more(kVSigmoid, mix) +use_jitkernel_more(kVTanh, mix) +use_jitkernel_more(kLSTMCtHt, mix) +use_jitkernel_more(kLSTMC1H1, mix) +use_jitkernel_more(kGRUH1, mix) +use_jitkernel_more(kGRUHtPart1, mix) +use_jitkernel_more(kGRUHtPart2, mix) +use_jitkernel_more(kSoftmax, mix) diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index a4459cee5b8a3..f0008d4152f53 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/more/mix/mix.h" + #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/registry.h" diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index 56f1a62ad4e06..609ddd3c284c8 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -1,20 +1,24 @@ - -cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml) -set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE) +cc_library( + jit_kernel_mkl + SRCS mkl.cc + DEPS jit_kernel_base dynload_mklml) +set(JIT_KERNEL_DEPS + ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl + PARENT_SCOPE) # use mkl kernels by name and type -USE_JITKERNEL_MORE(kMatMul, mkl) -USE_JITKERNEL_MORE(kVMul, mkl) -USE_JITKERNEL_MORE(kVAdd, mkl) -USE_JITKERNEL_MORE(kVScal, mkl) -USE_JITKERNEL_MORE(kStrideScal, mkl) -USE_JITKERNEL_MORE(kVExp, mkl) -USE_JITKERNEL_MORE(kVSquare, mkl) -USE_JITKERNEL_MORE(kVCopy, mkl) -USE_JITKERNEL_MORE(kVSigmoid, mkl) -USE_JITKERNEL_MORE(kVTanh, mkl) -USE_JITKERNEL_MORE(kSeqPool, mkl) -USE_JITKERNEL_MORE(kSoftmax, mkl) -USE_JITKERNEL_MORE(kEmbSeqPool, mkl) -USE_JITKERNEL_MORE(kSgd, mkl) -USE_JITKERNEL_MORE(kVBroadcast, mkl) +use_jitkernel_more(kMatMul, mkl) +use_jitkernel_more(kVMul, mkl) +use_jitkernel_more(kVAdd, mkl) +use_jitkernel_more(kVScal, mkl) +use_jitkernel_more(kStrideScal, mkl) +use_jitkernel_more(kVExp, mkl) +use_jitkernel_more(kVSquare, mkl) +use_jitkernel_more(kVCopy, mkl) +use_jitkernel_more(kVSigmoid, mkl) +use_jitkernel_more(kVTanh, mkl) +use_jitkernel_more(kSeqPool, mkl) +use_jitkernel_more(kSoftmax, mkl) +use_jitkernel_more(kEmbSeqPool, mkl) +use_jitkernel_more(kSgd, mkl) +use_jitkernel_more(kVBroadcast, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 75ebddb125989..16bf045aa6671 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/more/mkl/mkl.h" + #include "paddle/fluid/operators/jit/refer/refer.h" #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 5f3c29ad5efb8..ad04b4618cb41 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -117,10 +117,11 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, "The idx shoud be lower than the attribute table_height of " "EmbSeqPool. But %dth of idx is %d and table_height is %d.", i, idx[i], attr->table_height)); - PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument( - "The idx shoud be equal to or larger than " - "the 0. But %dth of idx is %d.", - i, idx[i])); + PADDLE_ENFORCE_GE(idx[i], 0, + platform::errors::InvalidArgument( + "The idx shoud be equal to or larger than " + "the 0. But %dth of idx is %d.", + i, idx[i])); }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -204,11 +205,12 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, "less than the attribute. But %dth of rows " "is %d and grad_width is %d.", i, h_idx, attr->param_height)); - PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( - "The rows of Sgd should be " - "larger than 0. But %dth of rows " - "is %d.", - i, h_idx)); + PADDLE_ENFORCE_GE( + h_idx, 0, + platform::errors::InvalidArgument("The rows of Sgd should be " + "larger than 0. But %dth of rows " + "is %d.", + i, h_idx)); VAXPY(scalar, grad + i * width, out + h_idx * width, width); } } else { @@ -220,11 +222,12 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, "less than the attribute. But %dth of rows " "is %d and grad_width is %d.", i, h_idx, attr->param_height)); - PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( - "The rows of Sgd should be " - "larger than 0. But %dth of rows " - "is %d.", - i, h_idx)); + PADDLE_ENFORCE_GE( + h_idx, 0, + platform::errors::InvalidArgument("The rows of Sgd should be " + "larger than 0. But %dth of rows " + "is %d.", + i, h_idx)); VScal(&scalar, grad + i * width, out + h_idx * width, width); VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width, width); diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index a1ee4508f7241..5ef93f989df31 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -1,42 +1,46 @@ - -cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base) -set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE) +cc_library( + jit_kernel_refer + SRCS refer.cc + DEPS jit_kernel_base) +set(JIT_KERNEL_DEPS + ${JIT_KERNEL_DEPS} jit_kernel_refer + PARENT_SCOPE) function(USE_JITKERNEL_REFER TARGET) - file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n") + file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n") endfunction() # use refer kernel by name -USE_JITKERNEL_REFER(kVMul) -USE_JITKERNEL_REFER(kVAdd) -USE_JITKERNEL_REFER(kVAddRelu) -USE_JITKERNEL_REFER(kVSub) -USE_JITKERNEL_REFER(kVScal) -USE_JITKERNEL_REFER(kStrideScal) -USE_JITKERNEL_REFER(kVAddBias) -USE_JITKERNEL_REFER(kVCopy) -USE_JITKERNEL_REFER(kVRelu) -USE_JITKERNEL_REFER(kVIdentity) -USE_JITKERNEL_REFER(kVExp) -USE_JITKERNEL_REFER(kVSigmoid) -USE_JITKERNEL_REFER(kVTanh) -USE_JITKERNEL_REFER(kLSTMCtHt) -USE_JITKERNEL_REFER(kLSTMC1H1) -USE_JITKERNEL_REFER(kGRUH1) -USE_JITKERNEL_REFER(kGRUHtPart1) -USE_JITKERNEL_REFER(kGRUHtPart2) -USE_JITKERNEL_REFER(kCRFDecoding) -USE_JITKERNEL_REFER(kLayerNorm) -USE_JITKERNEL_REFER(kNCHW16CMulNC) -USE_JITKERNEL_REFER(kSeqPool) -USE_JITKERNEL_REFER(kMatMul) -USE_JITKERNEL_REFER(kVSquare) -USE_JITKERNEL_REFER(kHSum) -USE_JITKERNEL_REFER(kHMax) -USE_JITKERNEL_REFER(kStrideASum) -USE_JITKERNEL_REFER(kSoftmax) -USE_JITKERNEL_REFER(kEmbSeqPool) -USE_JITKERNEL_REFER(kAdam) -USE_JITKERNEL_REFER(kAdamW) -USE_JITKERNEL_REFER(kSgd) -USE_JITKERNEL_REFER(kVBroadcast) +use_jitkernel_refer(kVMul) +use_jitkernel_refer(kVAdd) +use_jitkernel_refer(kVAddRelu) +use_jitkernel_refer(kVSub) +use_jitkernel_refer(kVScal) +use_jitkernel_refer(kStrideScal) +use_jitkernel_refer(kVAddBias) +use_jitkernel_refer(kVCopy) +use_jitkernel_refer(kVRelu) +use_jitkernel_refer(kVIdentity) +use_jitkernel_refer(kVExp) +use_jitkernel_refer(kVSigmoid) +use_jitkernel_refer(kVTanh) +use_jitkernel_refer(kLSTMCtHt) +use_jitkernel_refer(kLSTMC1H1) +use_jitkernel_refer(kGRUH1) +use_jitkernel_refer(kGRUHtPart1) +use_jitkernel_refer(kGRUHtPart2) +use_jitkernel_refer(kCRFDecoding) +use_jitkernel_refer(kLayerNorm) +use_jitkernel_refer(kNCHW16CMulNC) +use_jitkernel_refer(kSeqPool) +use_jitkernel_refer(kMatMul) +use_jitkernel_refer(kVSquare) +use_jitkernel_refer(kHSum) +use_jitkernel_refer(kHMax) +use_jitkernel_refer(kStrideASum) +use_jitkernel_refer(kSoftmax) +use_jitkernel_refer(kEmbSeqPool) +use_jitkernel_refer(kAdam) +use_jitkernel_refer(kAdamW) +use_jitkernel_refer(kSgd) +use_jitkernel_refer(kVBroadcast) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 779d4c172b83c..9919f2d46dd8b 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/refer/refer.h" + #include "paddle/fluid/operators/jit/registry.h" namespace refer = paddle::operators::jit::refer; diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 79b2e174efc16..3f1e5b3235b25 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -481,10 +481,11 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, "The idx shoud be lower than the attribute table_height of " "EmbSeqPool. But %dth of idx is %d and table_height is %d.", i, idx[i], attr->table_height)); - PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument( - "The idx shoud be equal to or larger than " - "the 0. But %dth of idx is %d.", - i, idx[i])); + PADDLE_ENFORCE_GE(idx[i], 0, + platform::errors::InvalidArgument( + "The idx shoud be equal to or larger than " + "the 0. But %dth of idx is %d.", + i, idx[i])); }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -539,11 +540,12 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, "less than the attribute. But %dth of rows " "is %d and grad_width is %d.", i, h_idx, attr->param_height)); - PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( - "The rows of Sgd should be " - "larger than 0. But %dth of rows " - "is %d.", - i, h_idx)); + PADDLE_ENFORCE_GE( + h_idx, 0, + platform::errors::InvalidArgument("The rows of Sgd should be " + "larger than 0. But %dth of rows " + "is %d.", + i, h_idx)); for (int64_t j = 0; j < attr->grad_width; ++j) { out[h_idx * attr->grad_width + j] = param[h_idx * attr->grad_width + j] - diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h index 567a903236979..15d5e605b01bb 100644 --- a/paddle/fluid/operators/jit/registry.h +++ b/paddle/fluid/operators/jit/registry.h @@ -18,6 +18,7 @@ #include #include #include // for std::move + #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_pool.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 74f2d62c64da9..27e816248ab38 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -286,48 +286,48 @@ void TestKernelLSTM() { ref(&step, &attr); VLOG(10) << attr; - auto verifier = []( - const typename KernelTuple::func_type tgt, - const std::vector& xsrc, const std::vector& wp, - const std::vector& ct_1, const std::vector& ct_ref, - const std::vector& ht_ref, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ct_ref.size(), ht_ref.size()); - EXPECT_EQ(ct_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); - EXPECT_EQ(wp.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ct(ct_ref.size()), - ht(ht_ref.size()); - std::vector checked(2 * d); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - const T* ct_ref_data = ct_ref.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); - T* checked_data = checked.data(); - - jit::lstm_t step; - step.gates = x_data; - step.ct_1 = ct_1_data; - step.ct = ct_data; - step.ht = ht_data; - if (attr.use_peephole) { - step.wp = wp_data; - step.checked = checked_data; - } - - tgt(&step, &attr); - ExpectEQ(ct_data, ct_ref_data, d); - ExpectEQ(ht_data, ht_ref_data, d); - }; + auto verifier = + [](const typename KernelTuple::func_type tgt, + const std::vector& xsrc, const std::vector& wp, + const std::vector& ct_1, const std::vector& ct_ref, + const std::vector& ht_ref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ct_ref.size(), ht_ref.size()); + EXPECT_EQ(ct_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); + EXPECT_EQ(wp.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ct(ct_ref.size()), + ht(ht_ref.size()); + std::vector checked(2 * d); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + const T* ct_ref_data = ct_ref.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ct_data = ct.data(); + T* ht_data = ht.data(); + T* checked_data = checked.data(); + + jit::lstm_t step; + step.gates = x_data; + step.ct_1 = ct_1_data; + step.ct = ct_data; + step.ht = ht_data; + if (attr.use_peephole) { + step.wp = wp_data; + step.checked = checked_data; + } + + tgt(&step, &attr); + ExpectEQ(ct_data, ct_ref_data, d); + ExpectEQ(ht_data, ht_ref_data, d); + }; TestAllImpls(attr, verifier, xsrc, wp, ct_1, ct_ref, ht_ref, attr); } @@ -484,41 +484,42 @@ void TestKernelLayerNorm() { ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data, left, epsilon, right); - auto verifier = []( - const typename KernelTuple::func_type tgt, const std::vector& x_, - const std::vector& outref_, const std::vector& mean_, - const std::vector& var_, const std::vector& scale, - const std::vector& bias, const int& left, const float& epsilon, - const typename KernelTuple::attr_type& right) { - EXPECT_TRUE(tgt != nullptr); - std::vector outtgt(outref_.size()); - std::vector x(x_.size()); - std::vector mean(mean_.size()); - std::vector var(var_.size()); - std::vector outref(outref_.size()); - std::copy(x_.begin(), x_.end(), x.begin()); - std::copy(mean_.begin(), mean_.end(), mean.begin()); - std::copy(var_.begin(), var_.end(), var.begin()); - std::copy(outref_.begin(), outref_.end(), outref.begin()); - - EXPECT_EQ(x.size(), static_cast(left * right)); - EXPECT_EQ(outref.size(), static_cast(left * right)); - EXPECT_EQ(mean.size(), static_cast(left)); - EXPECT_EQ(var.size(), static_cast(left)); - EXPECT_EQ(scale.size(), static_cast(right)); - EXPECT_EQ(bias.size(), static_cast(right)); - - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* x_data = x.data(); - T* mean_data = mean.data(); - T* var_data = var.data(); - T* outref_data = outref.data(); - T* outtgt_data = outtgt.data(); - tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, - left, epsilon, right); - ExpectEQ(outtgt_data, outref_data, left * right); - }; + auto verifier = + [](const typename KernelTuple::func_type tgt, + const std::vector& x_, const std::vector& outref_, + const std::vector& mean_, const std::vector& var_, + const std::vector& scale, const std::vector& bias, + const int& left, const float& epsilon, + const typename KernelTuple::attr_type& right) { + EXPECT_TRUE(tgt != nullptr); + std::vector outtgt(outref_.size()); + std::vector x(x_.size()); + std::vector mean(mean_.size()); + std::vector var(var_.size()); + std::vector outref(outref_.size()); + std::copy(x_.begin(), x_.end(), x.begin()); + std::copy(mean_.begin(), mean_.end(), mean.begin()); + std::copy(var_.begin(), var_.end(), var.begin()); + std::copy(outref_.begin(), outref_.end(), outref.begin()); + + EXPECT_EQ(x.size(), static_cast(left * right)); + EXPECT_EQ(outref.size(), static_cast(left * right)); + EXPECT_EQ(mean.size(), static_cast(left)); + EXPECT_EQ(var.size(), static_cast(left)); + EXPECT_EQ(scale.size(), static_cast(right)); + EXPECT_EQ(bias.size(), static_cast(right)); + + const T* scale_data = scale.data(); + const T* bias_data = bias.data(); + T* x_data = x.data(); + T* mean_data = mean.data(); + T* var_data = var.data(); + T* outref_data = outref.data(); + T* outtgt_data = outtgt.data(); + tgt(x_data, outtgt_data, mean_data, var_data, scale_data, + bias_data, left, epsilon, right); + ExpectEQ(outtgt_data, outref_data, left * right); + }; TestAllImpls(right, verifier, x, outref, mean, var, scale, bias, left, epsilon, right); @@ -548,11 +549,12 @@ void TestKernelCRFDecoding() { ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(), trackref.data(), tag_num); - auto verifier = []( - const typename KernelTuple::func_type tgt, const int& seq_len, - const std::vector& x, const std::vector& w, - const std::vector& alpharef, const std::vector& trackref, - const typename KernelTuple::attr_type& tag_num) { + auto verifier = [](const typename KernelTuple::func_type tgt, + const int& seq_len, const std::vector& x, + const std::vector& w, + const std::vector& alpharef, + const std::vector& trackref, + const typename KernelTuple::attr_type& tag_num) { constexpr int state_trans_base_idx = 2; EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(x.size(), static_cast(seq_len * tag_num)); @@ -878,12 +880,13 @@ void TestKernelAdam() { mom2.data(), param.data(), mom1_out.data(), mom2_out.data(), param_out.data()); - auto verifier = []( - const typename KernelTuple::func_type tgt, T beta1, T beta2, T lr, T eps, - int64_t numel, const std::vector& grad, const std::vector& mom1, - const std::vector& mom2, const std::vector& param, - const std::vector& ref_mom1_out, const std::vector& ref_mom2_out, - const std::vector& ref_param_out) { + auto verifier = [](const typename KernelTuple::func_type tgt, T beta1, + T beta2, T lr, T eps, int64_t numel, + const std::vector& grad, const std::vector& mom1, + const std::vector& mom2, const std::vector& param, + const std::vector& ref_mom1_out, + const std::vector& ref_mom2_out, + const std::vector& ref_param_out) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(param.size(), static_cast(numel)); EXPECT_EQ(grad.size(), static_cast(numel)); @@ -944,30 +947,31 @@ void TestKernelAdamW() { grad.data(), mom1.data(), mom2.data(), param.data(), mom1_out.data(), mom2_out.data(), param_out.data()); - auto verifier = []( - const typename KernelTuple::func_type tgt, T beta1, T beta2, T lr, T eps, - T old_lr, T lr_ratio, T coeff, int64_t numel, const std::vector& grad, - const std::vector& mom1, const std::vector& mom2, - const std::vector& param, const std::vector& ref_mom1_out, - const std::vector& ref_mom2_out, const std::vector& ref_param_out) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(param.size(), static_cast(numel)); - EXPECT_EQ(grad.size(), static_cast(numel)); - EXPECT_EQ(mom1.size(), static_cast(numel)); - EXPECT_EQ(mom2.size(), static_cast(numel)); - - std::vector jit_mom1_out(ref_mom1_out.size()); - std::vector jit_mom2_out(ref_mom2_out.size()); - std::vector jit_param_out(ref_param_out.size()); - - tgt(beta1, beta2, -lr, eps, old_lr, lr_ratio, coeff, numel, grad.data(), - mom1.data(), mom2.data(), param.data(), jit_mom1_out.data(), - jit_mom2_out.data(), jit_param_out.data()); - - ExpectEQ(ref_mom1_out.data(), jit_mom1_out.data(), numel); - ExpectEQ(ref_mom2_out.data(), jit_mom2_out.data(), numel); - ExpectEQ(ref_param_out.data(), jit_param_out.data(), numel); - }; + auto verifier = + [](const typename KernelTuple::func_type tgt, T beta1, T beta2, T lr, + T eps, T old_lr, T lr_ratio, T coeff, int64_t numel, + const std::vector& grad, const std::vector& mom1, + const std::vector& mom2, const std::vector& param, + const std::vector& ref_mom1_out, const std::vector& ref_mom2_out, + const std::vector& ref_param_out) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(param.size(), static_cast(numel)); + EXPECT_EQ(grad.size(), static_cast(numel)); + EXPECT_EQ(mom1.size(), static_cast(numel)); + EXPECT_EQ(mom2.size(), static_cast(numel)); + + std::vector jit_mom1_out(ref_mom1_out.size()); + std::vector jit_mom2_out(ref_mom2_out.size()); + std::vector jit_param_out(ref_param_out.size()); + + tgt(beta1, beta2, -lr, eps, old_lr, lr_ratio, coeff, numel, grad.data(), + mom1.data(), mom2.data(), param.data(), jit_mom1_out.data(), + jit_mom2_out.data(), jit_param_out.data()); + + ExpectEQ(ref_mom1_out.data(), jit_mom1_out.data(), numel); + ExpectEQ(ref_mom2_out.data(), jit_mom2_out.data(), numel); + ExpectEQ(ref_param_out.data(), jit_param_out.data(), numel); + }; TestAllImpls( 1, verifier, beta1, beta2, learning_rate, eps, old_lr, lr_ratio, coeff, @@ -988,8 +992,9 @@ void TestKernelSgd() { "and n-1 is %d.", static_cast(upper - lower), n - 1)); PADDLE_ENFORCE_GT( - n, 0, paddle::platform::errors::InvalidArgument( - "The Sgd size should be larger than 0. But the n is %d.", n)); + n, 0, + paddle::platform::errors::InvalidArgument( + "The Sgd size should be larger than 0. But the n is %d.", n)); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); @@ -1031,11 +1036,12 @@ void TestKernelSgd() { grad_w); } - auto verifier = []( - const typename KernelTuple::func_type tgt, const T lr, - const std::vector& param, const std::vector& grad, - const std::vector& rows, const std::vector& oref, - const typename KernelTuple::attr_type& attr) { + auto verifier = [](const typename KernelTuple::func_type tgt, + const T lr, const std::vector& param, + const std::vector& grad, + const std::vector& rows, + const std::vector& oref, + const typename KernelTuple::attr_type& attr) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(param.size(), static_cast(attr.param_height * attr.param_width)); diff --git a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h index 169befc88f28d..82de4c82d1121 100644 --- a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h @@ -19,4 +19,4 @@ namespace paddle { namespace operators { namespace kernel_primitives = phi::kps; } -} +} // namespace paddle diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc index 67c1942ea0b41..8597c21b3ec97 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -11,6 +11,7 @@ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/binary.h" diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc index eac181489aa9d..41499f3f7bf8b 100644 --- a/paddle/fluid/operators/kldiv_loss_op_npu.cc +++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the Licnse. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/kthvalue_op.cc b/paddle/fluid/operators/kthvalue_op.cc index 4c679d3026386..1ff9ab796e9d9 100644 --- a/paddle/fluid/operators/kthvalue_op.cc +++ b/paddle/fluid/operators/kthvalue_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index ddd0554add510..7a6a28a33c13c 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/l1_norm_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index 7e07610db2875..e14e61006478e 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 5b5ddddaafb24..ac20a5962f394 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -36,8 +36,6 @@ using CudnnDataType = platform::CudnnDataType; template using LayerNormParamType = typename CudnnDataType::BatchNormParamType; -#define LN_NUM_COLS 1024 - inline static int GetDesiredBlockDim(int64_t block_dim) { #ifdef __HIPCC__ const int kMaxBlockDim = 256; @@ -183,11 +181,12 @@ template -__global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( +__global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel( int rows, int cols, const float epsilon, const T *__restrict__ x_ptr, const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr, U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, T *__restrict__ y_ptr) { + __shared__ U smem[WARPS_M * WARPS_N]; using Vec = phi::AlignedVector; using Vec_scale = phi::AlignedVector; @@ -210,12 +209,12 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( col += THREADS_PER_ROW; } - constexpr U rn = 1.f / U(LN_NUM_COLS); + constexpr U rn = 1.f / U(ELTS_PER_ROW); for (int row = r; row < rows; row += gridDim.x * ROWS_PER_CTA) { Vec x[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); + phi::Load(x_ptr + row * ELTS_PER_ROW + col * VecSize, &x[it]); col += THREADS_PER_ROW; } U xf[LDGS * VecSize]; @@ -235,6 +234,23 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( for (int it = 1; it < THREADS_PER_WARP; it *= 2) { mu_local += __shfl_xor_sync(uint32_t(-1), mu_local, it); } + if (WARPS_N > 1) { + if (lane == 0) { + smem[warp_m * WARPS_N + warp_n] = mu_local; + } + __syncthreads(); + if (tidx == 0) { + mu_local = 0.f; +#pragma unroll + for (int it = 0; it < WARPS_N; ++it) { + mu_local += smem[warp_m * WARPS_N + it]; + } + smem[warp_m] = mu_local; + } + __syncthreads(); + mu_local = smem[warp_m]; + } + mu_local *= rn; if (lane == 0) { mean_out_ptr[row] = mu_local; @@ -254,6 +270,24 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( for (int it = 1; it < THREADS_PER_WARP; it *= 2) { var_local += __shfl_xor_sync(uint32_t(-1), var_local, it); } + + if (WARPS_N > 1) { + if (lane == 0) { + smem[warp_m * WARPS_N + warp_n] = var_local; + } + __syncthreads(); + if (tidx == 0) { + var_local = 0.f; +#pragma unroll + for (int it = 0; it < WARPS_N; ++it) { + var_local += smem[warp_m * WARPS_N + it]; + } + smem[warp_m] = var_local; + } + __syncthreads(); + var_local = smem[warp_m]; + } + // Note: to assure if it is right for double U rsigma = rsqrtf(var_local * rn + epsilon); if (lane == 0) { @@ -277,7 +311,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - phi::Store(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], y_ptr + row * ELTS_PER_ROW + col * VecSize); col += THREADS_PER_ROW; } } @@ -416,10 +450,10 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( const int r = bidx * ROWS_PER_CTA + warp_m; const int c = warp_n * THREADS_PER_WARP + lane; - static_assert(LN_NUM_COLS == THREADS_PER_ROW * LDGS * VecSize, ""); + static_assert(ELTS_PER_ROW == THREADS_PER_ROW * LDGS * VecSize, ""); // smem for column reduction - __shared__ U smem_[ROWS_PER_CTA * LN_NUM_COLS]; + __shared__ U smem_[ROWS_PER_CTA * ELTS_PER_ROW]; U dgamma_sum[LDGS * VecSize]; U dbeta_sum[LDGS * VecSize]; @@ -434,7 +468,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( U *sum_loss2_shared = &smem_sum_loss2[warp_m * WARPS_N]; // step-1: compute dx and local results of dscale and dbias - constexpr float rn = 1.f / static_cast(LN_NUM_COLS); + constexpr float rn = 1.f / static_cast(ELTS_PER_ROW); Vec_scale gamma[LDGS]; int col = c; #pragma unroll @@ -452,12 +486,12 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( int col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - phi::Load(dout_ptr + row * LN_NUM_COLS + col * VecSize, + phi::Load(dout_ptr + row * ELTS_PER_ROW + col * VecSize, &dout[it]); - phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); + phi::Load(x_ptr + row * ELTS_PER_ROW + col * VecSize, &x[it]); if (isFusedDropoutResidualLn) { phi::Load( - mask_ptr + row * LN_NUM_COLS + col * VecSize, &mask_vec[it]); + mask_ptr + row * ELTS_PER_ROW + col * VecSize, &mask_vec[it]); } col += THREADS_PER_ROW; @@ -551,10 +585,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - phi::Store(x[it], dx_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], + dx_ptr + row * ELTS_PER_ROW + col * VecSize); if (isFusedDropoutResidualLn) { phi::Store( - dout[it], d_dropout_src_ptr + row * LN_NUM_COLS + col * VecSize); + dout[it], d_dropout_src_ptr + row * ELTS_PER_ROW + col * VecSize); } col += THREADS_PER_ROW; } @@ -562,12 +597,12 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( // step-2: column reduction of dscale and dbias for each thread block. // each block's sum: [4 * 1024] -> [1 * 1024] - enum { NUM_RES = LN_NUM_COLS / THREADS_PER_CTA }; // 1024/128 = 8 - static_assert(NUM_RES * THREADS_PER_CTA == LN_NUM_COLS, ""); + enum { NUM_RES = ELTS_PER_ROW / THREADS_PER_CTA }; // 1024/128 = 8 + static_assert(NUM_RES * THREADS_PER_CTA == ELTS_PER_ROW, ""); U *smem_write; - smem_write = &smem_[warp_m * LN_NUM_COLS + tid_r * VecSize]; // [4 * 1024] + smem_write = &smem_[warp_m * ELTS_PER_ROW + tid_r * VecSize]; // [4 * 1024] #pragma unroll for (int it = 0; it < LDGS; it++) { #pragma unroll @@ -583,12 +618,12 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( for (int it = 0; it < ROWS_PER_CTA; it++) { for (int jt = 0; jt < NUM_RES; jt++) { cta_dbeta_sum[jt] += - smem_[it * LN_NUM_COLS + tidx + jt * THREADS_PER_CTA]; + smem_[it * ELTS_PER_ROW + tidx + jt * THREADS_PER_CTA]; } } __syncthreads(); - smem_write = &smem_[warp_m * LN_NUM_COLS + tid_r * VecSize]; + smem_write = &smem_[warp_m * ELTS_PER_ROW + tid_r * VecSize]; #pragma unroll for (int it = 0; it < LDGS; it++) { #pragma unroll @@ -603,19 +638,19 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( for (int it = 0; it < ROWS_PER_CTA; it++) { for (int jt = 0; jt < NUM_RES; jt++) { cta_dgamma_sum[jt] += - smem_[it * LN_NUM_COLS + tidx + jt * THREADS_PER_CTA]; + smem_[it * ELTS_PER_ROW + tidx + jt * THREADS_PER_CTA]; } } // the shape of results:(#blocks, 1024) U *dgamma_part = - static_cast(dgamma_temp_ptr) + bidx * LN_NUM_COLS + tidx; + static_cast(dgamma_temp_ptr) + bidx * ELTS_PER_ROW + tidx; for (int jt = 0; jt < NUM_RES; jt++) { *dgamma_part = cta_dgamma_sum[jt]; dgamma_part += THREADS_PER_CTA; } - U *dbeta_part = static_cast(dbeta_temp_ptr) + bidx * LN_NUM_COLS + tidx; + U *dbeta_part = static_cast(dbeta_temp_ptr) + bidx * ELTS_PER_ROW + tidx; for (int jt = 0; jt < NUM_RES; jt++) { *dbeta_part = cta_dbeta_sum[jt]; dbeta_part += THREADS_PER_CTA; @@ -626,7 +661,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( * output is [1, 1024]. * #blocks: 32 * #threads: 512 -*/ + */ // todo(@limin29): to think if there are better impl strategies template < typename U, typename ScaleT = U, int VecSize = 1, int WARPS_M = 16, @@ -640,7 +675,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( const int rows, U *__restrict__ dg_part_, U *__restrict__ db_part_, ScaleT *__restrict__ dg_, ScaleT *__restrict__ db_) { using Vec = phi::AlignedVector; - static_assert(VEC_COLS == LN_NUM_COLS / VecSize, ""); + static_assert(VEC_COLS == ELTS_PER_ROW / VecSize, ""); const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -656,8 +691,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( __shared__ U smem_space[(WARPS_M - 1) * THREADS_PER_ROW * VecSize]; for (int col = c; col < VEC_COLS; col += gridDim.x * THREADS_PER_ROW) { - const U *dg_part_ptr = (dg_part_) + r * LN_NUM_COLS + col * VecSize; - const U *db_part_ptr = (db_part_) + r * LN_NUM_COLS + col * VecSize; + const U *dg_part_ptr = (dg_part_) + r * ELTS_PER_ROW + col * VecSize; + const U *db_part_ptr = (db_part_) + r * ELTS_PER_ROW + col * VecSize; U dg_sum[VecSize]; U db_sum[VecSize]; @@ -669,8 +704,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( Vec db; phi::Load(dg_part_ptr, &dg); phi::Load(db_part_ptr, &db); - dg_part_ptr += ROWS_PER_CTA * LN_NUM_COLS; - db_part_ptr += ROWS_PER_CTA * LN_NUM_COLS; + dg_part_ptr += ROWS_PER_CTA * ELTS_PER_ROW; + db_part_ptr += ROWS_PER_CTA * ELTS_PER_ROW; #pragma unroll for (int jt = 0; jt < VecSize; jt++) { @@ -748,16 +783,16 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( } /* This function support two kinds of computations (only for float and fp16 -* type): -* -* Case-1: compute layer_norm_grad for layernorm op by setting mask_ptr and -* d_dropout_src_ptr to nullptr. Here, d_x_ptr returns the grad of layernorm -* input. -* -* Case-2: compute layer_norm_grad + residual_grad + dropout_grad for -* fused_dropout_residual_layernorm op. Here, dx_ptr returns residual_grad. -* -*/ + * type): + * + * Case-1: compute layer_norm_grad for layernorm op by setting mask_ptr and + * d_dropout_src_ptr to nullptr. Here, d_x_ptr returns the grad of layernorm + * input. + * + * Case-2: compute layer_norm_grad + residual_grad + dropout_grad for + * fused_dropout_residual_layernorm op. Here, dx_ptr returns residual_grad. + * + */ template void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, @@ -804,19 +839,19 @@ void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, "To compute fused_dropout_residual_ln grad, d_dropout_src_ptr " "can't be null")); } - fused_ln_bwd_1024_kernel< - true, T, U, ScaleT, MaskType, VecSize, WARPS_M, WARPS_N, - BYTES_PER_LDG><<>>( - rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr, - dscale_temp_ptr, dbias_temp_ptr, dx_ptr, mask_ptr, factor, - d_dropout_src_ptr); + fused_ln_bwd_1024_kernel + <<>>( + rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr, + dscale_temp_ptr, dbias_temp_ptr, dx_ptr, mask_ptr, factor, + d_dropout_src_ptr); } else { - fused_ln_bwd_1024_kernel< - false, T, U, ScaleT, MaskType, VecSize, WARPS_M, WARPS_N, - BYTES_PER_LDG><<>>( - rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr, - dscale_temp_ptr, dbias_temp_ptr, dx_ptr); + fused_ln_bwd_1024_kernel + <<>>( + rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr, + dscale_temp_ptr, dbias_temp_ptr, dx_ptr); } const int WARPS_M_2 = 16; const int WARPS_N_2 = 1; @@ -838,10 +873,10 @@ void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, PADDLE_THROW(platform::errors::InvalidArgument( "Only support float and fp16 type")); } else { - ln_bwd_1024_final_kernel< - U, ScaleT, VecSize_2, WARPS_M_2, WARPS_N_2, - BYTES_PER_LDG_2><<>>( - gridx, dscale_temp_ptr, dbias_temp_ptr, dscale_ptr, dbias_ptr); + ln_bwd_1024_final_kernel + <<>>( + gridx, dscale_temp_ptr, dbias_temp_ptr, dscale_ptr, dbias_ptr); } } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -1352,16 +1387,17 @@ static void LayerNormBackward( if (gradient_flag == 0) return; if (batch_size == 1) { - LayerNormBackwardWhenBatchSizeIsOne<<< - (feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, 0, - stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, epsilon, - feature_size); + LayerNormBackwardWhenBatchSizeIsOne + <<<(feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, 0, + stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, epsilon, + feature_size); if (d_x != nullptr) { switch (GetDesiredBlockDim(feature_size)) { - FIXED_BLOCK_DIM_CASE(LayerNormBackwardPostProcessToCalculateDX< - T, U, kBlockDim><<<1, kBlockDim, 0, stream>>>( - x, d_x, mean, var, epsilon, feature_size)); + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardPostProcessToCalculateDX + <<<1, kBlockDim, 0, stream>>>(x, d_x, mean, var, epsilon, + feature_size)); } } return; @@ -1373,9 +1409,9 @@ static void LayerNormBackward( switch (block_dim) { FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, - LayerNormBackwardGradientScaleOrBias< - T, U, kBlockDim, false, false, - ScaleBiasWithSameTypeX><<>>( + LayerNormBackwardGradientScaleOrBias + <<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } @@ -1384,9 +1420,9 @@ static void LayerNormBackward( switch (block_dim) { FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, - LayerNormBackwardGradientScaleOrBias< - T, U, kBlockDim, false, true, - ScaleBiasWithSameTypeX><<>>( + LayerNormBackwardGradientScaleOrBias + <<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } @@ -1395,9 +1431,9 @@ static void LayerNormBackward( switch (block_dim) { FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, - LayerNormBackwardGradientAll< - T, U, kBlockDim, false, - ScaleBiasWithSameTypeX><<>>( + LayerNormBackwardGradientAll + <<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } @@ -1405,9 +1441,9 @@ static void LayerNormBackward( case 4: // d_x != nullptr, d_scale == nullptr, d_bias == nullptr switch (GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE( - LayerNormBackwardGradientOnlyDX< - T, U, kBlockDim, - ScaleBiasWithSameTypeX><<>>( + LayerNormBackwardGradientOnlyDX + <<>>( x, d_y, d_x, mean, var, scale, epsilon, feature_size)); } break; @@ -1415,34 +1451,34 @@ static void LayerNormBackward( switch (block_dim) { FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, - LayerNormBackwardGradientScaleOrBias< - T, U, kBlockDim, true, false, - ScaleBiasWithSameTypeX><<>>( + LayerNormBackwardGradientScaleOrBias + <<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } switch (GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE( - LayerNormBackwardPostProcessToCalculateDX< - T, U, kBlockDim><<>>( - x, d_x, mean, var, epsilon, feature_size)); + LayerNormBackwardPostProcessToCalculateDX + <<>>(x, d_x, mean, var, epsilon, + feature_size)); } break; case 6: // d_x != nullptr, d_scale != nullptr, d_bias == nullptr switch (block_dim) { FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, - LayerNormBackwardGradientScaleOrBias< - T, U, kBlockDim, true, true, - ScaleBiasWithSameTypeX><<>>( + LayerNormBackwardGradientScaleOrBias + <<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } switch (GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE( - LayerNormBackwardPostProcessToCalculateDX< - T, U, kBlockDim><<>>( - x, d_x, mean, var, epsilon, feature_size)); + LayerNormBackwardPostProcessToCalculateDX + <<>>(x, d_x, mean, var, epsilon, + feature_size)); } break; case 7: // d_x != nullptr, d_scale != nullptr, d_bias != nullptr @@ -1476,29 +1512,30 @@ static void LayerNormBackward( U *part_grad_gamma = reinterpret_cast(part_grad_gamma_ptr->ptr()); U *part_grad_beta = reinterpret_cast(part_grad_beta_ptr->ptr()); - LayerNormBackwardPartGradGammaBeta< - T, U, BDIMX2, BDIMY2, VPT><<>>( - d_y, x, batch_size, feature_size, mean, var, epsilon, - part_grad_gamma, - part_grad_beta); // compute part_grad_gamma, beta + LayerNormBackwardPartGradGammaBeta + <<>>( + d_y, x, batch_size, feature_size, mean, var, epsilon, + part_grad_gamma, + part_grad_beta); // compute part_grad_gamma, beta constexpr int BDIMX3 = 32; constexpr int BDIMY3 = 8; dim3 threads3(BDIMX3, BDIMY3, 1); const dim3 blocks3((feature_size + BDIMX2 - 1) / BDIMX2, 1, 1); - LayerNormBackwardSumGradGammaBeta< - T, U, BDIMX3, BDIMY3, - ScaleBiasWithSameTypeX><<>>( - part_grad_gamma, part_grad_beta, part_size, batch_size, - feature_size, d_scale, d_bias); + LayerNormBackwardSumGradGammaBeta + <<>>(part_grad_gamma, part_grad_beta, + part_size, batch_size, + feature_size, d_scale, d_bias); constexpr int BDIMX1 = 32; constexpr int BDIMY1 = 4; dim3 threads1(BDIMX1, BDIMY1, 1); - LayerNormBackwardComputeGradInput< - T, U, BDIMX1, BDIMY1, - ScaleBiasWithSameTypeX><<>>( - d_y, x, batch_size, feature_size, mean, var, epsilon, scale, d_x); + LayerNormBackwardComputeGradInput + <<>>(d_y, x, batch_size, + feature_size, mean, var, + epsilon, scale, d_x); #ifdef PADDLE_WITH_CUDA } #endif diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index 224ab748dab6c..3d1e563ef1aca 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/operators/layer_norm_op_xpu.cc b/paddle/fluid/operators/layer_norm_op_xpu.cc index 3b21a55f8df0d..a27952c57f7fa 100644 --- a/paddle/fluid/operators/layer_norm_op_xpu.cc +++ b/paddle/fluid/operators/layer_norm_op_xpu.cc @@ -88,8 +88,9 @@ class LayerNormGradXPUKernel : public framework::OpKernel { auto* dscale_data = (dscale == nullptr ? nullptr : dscale->mutable_data(ctx.GetPlace())); - auto* dbias_data = (dbias == nullptr ? nullptr : dbias->mutable_data( - ctx.GetPlace())); + auto* dbias_data = + (dbias == nullptr ? nullptr + : dbias->mutable_data(ctx.GetPlace())); auto* dx_data = (dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace())); auto& dev_ctx = ctx.template device_context(); diff --git a/paddle/fluid/operators/layout_utils.h b/paddle/fluid/operators/layout_utils.h index e304f33d0455a..f058afdb4adc3 100644 --- a/paddle/fluid/operators/layout_utils.h +++ b/paddle/fluid/operators/layout_utils.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h index 7308363b9fe0d..a6ef87d43e2d4 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -28,9 +28,10 @@ static inline T NormalizeL1(T* x, size_t len) { // Right now, we just bet that sum won't be zero. If this really happens, we // will figure out what should be done then. PADDLE_ENFORCE_GT( - sum, 0., platform::errors::InvalidArgument( - "The unnormalized probabilities of all possible unfinished " - "sequences must be greater than 0.")); + sum, 0., + platform::errors::InvalidArgument( + "The unnormalized probabilities of all possible unfinished " + "sequences must be greater than 0.")); T s = 1. / sum; for (size_t i = 0; i < len; ++i) x[i] *= s; return sum; @@ -44,8 +45,8 @@ struct ScalarMul { T scalar; }; -using framework::LoDTensor; using framework::LoD; +using framework::LoDTensor; using framework::Tensor; template diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index e9375be1706eb..5e451d99dbc85 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -77,10 +77,9 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, LinspaceInferShapeFunctor); -REGISTER_OP_VERSION(linspace) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(linspace).AddCheckpoint( + R"ROC( Upgrade linspace to add a new attribute [dtype]. )ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "dtype", "In order to change output data type ", 5)); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "dtype", "In order to change output data type ", 5)); diff --git a/paddle/fluid/operators/lite/CMakeLists.txt b/paddle/fluid/operators/lite/CMakeLists.txt index 5bb7892590848..3955c6e322b0e 100644 --- a/paddle/fluid/operators/lite/CMakeLists.txt +++ b/paddle/fluid/operators/lite/CMakeLists.txt @@ -1,2 +1,5 @@ op_library(lite_engine_op DEPS lite_engine lite_tensor_utils) -cc_test(test_lite_engine_op SRCS lite_engine_op_test.cc DEPS lite_engine_op analysis) +cc_test( + test_lite_engine_op + SRCS lite_engine_op_test.cc + DEPS lite_engine_op analysis) diff --git a/paddle/fluid/operators/lite/lite_engine_op.cc b/paddle/fluid/operators/lite/lite_engine_op.cc index 7a879c1e21642..0ec1c55f7abee 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.cc +++ b/paddle/fluid/operators/lite/lite_engine_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lite/lite_engine_op.h" + #include #include diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h index 5d2a1683d381b..240f6b06325f4 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ b/paddle/fluid/operators/lite/lite_engine_op.h @@ -26,11 +26,10 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" - #include "paddle/fluid/inference/lite/engine.h" #include "paddle/fluid/inference/lite/tensor_utils.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index 01583cea31222..c38386365f3dc 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -12,6 +12,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "paddle/fluid/operators/lite/lite_engine_op.h" + #include #include "paddle/fluid/framework/block_desc.h" @@ -19,13 +21,12 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/operators/lite/lite_engine_op.h" #include "paddle/fluid/operators/lite/ut_helper.h" USE_NO_KERNEL_OP(lite_engine) -using paddle::inference::lite::AddTensorToBlockDesc; using paddle::inference::lite::AddFetchListToBlockDesc; +using paddle::inference::lite::AddTensorToBlockDesc; using paddle::inference::lite::CreateTensor; using paddle::inference::lite::serialize_params; namespace paddle { diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index 374bfa73f2187..94797b08ade80 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/load_combine_op.h" + #include #include -#include "paddle/fluid/operators/load_combine_op.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index ba19aee9b8d76..196792707ebbd 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "paddle/fluid/operators/load_op.h" +#include + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index 5616309683365..616aad2b97691 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lod_reset_op.h" + #include #include diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h index 642c8bcd9ae49..f6f7155f37c3a 100644 --- a/paddle/fluid/operators/lod_reset_op.h +++ b/paddle/fluid/operators/lod_reset_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 883e3597d8a31..11edbc84a19d9 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc index f103a69707a21..99ccad1ca76a5 100644 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ b/paddle/fluid/operators/log_loss_op_npu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index fee1f56ebdcf2..1ba0a0f3b3d7e 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -11,6 +11,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc index da38f906b9bd3..95ebeedaf797e 100644 --- a/paddle/fluid/operators/log_softmax_op.cc +++ b/paddle/fluid/operators/log_softmax_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 29079b8b1385d..c519e0845f750 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -113,26 +113,22 @@ class LookupTableCUDAKernel : public framework::OpKernel { dim3 grids(8, 1); #ifdef PADDLE_WITH_HIP if (padding_idx == -1) - LookupTable< - T, 64, 4, 8, - false><<>>( - output, table, ids, N, K, D, padding_idx); + LookupTable + <<>>( + output, table, ids, N, K, D, padding_idx); else - LookupTable< - T, 64, 4, 8, - true><<>>( - output, table, ids, N, K, D, padding_idx); + LookupTable + <<>>( + output, table, ids, N, K, D, padding_idx); #else if (padding_idx == -1) - LookupTable< - T, 128, 8, 8, - false><<>>( - output, table, ids, N, K, D, padding_idx); + LookupTable + <<>>( + output, table, ids, N, K, D, padding_idx); else - LookupTable< - T, 128, 8, 8, - true><<>>( - output, table, ids, N, K, D, padding_idx); + LookupTable + <<>>( + output, table, ids, N, K, D, padding_idx); #endif // PADDLE_WITH_HIP } }; diff --git a/paddle/fluid/operators/lookup_table_v2_op.cc b/paddle/fluid/operators/lookup_table_v2_op.cc index 48ae080783d11..65aeca1e49928 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cc +++ b/paddle/fluid/operators/lookup_table_v2_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/lookup_table_v2_op.h" #include + #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/var_type_inference.h" diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index c2df6dff5b53c..c47ea64e24c42 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/lookup_table_v2_op_xpu.cc b/paddle/fluid/operators/lookup_table_v2_op_xpu.cc index 521d3ab571efd..223bf2cc8678b 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_xpu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_xpu.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/lookup_table_v2_op.h" #include + #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/fluid/operators/lookup_table_v2_op.h" #include "paddle/fluid/platform/device/device_wrapper.h" #ifdef PADDLE_WITH_XPU namespace paddle { diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 88d70d9bb7dae..17c5f08c66c94 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lrn_op.h" + #include #include #include + #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_MKLDNN @@ -174,20 +176,23 @@ class LRNOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("MidOut"), "Output", "MidOut", "LRN"); auto x_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(x_dim.size(), 4, platform::errors::InvalidArgument( - "Input(input) rank should be 4, " - "but received input rank (%d) != 4", - x_dim.size())); + PADDLE_ENFORCE_EQ( + x_dim.size(), 4, + platform::errors::InvalidArgument("Input(input) rank should be 4, " + "but received input rank (%d) != 4", + x_dim.size())); int n = ctx->Attrs().Get("n"); - PADDLE_ENFORCE_GT(n, 0UL, platform::errors::InvalidArgument( - "Argument(n) should be positive, " - "but received n(%d) not greater than 0", - n)); - PADDLE_ENFORCE_EQ(n % 2, 1UL, platform::errors::InvalidArgument( - "Argument(n) should be odd value, " - "but received n(%d) is not an odd value", - n)); + PADDLE_ENFORCE_GT(n, 0UL, + platform::errors::InvalidArgument( + "Argument(n) should be positive, " + "but received n(%d) not greater than 0", + n)); + PADDLE_ENFORCE_EQ(n % 2, 1UL, + platform::errors::InvalidArgument( + "Argument(n) should be odd value, " + "but received n(%d) is not an odd value", + n)); ctx->SetOutputDim("Out", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h index f2d72d0740573..671055caa16f1 100644 --- a/paddle/fluid/operators/lrn_op.h +++ b/paddle/fluid/operators/lrn_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -68,18 +69,21 @@ class LRNKernel : public framework::OpKernel { T beta = ctx.Attr("beta"); T k = ctx.Attr("k"); - PADDLE_ENFORCE_GE(alpha, 0UL, platform::errors::InvalidArgument( - "Argument(alpha) should >= 0.0, " - "but received alpha(%d) less than 0", - alpha)); - PADDLE_ENFORCE_GE(beta, 0UL, platform::errors::InvalidArgument( - "Argument(beta) should >= 0.0, " - "but received beta(%d) less than 0", - beta)); - PADDLE_ENFORCE_GE(k, 0UL, platform::errors::InvalidArgument( - "Argument(k) should >= 0.0, " - "but received k(%d) less than 0", - k)); + PADDLE_ENFORCE_GE( + alpha, 0UL, + platform::errors::InvalidArgument("Argument(alpha) should >= 0.0, " + "but received alpha(%d) less than 0", + alpha)); + PADDLE_ENFORCE_GE( + beta, 0UL, + platform::errors::InvalidArgument("Argument(beta) should >= 0.0, " + "but received beta(%d) less than 0", + beta)); + PADDLE_ENFORCE_GE( + k, 0UL, + platform::errors::InvalidArgument("Argument(k) should >= 0.0, " + "but received k(%d) less than 0", + k)); LRNFunctor f; f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta, data_layout); diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 2ec9de3e3bbfc..21a0fce289348 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lstm_op.h" + #include #include diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index 4ec3072a96d44..1e1aaf3ea5328 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" @@ -272,9 +273,10 @@ class LSTMGradKernel : public framework::OpKernel { phi::funcs::LoDTensor2BatchFunctor to_batch; - auto ToBatch = [&batch_gate, &to_batch]( - const DeviceContext& ctx, const framework::LoDTensor& src, - const framework::DDim& dims, framework::LoDTensor& dst) { + auto ToBatch = [&batch_gate, &to_batch](const DeviceContext& ctx, + const framework::LoDTensor& src, + const framework::DDim& dims, + framework::LoDTensor& dst) { dst.mutable_data(dims, ctx.GetPlace()); dst.set_lod(batch_gate->lod()); to_batch(ctx, src, &dst, false); diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc index 917482589fcf3..235a4bd689b23 100644 --- a/paddle/fluid/operators/lstm_unit_op.cc +++ b/paddle/fluid/operators/lstm_unit_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lstm_unit_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc index 562f7755591fd..7ecf294433ead 100644 --- a/paddle/fluid/operators/lstmp_op.cc +++ b/paddle/fluid/operators/lstmp_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lstmp_op.h" + #include #include diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index 5d24c0b70d347..5e68259852c28 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" @@ -371,9 +372,10 @@ class LSTMPGradKernel : public framework::OpKernel { phi::funcs::LoDTensor2BatchFunctor to_batch; - auto ToBatch = [&batch_gate, &to_batch]( - const DeviceContext& ctx, const framework::LoDTensor& src, - const framework::DDim& dims, framework::LoDTensor& dst) { + auto ToBatch = [&batch_gate, &to_batch](const DeviceContext& ctx, + const framework::LoDTensor& src, + const framework::DDim& dims, + framework::LoDTensor& dst) { dst.mutable_data(dims, ctx.GetPlace()); dst.set_lod(batch_gate->lod()); to_batch(ctx, src, &dst, false); diff --git a/paddle/fluid/operators/lstsq_op.cc b/paddle/fluid/operators/lstsq_op.cc index f060125620f5a..e093e4d8c01a6 100644 --- a/paddle/fluid/operators/lstsq_op.cc +++ b/paddle/fluid/operators/lstsq_op.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/operators/lstsq_op.h" + #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -139,4 +141,4 @@ REGISTER_OPERATOR(lstsq, ops::LstsqOp, ops::LstsqOpMaker) REGISTER_OP_CPU_KERNEL( lstsq, ops::LstsqCPUKernel, - ops::LstsqCPUKernel); \ No newline at end of file + ops::LstsqCPUKernel); diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index 10e2867bf2953..53c78fef7b5d4 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -17,6 +17,7 @@ #include #include + #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/lstsq_op.h" #include "paddle/fluid/operators/qr_op.h" diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index 520722dafcbea..7955b3b7df9a3 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -15,8 +15,10 @@ #pragma once #include + #include #include + #include "paddle/fluid/operators/eig_op.h" #include "paddle/fluid/operators/math/eigen_values_vectors.h" #include "paddle/fluid/operators/math/matrix_solve.h" diff --git a/paddle/fluid/operators/lu_op.cc b/paddle/fluid/operators/lu_op.cc index fc8673181c467..0894323015e68 100644 --- a/paddle/fluid/operators/lu_op.cc +++ b/paddle/fluid/operators/lu_op.cc @@ -45,8 +45,9 @@ class LUOp : public framework::OperatorWithKernel { bool pivots = context->Attrs().Get("pivots"); auto x_dims = context->GetInputDim("X"); int x_rank = x_dims.size(); - PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument( - "the rank of input must greater than 2")); + PADDLE_ENFORCE_GE(x_rank, 2, + platform::errors::InvalidArgument( + "the rank of input must greater than 2")); context->SetOutputDim("Out", x_dims); int m = x_dims[x_rank - 1]; int n = x_dims[x_rank - 2]; diff --git a/paddle/fluid/operators/lu_unpack_op.cc b/paddle/fluid/operators/lu_unpack_op.cc index e38a4703f64ee..e3b4263b4ff68 100644 --- a/paddle/fluid/operators/lu_unpack_op.cc +++ b/paddle/fluid/operators/lu_unpack_op.cc @@ -53,8 +53,9 @@ class LU_UnpackOp : public framework::OperatorWithKernel { auto x_dims = context->GetInputDim("X"); int x_rank = x_dims.size(); - PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument( - "the rank of input must greater than 2")); + PADDLE_ENFORCE_GE(x_rank, 2, + platform::errors::InvalidArgument( + "the rank of input must greater than 2")); // context->SetOutputDim("Out", x_dims); int m = x_dims[x_rank - 1]; diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index a2e34d98461e0..1cef3705973e7 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -20,16 +20,19 @@ namespace cub = hipcub; #endif #include + #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/margin_cross_entropy_op.h" #include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/api/include/tensor.h" #include "paddle/phi/kernels/funcs/axis_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -63,19 +66,34 @@ void GetClassInterval(const gpuStream_t& stream, const platform::Place& place, framework::TensorFromVector(shard_dim_vec, ctx, &num_classes_per_device); int* num_classes_per_device_ptr = num_classes_per_device.data(); - const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); - // use global calculate stream - const auto calcu_stream = - static_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - num_classes_per_device_ptr, num_classes_per_device_ptr, - num_classes_per_device.numel(), - platform::ToNCCLDataType( - framework::TransToProtoVarType(num_classes_per_device.dtype())), - ncclSum, comm->comm(), calcu_stream)); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + // Use ProcessGroup + distributed::ProcessGroup* pg = map->get(rid); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(num_classes_per_device); + out_tensor.push_back(num_classes_per_device); + + distributed::AllreduceOptions opts; + opts.reduce_op = distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); + // use global calculate stream + const auto calcu_stream = + static_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + num_classes_per_device_ptr, num_classes_per_device_ptr, + num_classes_per_device.numel(), + platform::ToNCCLDataType( + framework::TransToProtoVarType(num_classes_per_device.dtype())), + ncclSum, comm->comm(), calcu_stream)); + } auto class_interval_ptr = class_interval->mutable_data({nranks + 1}, place); @@ -228,14 +246,21 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::NCCLComm* comm; + distributed::ProcessGroup* pg = nullptr; gpuStream_t stream; if (nranks > 1) { - comm = platform::NCCLCommContext::Instance().Get(rid, place); - - // use global calculate stream - stream = static_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + // Use ProcessGroup + pg = map->get(rid); + } else { + comm = platform::NCCLCommContext::Instance().Get(rid, place); + + // use global calculate stream + stream = static_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + } } #endif @@ -274,16 +299,16 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { // save match_logits, used for gradient computation. if (label_type == framework::proto::VarType::INT32) { typedef int32_t LabelT; - AddMarginToPositiveLogitsKernel< - T><<>>( - logits_ptr, labels->data(), margin1, margin2, margin3, rank, - nranks, N, D, class_interval.data()); + AddMarginToPositiveLogitsKernel + <<>>( + logits_ptr, labels->data(), margin1, margin2, margin3, + rank, nranks, N, D, class_interval.data()); } else if (label_type == framework::proto::VarType::INT64) { typedef int64_t LabelT; - AddMarginToPositiveLogitsKernel< - T><<>>( - logits_ptr, labels->data(), margin1, margin2, margin3, rank, - nranks, N, D, class_interval.data()); + AddMarginToPositiveLogitsKernel + <<>>( + logits_ptr, labels->data(), margin1, margin2, margin3, + rank, nranks, N, D, class_interval.data()); } else { PADDLE_THROW(platform::errors::Unimplemented( "margin_cross_entropy label type noly support int32 and int64, " @@ -306,11 +331,23 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - logits_max_buff, logits_max_buff, logits_max.numel(), - platform::ToNCCLDataType( - framework::TransToProtoVarType(logits_max.dtype())), - ncclMax, comm->comm(), stream)); + if (pg) { + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(logits_max); + out_tensor.push_back(logits_max); + + distributed::AllreduceOptions opts; + opts.reduce_op = distributed::ReduceOp::MAX; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + logits_max_buff, logits_max_buff, logits_max.numel(), + platform::ToNCCLDataType( + framework::TransToProtoVarType(logits_max.dtype())), + ncclMax, comm->comm(), stream)); + } } #endif @@ -329,18 +366,30 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), - platform::ToNCCLDataType( - framework::TransToProtoVarType(sum_exp_logits.dtype())), - ncclSum, comm->comm(), stream)); + if (pg) { + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(sum_exp_logits); + out_tensor.push_back(sum_exp_logits); + + distributed::AllreduceOptions opts; + opts.reduce_op = distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), + platform::ToNCCLDataType( + framework::TransToProtoVarType(sum_exp_logits.dtype())), + ncclSum, comm->comm(), stream)); + } } #endif // step 5, (logit - logit_max) - log(sum(exp(logit - logit_max))) - LogitsMinusLogSumKernel< - T><<>>( - logits_ptr, sum_exp_logits_buff, N, D); + LogitsMinusLogSumKernel + <<>>( + logits_ptr, sum_exp_logits_buff, N, D); // step 6, prob = exp((logit - logit_max) - log(sum(exp(logit - // logit_max)))) @@ -349,25 +398,37 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { dev_ctx, loss, static_cast(0.0)); if (label_type == framework::proto::VarType::INT32) { typedef int32_t LabelT; - HardLabelSoftmaxWithCrossEntropyKernel< - T, LabelT><<>>( - loss_ptr, logits_ptr, labels->data(), rank, N, D, - class_interval.data()); + HardLabelSoftmaxWithCrossEntropyKernel + <<>>( + loss_ptr, logits_ptr, labels->data(), rank, N, D, + class_interval.data()); } else if (label_type == framework::proto::VarType::INT64) { typedef int64_t LabelT; - HardLabelSoftmaxWithCrossEntropyKernel< - T, LabelT><<>>( - loss_ptr, logits_ptr, labels->data(), rank, N, D, - class_interval.data()); + HardLabelSoftmaxWithCrossEntropyKernel + <<>>( + loss_ptr, logits_ptr, labels->data(), rank, N, D, + class_interval.data()); } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - loss_ptr, loss_ptr, loss->numel(), - platform::ToNCCLDataType( - framework::TransToProtoVarType(loss->dtype())), - ncclSum, comm->comm(), stream)); + if (pg) { + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(*loss); + out_tensor.push_back(*loss); + + distributed::AllreduceOptions opts; + opts.reduce_op = distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + loss_ptr, loss_ptr, loss->numel(), + platform::ToNCCLDataType( + framework::TransToProtoVarType(loss->dtype())), + ncclSum, comm->comm(), stream)); + } } #endif } diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc index b4ff8b6d8dcf5..31055002993ed 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.cc +++ b/paddle/fluid/operators/margin_rank_loss_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/margin_rank_loss_op.h" + #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/operators/marker_op.cu b/paddle/fluid/operators/marker_op.cu index cfa5c6dc7a918..fe61aefe0bb3a 100644 --- a/paddle/fluid/operators/marker_op.cu +++ b/paddle/fluid/operators/marker_op.cu @@ -48,8 +48,8 @@ class MarkerOpCUDAKernel : public framework::OpKernel { "MarkerCUDA", "marker_" + marker_role + "_" + marker_pos, platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); - SimpleMarkerKernel<<<1, 32, 0, dev_ctx.stream()>>>(in_temp, out_temp, - 32); + SimpleMarkerKernel + <<<1, 32, 0, dev_ctx.stream()>>>(in_temp, out_temp, 32); } }; diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc index d32ab65509e5e..2ae4fbdbe103f 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.cc +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/match_matrix_tensor_op.h" + #include #include #include #include #include -#include "paddle/fluid/operators/match_matrix_tensor_op.h" #include "paddle/fluid/operators/search_compute.h" namespace paddle { diff --git a/paddle/fluid/operators/math.h b/paddle/fluid/operators/math.h index d4b9e35bccedc..47281fb0280f0 100644 --- a/paddle/fluid/operators/math.h +++ b/paddle/fluid/operators/math.h @@ -14,11 +14,10 @@ #pragma once +#include "math.h" // NOLINT #include "paddle/fluid/platform/float16.h" #include "paddle/phi/core/hostdevice.h" -#include "math.h" // NOLINT - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 913ce07ec673c..ac538cfbd5c68 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -1,14 +1,17 @@ -if (WITH_ASCEND_CL) - cc_library(beam_search_npu SRCS beam_search_npu.cc DEPS npu_op_runner) +if(WITH_ASCEND_CL) + cc_library( + beam_search_npu + SRCS beam_search_npu.cc + DEPS npu_op_runner) endif() # please add new math_library in alphabetical order -if (WITH_ASCEND_CL) -math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner) -elseif (WITH_MLU) -math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop) +if(WITH_ASCEND_CL) + math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner) +elseif(WITH_MLU) + math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop) else() -math_library(concat_and_split DEPS concat_and_split_functor) + math_library(concat_and_split DEPS concat_and_split_functor) endif() math_library(context_project DEPS im2col math_function) math_library(cross_entropy) @@ -22,23 +25,30 @@ math_library(sampler DEPS generator) math_library(maxouting) if(WITH_MKLDNN) - math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler mixed_vector) + math_library( + selected_rows_functor + DEPS + selected_rows_utils + math_function + blas + mkldnn_axpy_handler + mixed_vector) else() - math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mixed_vector) + math_library(selected_rows_functor DEPS selected_rows_utils math_function + blas mixed_vector) endif() math_library(sequence_padding) math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) math_library(softmax DEPS math_function jit_kernel_helper) -if (WITH_ASCEND_CL) - math_library(beam_search DEPS math_function beam_search_npu) +if(WITH_ASCEND_CL) + math_library(beam_search DEPS math_function beam_search_npu) else() - math_library(beam_search DEPS math_function) + math_library(beam_search DEPS math_function) endif() math_library(matrix_bit_code) - math_library(unpooling) math_library(vol2col) math_library(prelu) @@ -46,28 +56,58 @@ math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) math_library(matrix_solve) -cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) -cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) -cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) -cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) -cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling) -cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search) +cc_test( + selected_rows_functor_test + SRCS selected_rows_functor_test.cc + DEPS selected_rows_functor) +cc_test( + im2col_test + SRCS im2col_test.cc + DEPS im2col) +cc_test( + vol2col_test + SRCS vol2col_test.cc + DEPS vol2col) +cc_test( + sequence_padding_test + SRCS sequence_padding_test.cc + DEPS sequence_padding) +cc_test( + sequence_pooling_test + SRCS sequence_pooling_test.cc + DEPS sequence_pooling) +cc_test( + beam_search_test + SRCS beam_search_test.cc + DEPS beam_search) if(WITH_GPU) - nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function) + nv_test( + selected_rows_functor_gpu_test + SRCS selected_rows_functor_test.cu.cc + DEPS selected_rows_functor math_function) endif() if(WITH_ROCM) - hip_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function) + hip_test( + selected_rows_functor_gpu_test + SRCS selected_rows_functor_test.cu.cc + DEPS selected_rows_functor math_function) endif() -cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) +cc_test( + concat_test + SRCS concat_test.cc + DEPS concat_and_split) if(WITH_GPU AND (NOT WITH_ROCM)) -#currenty not yet support ROCM -#the generic conversion APIs of dense and sparse are only supported after cuda11.2 - if((NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2)) - cc_test(cusparse_conversion_api_test SRCS cusparse_conversion_api_test.cc DEPS tensor) - endif() + #currenty not yet support ROCM + #the generic conversion APIs of dense and sparse are only supported after cuda11.2 + if((NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2)) + cc_test( + cusparse_conversion_api_test + SRCS cusparse_conversion_api_test.cc + DEPS tensor) + endif() endif() if(WITH_TESTING AND TEST im2col_test) - set_tests_properties(im2col_test PROPERTIES TIMEOUT 120) + set_tests_properties(im2col_test PROPERTIES TIMEOUT 120) endif() diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index 486979aa0a8b3..7a21f2f64978d 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -348,11 +348,10 @@ class BeamSearchFunctor { float* selected_scores_data = selected_scores->mutable_data(selected_dims, context.GetPlace()); int* parent_idx_data = - parent_idx - ? parent_idx->mutable_data( - {static_cast(num_seqs * beam_size)}, - context.GetPlace()) - : nullptr; + parent_idx ? parent_idx->mutable_data( + {static_cast(num_seqs * beam_size)}, + context.GetPlace()) + : nullptr; framework::LoD selected_lod(2); selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); @@ -369,8 +368,8 @@ class BeamSearchFunctor { static_cast(beam_size)); switch (platform::RoundToPowerOfTwo(beam_size * seq_width)) { CUDA_LAUNCH_KERNEL_HELPER( - BeamSearchKernelSingle<<< - 1, kMaxThreadsPerSeq, 0, context.stream()>>>( + BeamSearchKernelSingle + <<<1, kMaxThreadsPerSeq, 0, context.stream()>>>( selected_ids_data, selected_scores_data, parent_idx_data, selected_offsets, pre_ids_data, pre_scores_data, ids_data, scores_data, seq_length, static_cast(seq_width), @@ -387,8 +386,8 @@ class BeamSearchFunctor { static_cast(beam_size)); switch (platform::RoundToPowerOfTwo(beam_size * num_seqs * 32)) { CUDA_LAUNCH_KERNEL_HELPER( - BeamSearchKernel<<< - 1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>( + BeamSearchKernel + <<<1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>( selected_ids_data, selected_scores_data, parent_idx_data, selected_offsets, pre_ids_data, pre_scores_data, ids_data, scores_data, seq_offsets, static_cast(num_seqs), diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h index 4474e7ea52aff..c0d39aa2d8fa9 100644 --- a/paddle/fluid/operators/math/beam_search.h +++ b/paddle/fluid/operators/math/beam_search.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index b0547ef9d956c..7cf4c867db7a3 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/beam_search.h" #include + #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index 0cdad6beeb9f6..4aba6f3c0b9e9 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -815,23 +815,23 @@ void SkipLayerNormFunctor::operator()(const int num, const int hidden, const int threads = 256; if (hidden % 2 == 0) { if (std::is_same::value) { - SkipLayerNormKernel2<<>>( - num, hidden / 2, reinterpret_cast(input1), - reinterpret_cast(input2), - reinterpret_cast(output), - reinterpret_cast(scale), - reinterpret_cast(bias), eps); + SkipLayerNormKernel2 + <<>>( + num, hidden / 2, reinterpret_cast(input1), + reinterpret_cast(input2), + reinterpret_cast(output), + reinterpret_cast(scale), + reinterpret_cast(bias), eps); // HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake #ifndef __HIPCC__ } else if (std::is_same::value) { - SkipLayerNormKernel2<__half, __half2, - threads><<>>( - num, hidden / 2, reinterpret_cast(input1), - reinterpret_cast(input2), - reinterpret_cast<__half2 *>(output), - reinterpret_cast(scale), - reinterpret_cast(bias), eps); + SkipLayerNormKernel2<__half, __half2, threads> + <<>>( + num, hidden / 2, reinterpret_cast(input1), + reinterpret_cast(input2), + reinterpret_cast<__half2 *>(output), + reinterpret_cast(scale), + reinterpret_cast(bias), eps); #endif } else { assert(false); diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index 683606ec73383..fd40ac540bfdc 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -17,10 +17,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #include + #include // NOLINT #endif #ifdef PADDLE_WITH_HIP #include + #include namespace cub = hipcub; #endif diff --git a/paddle/fluid/operators/math/bloomfilter.h b/paddle/fluid/operators/math/bloomfilter.h index fa3d37ed5f41e..f16fdd135b5a4 100644 --- a/paddle/fluid/operators/math/bloomfilter.h +++ b/paddle/fluid/operators/math/bloomfilter.h @@ -16,11 +16,9 @@ limitations under the License. */ #define BLOOMFILTER_MAGIC_NUM_NEW 17070416 #include -#include - #include +#include #include - #include namespace paddle { diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index e51631385eb75..1ea8cafd25e08 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/concat_and_split.h" - #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h index b5b0aae23ac87..3b6a12e24023e 100644 --- a/paddle/fluid/operators/math/concat_and_split.h +++ b/paddle/fluid/operators/math/concat_and_split.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index de358bf623e61..542dcda963aea 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -119,13 +119,13 @@ void ConcatCase1(DeviceContext* context) { } /** - * case 2: - * inputs: - * t_a.shape: [2, 3, 4] - * t_b.shape: [2, 4, 4] - * output: - * out.shape: [2, 7, 4] - */ + * case 2: + * inputs: + * t_a.shape: [2, 3, 4] + * t_b.shape: [2, 4, 4] + * output: + * out.shape: [2, 7, 4] + */ template void ConcatCase2(DeviceContext* context) { paddle::framework::Tensor input_a_cpu; @@ -222,13 +222,13 @@ void ConcatCase2(DeviceContext* context) { } /** - * case 3: - * inputs: - * t_a.shape: [2, 3, 5] - * t_b.shape: [2, 3, 4] - * output: - * out.shape: [2, 3, 9] - */ + * case 3: + * inputs: + * t_a.shape: [2, 3, 5] + * t_b.shape: [2, 3, 4] + * output: + * out.shape: [2, 3, 9] + */ template void ConcatCase3(DeviceContext* context) { paddle::framework::Tensor input_a_cpu; @@ -326,14 +326,14 @@ void ConcatCase3(DeviceContext* context) { } /** - * case 4: - * inputs: - * axis = 1 - * t_a.shape: [2, 3, 4] - * t_b.shape: [2, 3, 4] - * output: - * out.shape: [2, 6, 4] - */ + * case 4: + * inputs: + * axis = 1 + * t_a.shape: [2, 3, 4] + * t_b.shape: [2, 3, 4] + * output: + * out.shape: [2, 6, 4] + */ template void ConcatCase4(DeviceContext* context) { paddle::framework::Tensor input_a_cpu; diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc index cb2f59182c111..a2b83f998566f 100644 --- a/paddle/fluid/operators/math/cross_entropy.cc +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/cross_entropy.h" + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/backends/cpu/cpu_context.h" diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index da7340e4eb0b3..e562816d6dab6 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/phi/core/hostdevice.h" diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 1ade2190bb96e..22ce162a44ce0 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -42,9 +42,10 @@ static void CheckEighResult(const int batch, const int info) { "tridiagonal form did not converge to zero", batch, info)); PADDLE_ENFORCE_GE( - info, 0, platform::errors::PreconditionNotMet( - "For batch [%d]: the [%d] argument had an illegal value", - batch, info)); + info, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: the [%d] argument had an illegal value", batch, + info)); } template diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu index 9b03895cdef25..946a1477c3b6a 100644 --- a/paddle/fluid/operators/math/gru_compute.cu +++ b/paddle/fluid/operators/math/gru_compute.cu @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h" #include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/fluid/operators/math/gru_compute.h" @@ -36,35 +37,35 @@ struct GRUUnitFunctor { int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; threads = dim3(tiled_size, 1); grid = dim3(frame_blocks, 1); - detail::KeFastCollectiveGruGate< - T, tiled_size><<>>( - value.gate_value, value.prev_out_value, value.gate_weight, - value.reset_output_value, frame_size, active_gate); + detail::KeFastCollectiveGruGate + <<>>( + value.gate_value, value.prev_out_value, value.gate_weight, + value.reset_output_value, frame_size, active_gate); frame_blocks = (frame_size + tiled_size - 1) / tiled_size; grid = dim3(frame_blocks, 1); - detail::KeFastCollectiveGruOut< - T, tiled_size><<>>( - value.state_weight, value.prev_out_value, value.output_value, - value.gate_value, value.reset_output_value, frame_size, - active_node, origin_mode); + detail::KeFastCollectiveGruOut + <<>>( + value.state_weight, value.prev_out_value, value.output_value, + value.gate_value, value.reset_output_value, frame_size, + active_node, origin_mode); } else { constexpr int tiled_size = 16; int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; threads = dim3(tiled_size, 1); grid = dim3(frame_blocks, 1); - detail::KeFastCollectiveGruGate< - T, tiled_size><<>>( - value.gate_value, value.prev_out_value, value.gate_weight, - value.reset_output_value, frame_size, active_gate); + detail::KeFastCollectiveGruGate + <<>>( + value.gate_value, value.prev_out_value, value.gate_weight, + value.reset_output_value, frame_size, active_gate); frame_blocks = (frame_size + tiled_size - 1) / tiled_size; grid = dim3(frame_blocks, 1); - detail::KeFastCollectiveGruOut< - T, tiled_size><<>>( - value.state_weight, value.prev_out_value, value.output_value, - value.gate_value, value.reset_output_value, frame_size, - active_node, origin_mode); + detail::KeFastCollectiveGruOut + <<>>( + value.state_weight, value.prev_out_value, value.output_value, + value.gate_value, value.reset_output_value, frame_size, + active_node, origin_mode); } return; } else { @@ -86,18 +87,18 @@ struct GRUUnitFunctor { if (batch_size == 1) { detail::KeGruForwardResetOutput, - /* is_batch= */ false, - T><<>>( - detail::forward::gru_resetOutput(), value.gate_value, - value.reset_output_value, value.prev_out_value, frame_size, - batch_size, active_gate); + /* is_batch= */ false, T> + <<>>( + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); } else { detail::KeGruForwardResetOutput, - /* is_batch= */ true, - T><<>>( - detail::forward::gru_resetOutput(), value.gate_value, - value.reset_output_value, value.prev_out_value, frame_size, - batch_size, active_gate); + /* is_batch= */ true, T> + <<>>( + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); } if (value.prev_out_value) { @@ -109,18 +110,18 @@ struct GRUUnitFunctor { if (batch_size == 1) { detail::KeGruForwardFinalOutput, - /* is_batch= */ false, - T><<>>( - detail::forward::gru_finalOutput(), value.gate_value, - value.prev_out_value, value.output_value, frame_size, batch_size, - active_node, origin_mode); + /* is_batch= */ false, T> + <<>>(detail::forward::gru_finalOutput(), + value.gate_value, value.prev_out_value, + value.output_value, frame_size, + batch_size, active_node, origin_mode); } else { detail::KeGruForwardFinalOutput, - /* is_batch= */ true, - T><<>>( - detail::forward::gru_finalOutput(), value.gate_value, - value.prev_out_value, value.output_value, frame_size, batch_size, - active_node, origin_mode); + /* is_batch= */ true, T> + <<>>(detail::forward::gru_finalOutput(), + value.gate_value, value.prev_out_value, + value.output_value, frame_size, + batch_size, active_node, origin_mode); } } }; @@ -147,19 +148,21 @@ struct GRUUnitGradFunctor { } if (batch_size == 1) { - detail::KeGruBackwardStateGrad< - detail::backward::gru_stateGrad, - /* is_batch= */ false><<>>( - detail::backward::gru_stateGrad(), value.gate_value, - grad.gate_grad, value.prev_out_value, grad.prev_out_grad, - grad.output_grad, frame_size, batch_size, active_node, origin_mode); + detail::KeGruBackwardStateGrad, + /* is_batch= */ false> + <<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node, + origin_mode); } else { - detail::KeGruBackwardStateGrad< - detail::backward::gru_stateGrad, - /* is_batch= */ true><<>>( - detail::backward::gru_stateGrad(), value.gate_value, - grad.gate_grad, value.prev_out_value, grad.prev_out_grad, - grad.output_grad, frame_size, batch_size, active_node, origin_mode); + detail::KeGruBackwardStateGrad, + /* is_batch= */ true> + <<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node, + origin_mode); } auto blas = phi::funcs::GetBlas(context); @@ -179,19 +182,19 @@ struct GRUUnitGradFunctor { } if (batch_size == 1) { - detail::KeGruBackwardResetGrad< - detail::backward::gru_resetGrad, - /* is_batch= */ false><<>>( - detail::backward::gru_resetGrad(), value.gate_value, - grad.gate_grad, value.prev_out_value, grad.prev_out_grad, - grad.reset_output_grad, frame_size, batch_size, active_gate); + detail::KeGruBackwardResetGrad, + /* is_batch= */ false> + <<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); } else { - detail::KeGruBackwardResetGrad< - detail::backward::gru_resetGrad, - /* is_batch= */ true><<>>( - detail::backward::gru_resetGrad(), value.gate_value, - grad.gate_grad, value.prev_out_value, grad.prev_out_grad, - grad.reset_output_grad, frame_size, batch_size, active_gate); + detail::KeGruBackwardResetGrad, + /* is_batch= */ true> + <<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); } if (grad.prev_out_grad && value.prev_out_value) { diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 8fc6c52122abf..1f5f575c7c350 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -111,16 +111,18 @@ class Col2ImFunctor #include + #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -220,16 +221,18 @@ class Col2ImFunctor + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h index 01f1e220e65d9..f3755653f28d4 100644 --- a/paddle/fluid/operators/math/im2col_cfo_cpu.h +++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/tensor.h" namespace paddle { diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 0e4032986cf0c..ff766cfad2cb1 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" + #include + #include "paddle/fluid/operators/math/im2col_cfo_cpu.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index b77e23450360c..bd170b674042d 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -24,6 +24,7 @@ namespace cub = hipcub; #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/enforce.h" @@ -196,15 +197,15 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, grid_dim = std::min(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]); dim3 thread_dims(kThreadNumX, kThreadNumY); if (reverse) { - InclusiveScanInnerDimCUDAKernel< - T, BinaryOp, kThreadNumX, kThreadNumY, - /*kReverse=*/true><<>>( - x, y, outer_dim, inner_dim, init, op); + InclusiveScanInnerDimCUDAKernel + <<>>(x, y, outer_dim, + inner_dim, init, op); } else { - InclusiveScanInnerDimCUDAKernel< - T, BinaryOp, kThreadNumX, kThreadNumY, - /*kReverse=*/false><<>>( - x, y, outer_dim, inner_dim, init, op); + InclusiveScanInnerDimCUDAKernel + <<>>(x, y, outer_dim, + inner_dim, init, op); } } diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 083d6967ff03a..a3c1d23e89b37 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -25,6 +25,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/platform/bfloat16.h" diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 359552a0717a0..1d6afa50cc930 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows_utils.h" diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc index 7b239b8166644..f2b083b833701 100644 --- a/paddle/fluid/operators/math/matrix_solve.cc +++ b/paddle/fluid/operators/math/matrix_solve.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/matrix_solve.h" + #include "Eigen/Core" #include "Eigen/LU" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc index 737196dde1dfc..59c8c07e6e186 100644 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ b/paddle/fluid/operators/math/matrix_solve.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/matrix_solve.h" + #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/solve_op.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h index 415d0c6dd8e0c..cecc3517934c7 100644 --- a/paddle/fluid/operators/math/matrix_solve.h +++ b/paddle/fluid/operators/math/matrix_solve.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "Eigen/Core" #include "Eigen/LU" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index f86eb103449f6..1ae0c709e4da9 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include #include diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc index 5f1cd25941614..d645e1994f101 100644 --- a/paddle/fluid/operators/math/sampler.cc +++ b/paddle/fluid/operators/math/sampler.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sampler.h" #include + #include "paddle/fluid/framework/generator.h" namespace paddle { diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index e4b033b6c5857..7689c31838d33 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/selected_rows_functor.h" + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/platform/device/device_wrapper.h" diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index db5c66d319701..edcb21cb56a25 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -163,10 +163,10 @@ struct SelectedRowsAddTensor { dim3 threads(block_size, 1); dim3 grid(in1_rows.size(), 1); paddle::framework::MixVector mixv_in1_rows(&in1_rows); - SelectedRowsAddTensorKernel< - T, block_size><<>>( - in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data, - in1_row_numel); + SelectedRowsAddTensorKernel + <<>>( + in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data, + in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); @@ -223,10 +223,10 @@ struct SelectedRowsAddTensor { dim3 threads(block_size, 1); dim3 grid(in1_rows.size(), 1); paddle::framework::MixVector mixv_in1_rows(&in1_rows); - SelectedRowsAddTensorKernel< - T, block_size><<>>( - in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data, - in1_row_numel); + SelectedRowsAddTensorKernel + <<>>( + in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data, + in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); @@ -343,10 +343,10 @@ struct SelectedRowsAddToTensor { dim3 threads(block_size, 1); dim3 grid(in1_rows.size(), 1); paddle::framework::MixVector mixv_in1_rows(&in1_rows); - SelectedRowsAddToTensorKernel< - T, block_size><<>>( - in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data, - in1_row_numel); + SelectedRowsAddToTensorKernel + <<>>( + in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data, + in1_row_numel); } }; @@ -380,10 +380,10 @@ struct SelectedRowsAddToTensor { dim3 threads(block_size, 1); dim3 grid(in1_rows.size(), 1); paddle::framework::MixVector mixv_in1_rows(&in1_rows); - SelectedRowsAddToTensorKernel< - T, block_size><<>>( - in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data, - in1_row_numel); + SelectedRowsAddToTensorKernel + <<>>( + in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data, + in1_row_numel); } }; @@ -695,9 +695,9 @@ struct UpdateToTensor { dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1); dim3 grid(in1_rows.size(), 1); - UpdateToTensorKernel<<< - grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(), - op, in2_data, in1_row_numel); + UpdateToTensorKernel + <<>>(in1_data, in1_rows.cuda_data(), + op, in2_data, in1_row_numel); } }; } // namespace scatter diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index e0e28f93f367e..e6358cda274f6 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -457,8 +457,9 @@ TEST(selected_rows_functor, cpu_sum_to) { paddle::operators::math::SelectedRowsSumTo sum_to_functor; - sum_to_functor(ctx, std::vector( - {selected_rows1.get(), selected_rows2.get()}), + sum_to_functor(ctx, + std::vector( + {selected_rows1.get(), selected_rows2.get()}), std::vector({0, in1_value->numel()}), output.get()); auto out_height = output->height(); EXPECT_EQ(out_height, height); diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 0912a964792a8..6e1d0bb367050 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/selected_rows_functor.h" + #include "gtest/gtest.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index 35ba8c1d118a8..97e276fff02d7 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sequence_padding.h" + #include "paddle/phi/backends/cpu/cpu_context.h" namespace phi { diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 9aca6ad0f5a2f..ef7981858a96d 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/operators/math/sequence_padding.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h index 956a4ff6a2d45..687c64fc23e5d 100644 --- a/paddle/fluid/operators/math/sequence_padding.h +++ b/paddle/fluid/operators/math/sequence_padding.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -64,13 +65,14 @@ inline static void CheckDims(const framework::DDim& seq_tensor_dims, PADDLE_ENFORCE_EQ( seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || seq_tensor_dims.size() == pad_tensor_dims.size(), - true, platform::errors::InvalidArgument( - "pad_tensor's rank should be 1 greater than seq_tensor's " - "rank, or be equal with it. The pad_tensor's rank is %ld, " - "expected the seq_tensor's rank is %ld or %ld, but got %ld. " - "Please check the input value.", - pad_tensor_dims.size(), pad_tensor_dims.size(), - pad_tensor_dims.size() - 1, seq_tensor_dims.size())); + true, + platform::errors::InvalidArgument( + "pad_tensor's rank should be 1 greater than seq_tensor's " + "rank, or be equal with it. The pad_tensor's rank is %ld, " + "expected the seq_tensor's rank is %ld or %ld, but got %ld. " + "Please check the input value.", + pad_tensor_dims.size(), pad_tensor_dims.size(), + pad_tensor_dims.size() - 1, seq_tensor_dims.size())); } /* diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 8312d7cd9b72b..9abe9e598881a 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/math/sequence_pooling.h" + #include #include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/fluid/operators/math/sequence_pooling.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index fa7b043153851..217b29e1b6b18 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/operators/math/sequence_pooling.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/macros.h" @@ -170,41 +171,41 @@ class SequencePoolFunctor { dim3 grid(std::max(static_cast(lod.size()) - 1, 1), 1); paddle::framework::MixVector mix_vector(&lod); if (pooltype == "MAX") { - sequence_pool_kernel< - T, MaxPoolFunctor><<>>( - MaxPoolFunctor(), input.data(), pad_value, - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - output->mutable_data(context.GetPlace()), index->data()); + sequence_pool_kernel> + <<>>( + MaxPoolFunctor(), input.data(), pad_value, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), index->data()); } else if (pooltype == "AVERAGE") { - sequence_pool_kernel< - T, AvgPoolFunctor><<>>( - AvgPoolFunctor(), input.data(), pad_value, - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - output->mutable_data(context.GetPlace()), nullptr); + sequence_pool_kernel> + <<>>( + AvgPoolFunctor(), input.data(), pad_value, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SUM") { - sequence_pool_kernel< - T, SumPoolFunctor><<>>( - SumPoolFunctor(), input.data(), pad_value, - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - output->mutable_data(context.GetPlace()), nullptr); + sequence_pool_kernel> + <<>>( + SumPoolFunctor(), input.data(), pad_value, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SQRT") { - sequence_pool_kernel< - T, SqrtPoolFunctor><<>>( - SqrtPoolFunctor(), input.data(), pad_value, - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - output->mutable_data(context.GetPlace()), nullptr); + sequence_pool_kernel> + <<>>( + SqrtPoolFunctor(), input.data(), pad_value, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "LAST") { - sequence_pool_kernel< - T, LastPoolFunctor><<>>( - LastPoolFunctor(), input.data(), pad_value, - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - output->mutable_data(context.GetPlace()), nullptr); + sequence_pool_kernel> + <<>>( + LastPoolFunctor(), input.data(), pad_value, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "FIRST") { - sequence_pool_kernel< - T, FirstPoolFunctor><<>>( - FirstPoolFunctor(), input.data(), pad_value, - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - output->mutable_data(context.GetPlace()), nullptr); + sequence_pool_kernel> + <<>>( + FirstPoolFunctor(), input.data(), pad_value, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), nullptr); } else { PADDLE_THROW(platform::errors::InvalidArgument( "unsupported pooling pooltype: %s. Only support \"MAX\", " @@ -338,41 +339,41 @@ class SequencePoolGradFunctor { dim3 grid(std::max(static_cast(lod.size()) - 1, 1), 1); paddle::framework::MixVector mix_vector(&lod); if (pooltype == "MAX") { - sequence_pool_grad_kernel< - T, MaxPoolGradFunctor><<>>( - MaxPoolGradFunctor(), out_grad.data(), - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - in_grad->mutable_data(context.GetPlace()), index->data()); + sequence_pool_grad_kernel> + <<>>( + MaxPoolGradFunctor(), out_grad.data(), + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), index->data()); } else if (pooltype == "AVERAGE") { - sequence_pool_grad_kernel< - T, AvgPoolGradFunctor><<>>( - AvgPoolGradFunctor(), out_grad.data(), - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - in_grad->mutable_data(context.GetPlace()), nullptr); + sequence_pool_grad_kernel> + <<>>( + AvgPoolGradFunctor(), out_grad.data(), + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SUM") { - sequence_pool_grad_kernel< - T, SumPoolGradFunctor><<>>( - SumPoolGradFunctor(), out_grad.data(), - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - in_grad->mutable_data(context.GetPlace()), nullptr); + sequence_pool_grad_kernel> + <<>>( + SumPoolGradFunctor(), out_grad.data(), + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SQRT") { - sequence_pool_grad_kernel< - T, SqrtPoolGradFunctor><<>>( - SqrtPoolGradFunctor(), out_grad.data(), - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - in_grad->mutable_data(context.GetPlace()), nullptr); + sequence_pool_grad_kernel> + <<>>( + SqrtPoolGradFunctor(), out_grad.data(), + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "LAST") { - sequence_pool_grad_kernel< - T, LastPoolGradFunctor><<>>( - LastPoolGradFunctor(), out_grad.data(), - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - in_grad->mutable_data(context.GetPlace()), nullptr); + sequence_pool_grad_kernel> + <<>>( + LastPoolGradFunctor(), out_grad.data(), + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "FIRST") { - sequence_pool_grad_kernel< - T, FirstPoolGradFunctor><<>>( - FirstPoolGradFunctor(), out_grad.data(), - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - in_grad->mutable_data(context.GetPlace()), nullptr); + sequence_pool_grad_kernel> + <<>>( + FirstPoolGradFunctor(), out_grad.data(), + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h index 847d0bca951a7..f5b6701b46ef4 100644 --- a/paddle/fluid/operators/math/sequence_pooling.h +++ b/paddle/fluid/operators/math/sequence_pooling.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 815d221e2556a..6d9c75f955041 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sequence_pooling.h" + #include template diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc index bc8832a1bbc56..8f954e068c048 100644 --- a/paddle/fluid/operators/math/sequence_scale.cc +++ b/paddle/fluid/operators/math/sequence_scale.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sequence_scale.h" + #include "paddle/phi/backends/cpu/cpu_context.h" namespace phi { diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu index 253a67c2c8cbe..c0b97497cc7bf 100644 --- a/paddle/fluid/operators/math/sequence_scale.cu +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -53,10 +53,10 @@ class ScaleLoDTensorFunctor { seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, seq_width); #else - SequenceScaleKernel<<< - num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( - seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, - seq_width); + SequenceScaleKernel + <<>>( + seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, + seq_width); #endif mix_vector.CopyToCPU(); } @@ -82,10 +82,10 @@ class ScaleLoDTensorFunctor { seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, seq_width); #else - SequenceScaleKernel<<< - num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( - seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, - seq_width); + SequenceScaleKernel + <<>>( + seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, + seq_width); #endif mix_vector.CopyToCPU(); } diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc index c855cb763a97b..adea86a6c5a87 100644 --- a/paddle/fluid/operators/math/softmax.cc +++ b/paddle/fluid/operators/math/softmax.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/softmax.h" + #include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 69642c8194221..33da631d27b14 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/jit/kernels.h" @@ -66,34 +67,32 @@ class SoftmaxEigen { if (num_remain == 1) { // axis == -1, axis and class in same dimension, calculate along // class dimension directly for higher performance - softmax.device(*context.eigen_device()) = (logits - - logits.maximum(along_axis) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)) - .unaryExpr(ValueClip()); + softmax.device(*context.eigen_device()) = + (logits - logits.maximum(along_axis) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); } else { // axis != -1, class dimension split into (axis, remain), max and sum // should be calculated along axis dimension softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - - logits.reshape(batch_axis_remain) - .maximum(along_axis) - .eval() - .reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) + (logits.reshape(batch_axis_remain) - logits.reshape(batch_axis_remain) + .maximum(along_axis) + .eval() + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) .unaryExpr(ValueClip()); } softmax.device(*context.eigen_device()) = softmax.exp(); softmax.device(*context.eigen_device()) = - (softmax * - softmax.reshape(batch_axis_remain) - .sum(along_axis) - .inverse() - .eval() - .broadcast(one_axis)); + (softmax * softmax.reshape(batch_axis_remain) + .sum(along_axis) + .inverse() + .eval() + .broadcast(one_axis)); } }; @@ -128,31 +127,28 @@ class SoftmaxEigen { // axis == -1, axis and class in same dimension, calculate along // class dimension directly for higher performance softmax.device(*context.eigen_device()) = - (logits - - logits.maximum(along_axis) - .reshape(batch_by_one) - .broadcast(one_by_class)) + (logits - logits.maximum(along_axis) + .reshape(batch_by_one) + .broadcast(one_by_class)) .unaryExpr(ValueClip()); } else { // axis != -1, class dimension split into (axis, remain), max and sum // should be calculated along axis dimension softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - - logits.reshape(batch_axis_remain) - .maximum(along_axis) - .reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) + (logits.reshape(batch_axis_remain) - logits.reshape(batch_axis_remain) + .maximum(along_axis) + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) .unaryExpr(ValueClip()); } softmax.device(*context.eigen_device()) = softmax.exp(); softmax.device(*context.eigen_device()) = - (softmax * - softmax.reshape(batch_axis_remain) - .sum(along_axis) - .inverse() - .broadcast(one_axis)); + (softmax * softmax.reshape(batch_axis_remain) + .sum(along_axis) + .inverse() + .broadcast(one_axis)); } }; @@ -187,31 +183,28 @@ class SoftmaxEigen { // axis == -1, axis and class in same dimension, calculate along // class dimension directly for higher performance softmax.device(*context.eigen_device()) = - (logits - - logits.maximum(along_axis) - .reshape(batch_by_one) - .broadcast(one_by_class)) + (logits - logits.maximum(along_axis) + .reshape(batch_by_one) + .broadcast(one_by_class)) .unaryExpr(ValueClip()); } else { // axis != -1, class dimension split into (axis, remain), max and sum // should be calculated along axis dimension softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - - logits.reshape(batch_axis_remain) - .maximum(along_axis) - .reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) + (logits.reshape(batch_axis_remain) - logits.reshape(batch_axis_remain) + .maximum(along_axis) + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) .unaryExpr(ValueClip()); } softmax.device(*context.eigen_device()) = softmax.exp(); softmax.device(*context.eigen_device()) = - (softmax * - softmax.reshape(batch_axis_remain) - .sum(along_axis) - .inverse() - .broadcast(one_axis)); + (softmax * softmax.reshape(batch_axis_remain) + .sum(along_axis) + .inverse() + .broadcast(one_axis)); } }; diff --git a/paddle/fluid/operators/math/sparse_impl.cu.h b/paddle/fluid/operators/math/sparse_impl.cu.h index dd2d256dd73b2..03f94ed573604 100644 --- a/paddle/fluid/operators/math/sparse_impl.cu.h +++ b/paddle/fluid/operators/math/sparse_impl.cu.h @@ -14,11 +14,10 @@ #pragma once +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/cusparse.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index cd1fa13001ce2..8ad0a17c27ea9 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/math/tree2col.h" + #include #include diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu index bdaab212ab170..c8bba20a423e5 100644 --- a/paddle/fluid/operators/math/tree2col.cu +++ b/paddle/fluid/operators/math/tree2col.cu @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/operators/math/tree2col.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/math/tree2col.h b/paddle/fluid/operators/math/tree2col.h index 88104b858ba01..df4b233a763d7 100644 --- a/paddle/fluid/operators/math/tree2col.h +++ b/paddle/fluid/operators/math/tree2col.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index fb61a36a8e1a7..d8581d731e82b 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/math/vol2col.h b/paddle/fluid/operators/math/vol2col.h index 3122828b2eeba..cddcb0af467dc 100644 --- a/paddle/fluid/operators/math/vol2col.h +++ b/paddle/fluid/operators/math/vol2col.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index 210cf10d8879d..4889817cd9eac 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/vol2col.h" #include + #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 9d381e1f22b5f..2c16774e324a7 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -258,13 +259,14 @@ class MatMulGradKernel : public framework::OpKernel { MatMul(context, a, trans_a, b, trans_b, out); } else { auto &ctx = context.template device_context(); - MatMul(context, is_fold_init_dims_a - ? FoldInitDims(a) - : FoldHeadAndLastDims(ctx, a), - trans_a, is_fold_init_dims_b - ? FoldInitDims(b) - : FoldHeadAndLastDims(ctx, b), - trans_b, out); + MatMul( + context, + is_fold_init_dims_a ? FoldInitDims(a) + : FoldHeadAndLastDims(ctx, a), + trans_a, + is_fold_init_dims_b ? FoldInitDims(b) + : FoldHeadAndLastDims(ctx, b), + trans_b, out); } } @@ -425,13 +427,14 @@ class MatMulDoubleGradKernel : public framework::OpKernel { MatMul(context, a, trans_a, b, trans_b, flag, out); } else { auto &ctx = context.template device_context(); - MatMul(context, is_fold_init_dims_a - ? FoldInitDims(a) - : FoldHeadAndLastDims(ctx, a), - trans_a, is_fold_init_dims_b - ? FoldInitDims(b) - : FoldHeadAndLastDims(ctx, b), - trans_b, flag, out); + MatMul( + context, + is_fold_init_dims_a ? FoldInitDims(a) + : FoldHeadAndLastDims(ctx, a), + trans_a, + is_fold_init_dims_b ? FoldInitDims(b) + : FoldHeadAndLastDims(ctx, b), + trans_b, flag, out); } } @@ -602,12 +605,13 @@ class MatMulOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( mat_dim_x.batch_size_ == mat_dim_y.batch_size_ || mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0, - true, platform::errors::InvalidArgument( - "The batch size of the two matrices should be equal, or " - "at least one is zero.\n" - "But received X's shape: %s, Y's shape: %s.", - DumpMatrixShape(mat_dim_x).c_str(), - DumpMatrixShape(mat_dim_y).c_str())); + true, + platform::errors::InvalidArgument( + "The batch size of the two matrices should be equal, or " + "at least one is zero.\n" + "But received X's shape: %s, Y's shape: %s.", + DumpMatrixShape(mat_dim_x).c_str(), + DumpMatrixShape(mat_dim_y).c_str())); } int64_t dim_out_y = mat_dim_y.width_; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ @@ -996,13 +1000,12 @@ REGISTER_OP_CUDA_KERNEL( ops::MatMulDoubleGradKernel); #endif -REGISTER_OP_VERSION(matmul) - .AddCheckpoint( - R"ROC(Register matmul for adding the attribute of +REGISTER_OP_VERSION(matmul).AddCheckpoint( + R"ROC(Register matmul for adding the attribute of fused_reshape_Y)ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "fused_reshape_Y", - "In order to support the function of fused the input Y " - " and input X into the input X when " - "using the operator of matmul, and get raw shape of input Y.", - std::vector{})); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "fused_reshape_Y", + "In order to support the function of fused the input Y " + " and input X into the input X when " + "using the operator of matmul, and get raw shape of input Y.", + std::vector{})); diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc index 80d4492e04981..3477715d6d3de 100644 --- a/paddle/fluid/operators/matmul_op_xpu.cc +++ b/paddle/fluid/operators/matmul_op_xpu.cc @@ -315,14 +315,15 @@ class MatMulGradXPUKernel : public framework::OpKernel { MatMul(context, a, trans_a, b, trans_b, out); } else { auto &dev_ctx = context.template device_context(); - MatMul( - context, is_fold_init_dims_a - ? FoldInitDims(a) - : XPUFoldHeadAndLastDims(dev_ctx, a), - trans_a, is_fold_init_dims_b - ? FoldInitDims(b) - : XPUFoldHeadAndLastDims(dev_ctx, b), - trans_b, out); + MatMul(context, + is_fold_init_dims_a + ? FoldInitDims(a) + : XPUFoldHeadAndLastDims(dev_ctx, a), + trans_a, + is_fold_init_dims_b + ? FoldInitDims(b) + : XPUFoldHeadAndLastDims(dev_ctx, b), + trans_b, out); } } diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 162ebdafec1cb..168a3dbfeaac1 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/matmul_v2_op.h" + #include #include diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index 34a8e97af2e1c..b47cdf6e8cb0d 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/dot_op.h" diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc index 87df75ac46504..f85e714ce9555 100644 --- a/paddle/fluid/operators/matmul_v2_op_xpu.cc +++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc @@ -14,10 +14,10 @@ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/matmul_v2_op.h" #include #include +#include "paddle/fluid/operators/matmul_v2_op.h" #include "paddle/fluid/operators/xpu_api_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc index 56f65340ea999..ffbb8538d947a 100644 --- a/paddle/fluid/operators/matrix_power_op.cc +++ b/paddle/fluid/operators/matrix_power_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" diff --git a/paddle/fluid/operators/matrix_rank_op.cc b/paddle/fluid/operators/matrix_rank_op.cc index e7d08b6597360..fddfaa3526a07 100644 --- a/paddle/fluid/operators/matrix_rank_op.cc +++ b/paddle/fluid/operators/matrix_rank_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/phi/kernels/funcs/compare_functors.h" diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h index 9fa00e60e0550..1cf9f4433bc2c 100644 --- a/paddle/fluid/operators/mean_iou_op.h +++ b/paddle/fluid/operators/mean_iou_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/mean_op_xpu.cc b/paddle/fluid/operators/mean_op_xpu.cc index ef96fe2f03ba4..811b138c8d10d 100644 --- a/paddle/fluid/operators/mean_op_xpu.cc +++ b/paddle/fluid/operators/mean_op_xpu.cc @@ -56,8 +56,9 @@ class MeanGradXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto OG = context.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ(OG->numel(), 1, platform::errors::InvalidArgument( - "Mean Gradient should be scalar")); + PADDLE_ENFORCE_EQ( + OG->numel(), 1, + platform::errors::InvalidArgument("Mean Gradient should be scalar")); auto IG = context.Output(framework::GradVarName("X")); IG->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index e2b86bd0e3b92..0d4c2f7b3b4b0 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" - #include "paddle/phi/core/lod_utils.h" namespace phi { diff --git a/paddle/fluid/operators/merge_selected_rows_op.cc b/paddle/fluid/operators/merge_selected_rows_op.cc index ea223ad1b3231..cfb8aa1f8a76e 100644 --- a/paddle/fluid/operators/merge_selected_rows_op.cc +++ b/paddle/fluid/operators/merge_selected_rows_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/merge_selected_rows_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/merge_selected_rows_op.h b/paddle/fluid/operators/merge_selected_rows_op.h index 4c87a4a641194..d0f18b22b2797 100644 --- a/paddle/fluid/operators/merge_selected_rows_op.h +++ b/paddle/fluid/operators/merge_selected_rows_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" diff --git a/paddle/fluid/operators/meshgrid_op.cc b/paddle/fluid/operators/meshgrid_op.cc index 5a6862f380da1..cc57a25a1fb34 100644 --- a/paddle/fluid/operators/meshgrid_op.cc +++ b/paddle/fluid/operators/meshgrid_op.cc @@ -16,10 +16,9 @@ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" - -#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/multiary.h" diff --git a/paddle/fluid/operators/metrics/CMakeLists.txt b/paddle/fluid/operators/metrics/CMakeLists.txt index 101939dde2c01..b968dbf288ee2 100644 --- a/paddle/fluid/operators/metrics/CMakeLists.txt +++ b/paddle/fluid/operators/metrics/CMakeLists.txt @@ -1,6 +1,6 @@ include(operators) if(WITH_UNITY_BUILD) - # Load Unity Build rules for operators in paddle/fluid/operators/metrics. - include(unity_build_rule.cmake) + # Load Unity Build rules for operators in paddle/fluid/operators/metrics. + include(unity_build_rule.cmake) endif() register_operators() diff --git a/paddle/fluid/operators/metrics/unity_build_rule.cmake b/paddle/fluid/operators/metrics/unity_build_rule.cmake index fcb690a7b6a85..58acbc3b1e62f 100644 --- a/paddle/fluid/operators/metrics/unity_build_rule.cmake +++ b/paddle/fluid/operators/metrics/unity_build_rule.cmake @@ -4,10 +4,5 @@ # Generally, the combination rules in this file do not need to be modified. # If there are some redefined error in compiling with the source file which # in combination rule, you can remove the source file from the following rules. -register_unity_group(cc - accuracy_op.cc - auc_op.cc - precision_recall_op.cc) -register_unity_group(cu - accuracy_op.cu - auc_op.cu) +register_unity_group(cc accuracy_op.cc auc_op.cc precision_recall_op.cc) +register_unity_group(cu accuracy_op.cu auc_op.cu) diff --git a/paddle/fluid/operators/miopen_lstm_cache.h b/paddle/fluid/operators/miopen_lstm_cache.h index c307218baa406..045f917de7016 100644 --- a/paddle/fluid/operators/miopen_lstm_cache.h +++ b/paddle/fluid/operators/miopen_lstm_cache.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/operators/miopen_rnn_cache.h b/paddle/fluid/operators/miopen_rnn_cache.h index 38cea39abd5de..438163cd77eaa 100644 --- a/paddle/fluid/operators/miopen_rnn_cache.h +++ b/paddle/fluid/operators/miopen_rnn_cache.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/operators/mkldnn/CMakeLists.txt b/paddle/fluid/operators/mkldnn/CMakeLists.txt index ce95ec560c25e..f40286ad5d8a2 100644 --- a/paddle/fluid/operators/mkldnn/CMakeLists.txt +++ b/paddle/fluid/operators/mkldnn/CMakeLists.txt @@ -1 +1,4 @@ -cc_library(mkldnn_axpy_handler SRCS axpy_handler.cc DEPS place device_context enforce) +cc_library( + mkldnn_axpy_handler + SRCS axpy_handler.cc + DEPS place device_context enforce) diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 393247644c2e8..db74b24b405ed 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -30,11 +30,11 @@ class MKLDNNDeviceContext; namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; using dnnl::memory; using dnnl::primitive; using dnnl::stream; +using framework::DataLayout; +using framework::Tensor; using platform::GetMKLDNNFormat; using platform::MKLDNNDeviceContext; using platform::to_void_cast; diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc index ee630fe186a24..80f74195d8e3c 100644 --- a/paddle/fluid/operators/mkldnn/axpy_handler.cc +++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/mkldnn/axpy_handler.h" + #include #include #include #include #include "dnnl.hpp" -#include "paddle/fluid/operators/mkldnn/axpy_handler.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/mkldnn_helper.h" @@ -82,7 +83,7 @@ static void naive_axpy(int n, T alpha, const T *x, T *y) { } } -} // anonnymouse namespace +} // namespace template class OneDNNAXPYHandler::Impl { diff --git a/paddle/fluid/operators/mkldnn/caching_tests.cmake b/paddle/fluid/operators/mkldnn/caching_tests.cmake index f48a5d822f8dc..49f08622265d0 100644 --- a/paddle/fluid/operators/mkldnn/caching_tests.cmake +++ b/paddle/fluid/operators/mkldnn/caching_tests.cmake @@ -1,6 +1,20 @@ -set(TEST_MKLDNN_CACHING_DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op conv_op im2col vol2col softmax scope device_context enforce) -if (WITH_GPU OR WITH_ROCM) +set(TEST_MKLDNN_CACHING_DEPS + op_registry + elementwise_mul_op + elementwise_add_op + activation_op + softmax_op + conv_op + im2col + vol2col + softmax + scope + device_context + enforce) +if(WITH_GPU OR WITH_ROCM) set(TEST_MKLDNN_CACHING_DEPS ${TEST_MKLDNN_CACHING_DEPS} depthwise_conv) endif() -cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS ${TEST_MKLDNN_CACHING_DEPS}) - +cc_test( + test_mkldnn_caching + SRCS mkldnn/test_mkldnn_caching.cc + DEPS ${TEST_MKLDNN_CACHING_DEPS}) diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 5095fa067193a..0881baa6f8eea 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/mkldnn_helper.h" @@ -21,13 +22,13 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; -using framework::LoDTensor; +using dnnl::concat; using dnnl::memory; using dnnl::primitive; -using dnnl::concat; using dnnl::stream; +using framework::DataLayout; +using framework::LoDTensor; +using framework::Tensor; using platform::to_void_cast; template diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index fba17d303f282..65092e059f4af 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -203,8 +203,9 @@ class ConvMKLDNNHandlerT dnnl::memory::desc src_md, weights_md; if (platform::is_int8()) { src_md = platform::MKLDNNMemDesc( - src_tz, framework::ToMKLDNNDataType( - framework::TransToProtoVarType(input->dtype())), + src_tz, + framework::ToMKLDNNDataType( + framework::TransToProtoVarType(input->dtype())), chosen_memory_format); weights_md = platform::MKLDNNMemDesc( weights_tz, dnnl::memory::data_type::s8, chosen_memory_format); @@ -459,13 +460,12 @@ class ConvMKLDNNHandlerT auto scale_weights_data = ctx.Attr>("Scale_weights"); bool is_multi_channel = scale_weights_data.size() > 1; bool has_activation = !ctx.Attr("fuse_activation").empty(); - float activation_scale = - force_fp32_output ? 1.0f : has_activation ? ctx.Attr("Scale_out") - : 1.0f; - auto scale_out_data = - force_fp32_output ? 1.0f : has_activation - ? 1.0f - : ctx.Attr("Scale_out"); + float activation_scale = force_fp32_output ? 1.0f + : has_activation ? ctx.Attr("Scale_out") + : 1.0f; + auto scale_out_data = force_fp32_output ? 1.0f + : has_activation ? 1.0f + : ctx.Attr("Scale_out"); float sum_scale = fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; int count = diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index 747e4603d7fe7..e507b2429b7d9 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -28,8 +28,8 @@ using dnnl::primitive; using dnnl::reorder; using platform::to_void_cast; using Tensor = framework::Tensor; -using framework::DataLayout; using dnnl::stream; +using framework::DataLayout; using platform::GetMKLDNNFormat; template diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc index 91dccbee0aef2..035add5fd834d 100644 --- a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc @@ -18,11 +18,11 @@ limitations under the License. */ namespace { -using paddle::framework::Tensor; -using phi::vectorize; -using paddle::framework::GradVarName; using paddle::framework::ExecutionContext; +using paddle::framework::GradVarName; +using paddle::framework::Tensor; using paddle::platform::MKLDNNDeviceContext; +using phi::vectorize; template class ExpandMKLDNNKernel : public paddle::framework::OpKernel { diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 4078d012fce90..5cbcad5d965a4 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -31,19 +31,19 @@ class MKLDNNDeviceContext; namespace paddle { namespace operators { +using dnnl::inner_product_forward; +using dnnl::memory; +using dnnl::primitive; +using dnnl::prop_kind; +using dnnl::stream; using framework::DataLayout; -using framework::Tensor; -using framework::LoDTensor; using framework::DDim; using framework::ExecutionContext; +using framework::LoDTensor; +using framework::Tensor; +using platform::GetMKLDNNFormat; using platform::MKLDNNDeviceContext; using platform::to_void_cast; -using platform::GetMKLDNNFormat; -using dnnl::memory; -using dnnl::inner_product_forward; -using dnnl::primitive; -using dnnl::stream; -using dnnl::prop_kind; template class FCPrimitiveFactory { diff --git a/paddle/fluid/operators/mkldnn/inplace_op_tests.cmake b/paddle/fluid/operators/mkldnn/inplace_op_tests.cmake index c03ce74df7d64..18893e22ec85b 100644 --- a/paddle/fluid/operators/mkldnn/inplace_op_tests.cmake +++ b/paddle/fluid/operators/mkldnn/inplace_op_tests.cmake @@ -1,2 +1,12 @@ -cc_test(test_mkldnn_op_inplace SRCS mkldnn/test_mkldnn_op_inplace.cc DEPS op_registry elementwise_add_op activation_op softmax_op softmax scope device_context enforce executor) - +cc_test( + test_mkldnn_op_inplace + SRCS mkldnn/test_mkldnn_op_inplace.cc + DEPS op_registry + elementwise_add_op + activation_op + softmax_op + softmax + scope + device_context + enforce + executor) diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index 37d6c07290312..a53a30b737dc4 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -19,12 +19,12 @@ namespace paddle { namespace operators { -using framework::DataLayout; using dnnl::memory; using dnnl::primitive; using dnnl::reorder; -using dnnl::stream; using dnnl::resampling_forward; +using dnnl::stream; +using framework::DataLayout; using platform::GetMKLDNNFormat; using platform::to_void_cast; @@ -114,9 +114,10 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { PADDLE_ENFORCE_GT(std::all_of(out_dims.begin(), out_dims.end(), [](int i) { return i > 0; }), - 0, platform::errors::InvalidArgument( - "out_d, out_h, out_w of Op(interpolate) " - "should be greater than 0.")); + 0, + platform::errors::InvalidArgument( + "out_d, out_h, out_w of Op(interpolate) " + "should be greater than 0.")); const std::vector nc_dims = {in_dims[0], in_dims[1]}; out_dims.insert(out_dims.begin(), nc_dims.begin(), nc_dims.end()); diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index e9abe84e67980..8921db6cbcef9 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -13,19 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h" + #include + #include "paddle/fluid/framework/convert_utils.h" using dnnl::memory; using dnnl::primitive; using paddle::framework::DataLayout; using paddle::framework::ExecutionContext; -using phi::vectorize; using paddle::platform::GetMKLDNNFormat; -using paddle::platform::MKLDNNFormatForSize; using paddle::platform::MKLDNNDeviceContext; +using paddle::platform::MKLDNNFormatForSize; using paddle::platform::MKLDNNGetDataType; using paddle::platform::to_void_cast; +using phi::vectorize; using Tensor = paddle::framework::Tensor; namespace { diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h index 583dcd04018b2..07cb2173a7ec5 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h @@ -22,8 +22,8 @@ limitations under the License. */ namespace paddle { namespace operators { -using platform::MKLDNNDeviceContext; using framework::ExecutionContext; +using platform::MKLDNNDeviceContext; using Tensor = framework::Tensor; template diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 6e7ba59cf1ad8..424faf30d3a9f 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -20,8 +20,8 @@ using dnnl::memory; using dnnl::primitive; using paddle::framework::DataLayout; using paddle::framework::ExecutionContext; -using paddle::platform::MatMulV2MKLDNNHandler; using paddle::platform::GetMKLDNNFormat; +using paddle::platform::MatMulV2MKLDNNHandler; using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNGetDataType; using paddle::platform::to_void_cast; @@ -206,11 +206,12 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ( x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] == 1 || y_bd_dims[i] == 1, - true, paddle::platform::errors::InvalidArgument( - "Tensor dimensions are incorrect for broadcasting." - "Dimensions in X and Y must be same or equal to 1, but " - "received x_dim[%d]=%d and y_dims[%d]= %d", - i, x_bd_dims[i], i, y_bd_dims[i])); + true, + paddle::platform::errors::InvalidArgument( + "Tensor dimensions are incorrect for broadcasting." + "Dimensions in X and Y must be same or equal to 1, but " + "received x_dim[%d]=%d and y_dims[%d]= %d", + i, x_bd_dims[i], i, y_bd_dims[i])); out_dims[i] = std::max(x_bd_dims[i], y_bd_dims[i]); } out->Resize(phi::make_ddim(out_dims)); diff --git a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake index 8bad3e86b2934..4c94bc3f3ad57 100644 --- a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake +++ b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake @@ -1 +1,14 @@ -cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op shape_op crop_op activation_op pooling transpose_op scope device_context enforce executor) +cc_test( + test_mkldnn_op_nhwc + SRCS mkldnn/test_mkldnn_op_nhwc.cc + DEPS op_registry + pool_op + shape_op + crop_op + activation_op + pooling + transpose_op + scope + device_context + enforce + executor) diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index 77763531c8296..dbf3adcdad07d 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -20,14 +20,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; using dnnl::memory; using dnnl::pooling_backward; using dnnl::pooling_forward; using dnnl::primitive; using dnnl::reorder; using dnnl::stream; +using framework::DataLayout; +using framework::Tensor; using platform::to_void_cast; template diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 8cbe46bee481a..8f3a3e8ba65e7 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -27,8 +27,8 @@ using dnnl::primitive; using dnnl::reorder; using platform::to_void_cast; using Tensor = framework::Tensor; -using framework::DataLayout; using dnnl::stream; +using framework::DataLayout; using platform::GetMKLDNNFormat; template diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index 9a7ac6d505522..778a33f27af0a 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -46,10 +46,12 @@ class ReQuantOpKernel : public framework::OpKernel { bool with_shift = shift_in != 0.0f || shift_out != 0.0f; auto* output = ctx.Output("Output"); - PADDLE_ENFORCE_NE(scale_in, 0.0f, platform::errors::InvalidArgument( - "Scale of input cannot be 0.0")); - PADDLE_ENFORCE_NE(scale_out, 0.0f, platform::errors::InvalidArgument( - "Scale of output cannot be 0.0")); + PADDLE_ENFORCE_NE( + scale_in, 0.0f, + platform::errors::InvalidArgument("Scale of input cannot be 0.0")); + PADDLE_ENFORCE_NE( + scale_out, 0.0f, + platform::errors::InvalidArgument("Scale of output cannot be 0.0")); if (shift_in != 0.0f) { PADDLE_ENFORCE_EQ( framework::TransToProtoVarType(input->dtype()), diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index a21034d48baaa..f1c5153240ee2 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -31,8 +31,8 @@ namespace paddle { namespace operators { using paddle::framework::LoDTensor; -using platform::to_void_cast; using platform::GetMKLDNNFormat; +using platform::to_void_cast; static std::vector extract_shape( const std::vector& list_new_shape_tensor) { diff --git a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc index 28a00be5fa47e..798fe51901df0 100644 --- a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc @@ -17,13 +17,13 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; -using framework::LoDTensor; +using dnnl::concat; using dnnl::memory; using dnnl::primitive; -using dnnl::concat; using dnnl::stream; +using framework::DataLayout; +using framework::LoDTensor; +using framework::Tensor; using platform::to_void_cast; template diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index de21c2687bd44..b564602fdaada 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -116,8 +116,9 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { const auto& mkldnn_engine = dev_ctx.GetEngine(); auto in_vars = ctx.MultiInputVar("X"); - PADDLE_ENFORCE_NE(in_vars.empty(), true, platform::errors::InvalidArgument( - "Input variable is empty.")); + PADDLE_ENFORCE_NE( + in_vars.empty(), true, + platform::errors::InvalidArgument("Input variable is empty.")); auto& input0 = in_vars[0]->Get(); LoDTensor* output = ctx.Output("Out"); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index b5fb0c54c7812..1e04cc8a8a525 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -16,6 +16,7 @@ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -121,8 +122,9 @@ void RunOperator(const platform::Place &place, const std::string &op_type, auto op = num_inputs[op_type] > 1 ? framework::OpRegistry::CreateOp( - op_type, {{first_input_var_name, {first_input}}, - {second_input_var_name, {"x1"}}}, + op_type, + {{first_input_var_name, {first_input}}, + {second_input_var_name, {"x1"}}}, {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}}) : framework::OpRegistry::CreateOp( op_type, {{first_input_var_name, {first_input}}}, diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index 4090d5ffca801..a1acf3706c590 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -16,6 +16,7 @@ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index b9866ba8c3647..f4b79a0216332 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -16,6 +16,7 @@ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index ee9922773147c..13f9dba9eeb8f 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -66,7 +66,7 @@ class TransposeMKLDNNHandler { protected: dnnl::memory::desc Axis2MemoryDesc(std::vector& nchw_tz, // NOLINT std::vector& axis // NOLINT - ) { + ) { size_t ndims = axis.size(); std::vector strides(ndims); diff --git a/paddle/fluid/operators/mlu/CMakeLists.txt b/paddle/fluid/operators/mlu/CMakeLists.txt index efd6aeb8eeb1c..c383edecaac91 100644 --- a/paddle/fluid/operators/mlu/CMakeLists.txt +++ b/paddle/fluid/operators/mlu/CMakeLists.txt @@ -1,5 +1,10 @@ - -IF(WITH_MLU) - cc_library(mlu_baseop SRCS mlu_baseop.cc DEPS neuware_lib device_context) - cc_test(activation_op_mlu_test SRCS activation_op_mlu_test.cc DEPS op_registry activation_op scope device_context executor) -ENDIF() +if(WITH_MLU) + cc_library( + mlu_baseop + SRCS mlu_baseop.cc + DEPS neuware_lib device_context) + cc_test( + activation_op_mlu_test + SRCS activation_op_mlu_test.cc + DEPS op_registry activation_op scope device_context executor) +endif() diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 9d3b8e2407fbf..1ff27454013e1 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/mlu/mlu_baseop.h" + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc index 9c16ccb138f7d..d946f177545b4 100644 --- a/paddle/fluid/operators/mode_op.cc +++ b/paddle/fluid/operators/mode_op.cc @@ -13,10 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" - -#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index 9a53c7162ff6d..4216ee097be52 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/modified_huber_loss_op.h" + #include namespace paddle { @@ -29,10 +30,11 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::InvalidArgument( - "Input(input) rank should be 2, " - "but received input rank(%d) != 2", - x_dims.size())); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2, + platform::errors::InvalidArgument("Input(input) rank should be 2, " + "but received input rank(%d) != 2", + x_dims.size())); if (ctx->IsRuntime() || (phi::product(x_dims) > 0 && phi::product(y_dims) > 0)) { diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu index 8f1894b5af0a1..ad34a54a9bf29 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cu +++ b/paddle/fluid/operators/modified_huber_loss_op.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/modified_huber_loss_op.h" #include "paddle/phi/core/hostdevice.h" diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index ef04d5582d3c0..b31935cefc235 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" diff --git a/paddle/fluid/operators/mul_op_xpu.cc b/paddle/fluid/operators/mul_op_xpu.cc index 7410b3b607c82..9f52dc8559d42 100644 --- a/paddle/fluid/operators/mul_op_xpu.cc +++ b/paddle/fluid/operators/mul_op_xpu.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/xpu_api_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h" diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 4e6ad35e612b7..72243b408f4be 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" - #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/multiary.h" diff --git a/paddle/fluid/operators/nanmedian_op.cc b/paddle/fluid/operators/nanmedian_op.cc index 23a497bdb1d3d..63bfea650ac00 100644 --- a/paddle/fluid/operators/nanmedian_op.cc +++ b/paddle/fluid/operators/nanmedian_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt index b3d53f0d39020..218d53aa6303a 100644 --- a/paddle/fluid/operators/nccl/CMakeLists.txt +++ b/paddle/fluid/operators/nccl/CMakeLists.txt @@ -1,24 +1,38 @@ -if (NOT (WITH_NCCL OR WITH_RCCL)) +if(NOT (WITH_NCCL OR WITH_RCCL)) return() endif() if(WITH_GPU AND NOT WIN32) - nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator ) + nv_library( + nccl_common + SRCS nccl_gpu_common.cc + DEPS device_context operator) endif() if(WITH_ROCM AND NOT WIN32) - hip_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator ) + hip_library( + nccl_common + SRCS nccl_gpu_common.cc + DEPS device_context operator) endif() if(WITH_GPU OR WITH_ROCM) - op_library(nccl_op DEPS nccl_common) - set(OPERATOR_DEPS ${OPERATOR_DEPS} nccl_common PARENT_SCOPE) + op_library(nccl_op DEPS nccl_common) + set(OPERATOR_DEPS + ${OPERATOR_DEPS} nccl_common + PARENT_SCOPE) endif() if(WITH_GPU AND NOT WIN32) - nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) + nv_test( + nccl_op_test + SRCS nccl_op_test.cu.cc + DEPS nccl_op gpu_info device_context) endif() if(WITH_ROCM AND NOT WIN32) - hip_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) + hip_test( + nccl_op_test + SRCS nccl_op_test.cu.cc + DEPS nccl_op gpu_info device_context) endif() diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc index bcbc96ea1b6d1..8a0112fa11d80 100644 --- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc @@ -25,7 +25,7 @@ size_t last_num_gpus = -1; // TODO(panyx0718): Need to decide whether Paddle supports parallel // runs with different number GPUs. If true, current solution is not enough. std::mutex comm_mu; -} +} // namespace int Communicator::GetCommId(int device_id) const { std::lock_guard guard(comm_mu); diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index 65c3447ff23ee..b99800ecd64be 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -19,9 +19,9 @@ limitations under the License. */ namespace paddle { namespace operators { +using framework::LoDTensor; using framework::Tensor; using platform::Communicator; -using framework::LoDTensor; template class NCCLTypeWrapper; diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc index 80144c6f25894..21649bfcd378f 100644 --- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include #include // NOLINT #include // NOLINT diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index c8af241559429..38c9b809eb6e4 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -15,11 +15,13 @@ limitations under the License. */ #pragma once #include + #include #include #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows_utils.h" diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc index 8f14bc10d5094..d3cbec495fdb5 100644 --- a/paddle/fluid/operators/nll_loss_op.cc +++ b/paddle/fluid/operators/nll_loss_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/backward.h" diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 51daccce0e882..0a1f647627a9a 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 0ed1f2719de25..18ae152a689e8 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -450,27 +450,27 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, set_constant(ctx, dX, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDXWithGlobal< - T, DataLayout::kNHWC><<>>( - dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, - dx_data); + DoubleGradComputeDXWithGlobal + <<>>(dy_data, ddscale_data, + variance_data, epsilon, C, + sample_size, num, dx_data); } else { - DoubleGradComputeDXWithGlobal< - T, DataLayout::kNCHW><<>>( - dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, - dx_data); + DoubleGradComputeDXWithGlobal + <<>>(dy_data, ddscale_data, + variance_data, epsilon, C, + sample_size, num, dx_data); } } else { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDX< - T, block, DataLayout::kNHWC><<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, - ddscale_data, N, C, sample_size, epsilon, dx_data); + DoubleGradComputeDX + <<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, + ddscale_data, N, C, sample_size, epsilon, dx_data); } else { - DoubleGradComputeDX< - T, block, DataLayout::kNCHW><<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, - ddscale_data, N, C, sample_size, epsilon, dx_data); + DoubleGradComputeDX + <<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, + ddscale_data, N, C, sample_size, epsilon, dx_data); } } } @@ -479,27 +479,27 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, set_constant(ctx, dScale, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNHWC><<>>( - ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, - dscale_data); + DoubleGradComputeDScaleWithGlobal + <<>>(ddx_data, variance_data, dy_data, + epsilon, N, C, sample_size, + dscale_data); } else { - DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNCHW><<>>( - ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, - dscale_data); + DoubleGradComputeDScaleWithGlobal + <<>>(ddx_data, variance_data, dy_data, + epsilon, N, C, sample_size, + dscale_data); } } else { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDScale< - T, block, DataLayout::kNHWC><<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, N, C, - sample_size, epsilon, dscale_data); + DoubleGradComputeDScale + <<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, N, C, + sample_size, epsilon, dscale_data); } else { - DoubleGradComputeDScale< - T, block, DataLayout::kNCHW><<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, N, C, - sample_size, epsilon, dscale_data); + DoubleGradComputeDScale + <<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, N, C, + sample_size, epsilon, dscale_data); } } } @@ -508,27 +508,29 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, set_constant(ctx, ddY, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNHWC><<>>( - ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, - ddscale_data, epsilon, C, sample_size, num, ddy_data); + DoubleGradComputeDDYWithGlobal + <<>>( + ddx_data, scale_data, mean_data, variance_data, x_data, + ddbias_data, ddscale_data, epsilon, C, sample_size, num, + ddy_data); } else { - DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNCHW><<>>( - ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, - ddscale_data, epsilon, C, sample_size, num, ddy_data); + DoubleGradComputeDDYWithGlobal + <<>>( + ddx_data, scale_data, mean_data, variance_data, x_data, + ddbias_data, ddscale_data, epsilon, C, sample_size, num, + ddy_data); } } else { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDDY< - T, block, DataLayout::kNHWC><<>>( - x_data, mean_data, variance_data, ddscale_data, ddbias_data, - ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); + DoubleGradComputeDDY + <<>>( + x_data, mean_data, variance_data, ddscale_data, ddbias_data, + ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } else { - DoubleGradComputeDDY< - T, block, DataLayout::kNCHW><<>>( - x_data, mean_data, variance_data, ddscale_data, ddbias_data, - ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); + DoubleGradComputeDDY + <<>>( + x_data, mean_data, variance_data, ddscale_data, ddbias_data, + ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } } } diff --git a/paddle/fluid/operators/norm_utils.h b/paddle/fluid/operators/norm_utils.h index fee06fe5dd4fa..363702459221d 100644 --- a/paddle/fluid/operators/norm_utils.h +++ b/paddle/fluid/operators/norm_utils.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu index 923d89c24853f..2fc180fe678e9 100644 --- a/paddle/fluid/operators/number_count_op.cu +++ b/paddle/fluid/operators/number_count_op.cu @@ -97,13 +97,13 @@ class NumberCountOpCUDAKernel : public framework::OpKernel { auto out_data = number_count->mutable_data(out_dims, place); const T* gate_data = numbers->data(); - initialize_zero_kernel< - T><<>>( - out_data, upper_range); + initialize_zero_kernel + <<>>( + out_data, upper_range); - NumberCount< - T><<>>( - gate_data, out_data, batch_size, upper_range); + NumberCount + <<>>( + gate_data, out_data, batch_size, upper_range); } }; diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc index 64323e588c628..e6b6320898fb1 100644 --- a/paddle/fluid/operators/one_hot_op.cc +++ b/paddle/fluid/operators/one_hot_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/one_hot_op.h" + #include #include diff --git a/paddle/fluid/operators/one_hot_op_npu.cc b/paddle/fluid/operators/one_hot_op_npu.cc index 24b506ebf8a06..4e11cbb38883b 100644 --- a/paddle/fluid/operators/one_hot_op_npu.cc +++ b/paddle/fluid/operators/one_hot_op_npu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/one_hot_op.h" - #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc index 122b6a8a80aac..cb7b9963bbdf3 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cc +++ b/paddle/fluid/operators/one_hot_v2_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc index e5702a37bb2b4..dcf098f105c21 100644 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_npu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" - #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/CMakeLists.txt b/paddle/fluid/operators/optimizers/CMakeLists.txt index 6989447fc04fd..7a27dda735c4a 100644 --- a/paddle/fluid/operators/optimizers/CMakeLists.txt +++ b/paddle/fluid/operators/optimizers/CMakeLists.txt @@ -1,6 +1,6 @@ include(operators) if(WITH_UNITY_BUILD) - # Load Unity Build rules for operators in paddle/fluid/operators/optimizers. - include(unity_build_rule.cmake) + # Load Unity Build rules for operators in paddle/fluid/operators/optimizers. + include(unity_build_rule.cmake) endif() register_operators() diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc index 91bad1430615f..64f22cced3baf 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/adagrad_op.cc @@ -15,13 +15,12 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/multiary.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index 1ea91f6ebfa3e..e13805f694bd6 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -183,16 +183,25 @@ class AdamNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context() .stream(); - const auto& runner = - NpuOpRunner("ApplyAdamD", - { - *param, *mom1, *mom2, *beta1_pow, *beta2_pow, *lr, - *beta1_tensor, *beta2_tensor, *epsilon_tensor, *grad, - }, - { - *param_out, *mom1_out, *mom2_out, - }, - {}); + const auto& runner = NpuOpRunner("ApplyAdamD", + { + *param, + *mom1, + *mom2, + *beta1_pow, + *beta2_pow, + *lr, + *beta1_tensor, + *beta2_tensor, + *epsilon_tensor, + *grad, + }, + { + *param_out, + *mom1_out, + *mom2_out, + }, + {}); runner.Run(stream); // NOTE(zhiqiu): ApplyAdamD updates params inplace, so diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index 6ea0b2054cdea..37467c7ba9614 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -306,8 +306,9 @@ class AdamOpXPUKernel : public framework::OpKernel { } xpu_wait(dev_ctx.x_context()->xpu_stream); } else { - PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument( - "Variable type not supported by adam_op")); + PADDLE_ENFORCE_EQ(1, 2, + platform::errors::InvalidArgument( + "Variable type not supported by adam_op")); } } }; diff --git a/paddle/fluid/operators/optimizers/adamw_op.cc b/paddle/fluid/operators/optimizers/adamw_op.cc index e2670625d4e50..43e9dc0cae8ef 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.cc +++ b/paddle/fluid/operators/optimizers/adamw_op.cc @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/optimizers/adam_op.h" - #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/operators/optimizers/adam_op.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/multiary.h" diff --git a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc index d86d2bd2ffb4a..57a6b744fd6db 100644 --- a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc @@ -205,8 +205,9 @@ class AdamwOpXPUKernel : public framework::OpKernel { } } } else { - PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument( - "Variable type not supported by adamw_op")); + PADDLE_ENFORCE_EQ(1, 2, + platform::errors::InvalidArgument( + "Variable type not supported by adamw_op")); } } }; diff --git a/paddle/fluid/operators/optimizers/cast_with_ptr.h b/paddle/fluid/operators/optimizers/cast_with_ptr.h index a3fbb0e59e24e..eb031ae0c933a 100644 --- a/paddle/fluid/operators/optimizers/cast_with_ptr.h +++ b/paddle/fluid/operators/optimizers/cast_with_ptr.h @@ -43,9 +43,9 @@ static void VecCastKernel(const platform::CUDADeviceContext &ctx, const InT *x, in_arr[0] = reinterpret_cast(x); phi::Array<_ptr_ OutT *, 1> out_arr; out_arr[0] = y; - phi::funcs::VectorizedElementwiseKernel< - OutT, FunctorT, 1, 1, VecSize><<>>( - in_arr, out_arr, n, main_offset, FunctorT()); + phi::funcs::VectorizedElementwiseKernel + <<>>(in_arr, out_arr, n, main_offset, + FunctorT()); } } // namespace details diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc index 7f0b2b7d064ed..40ac044e6475e 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include - #include "paddle/fluid/operators/optimizers/dgc_momentum_op.h" +#include + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 3688b8067c231..7cbc52f4235de 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -83,10 +83,12 @@ static void GetParamGradShardInfo(const std::vector &infos, VLOG(10) << "start_size = " << start_size << " , end_size = " << end_size; if (infos.empty()) { - PADDLE_ENFORCE_EQ(start_size, 0, platform::errors::InvalidArgument( - "start_size should be 0.")); - PADDLE_ENFORCE_EQ(end_size, 0, platform::errors::InvalidArgument( - "end_size should be 0.")); + PADDLE_ENFORCE_EQ( + start_size, 0, + platform::errors::InvalidArgument("start_size should be 0.")); + PADDLE_ENFORCE_EQ( + end_size, 0, + platform::errors::InvalidArgument("end_size should be 0.")); *start_idx = 0; *end_idx = 0; *start_numel_offset = 0; @@ -104,15 +106,17 @@ static void GetParamGradShardInfo(const std::vector &infos, infos.begin()); if (i == n || infos[i].numel_offset != start_size) { PADDLE_ENFORCE_GT( - i, 0, platform::errors::InvalidArgument( - "Cannot find suitable sharding which is between [%d, %d)", - start_size, end_size)); + i, 0, + platform::errors::InvalidArgument( + "Cannot find suitable sharding which is between [%d, %d)", + start_size, end_size)); --i; } PADDLE_ENFORCE_LT( - i, n, platform::errors::InvalidArgument( - "Cannot find suitable sharding which is between [%d, %d)", - start_size, end_size)); + i, n, + platform::errors::InvalidArgument( + "Cannot find suitable sharding which is between [%d, %d)", start_size, + end_size)); *start_idx = i; *start_numel_offset = start_size - infos[i].numel_offset; auto j = static_cast( @@ -450,8 +454,9 @@ class DistributedFusedLambInitOpKernel platform::errors::InvalidArgument( "The attr(alignment) should be the power of 2.")); PADDLE_ENFORCE_GE( - rank, 0, platform::errors::InvalidArgument( - "The attr(rank) should be equal to or larger than 0.")); + rank, 0, + platform::errors::InvalidArgument( + "The attr(rank) should be equal to or larger than 0.")); PADDLE_ENFORCE_LT( rank, nranks, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index c857c6de4d093..eb354ef6d7576 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/memory/buffer.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/optimizers/cast_with_ptr.h" @@ -32,6 +33,7 @@ #ifdef __HIPCC__ #include + #include "math.h" // NOLINT namespace cub = hipcub; #endif @@ -190,9 +192,8 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL); #undef PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL - MultiTensorL2NormReduceAgainCUDAKernel< - MT, OutT, kBlockDim><<>>(tmp_out_ptr, y, - max_chunk_num); + MultiTensorL2NormReduceAgainCUDAKernel + <<>>(tmp_out_ptr, y, max_chunk_num); } template @@ -508,14 +509,14 @@ static void MultiTensorUpdateLambMomentAndTrustRatioDiv( "Output(Step) cannot be nullptr.")); } -#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL \ - do { \ - UpdateLambMomentAndTrustRatioDivCUDAKernel<<< \ - config.block_per_grid, config.thread_per_block, 0, stream>>>( \ - param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p, \ - beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, step, \ - weight_decay, weight_decay_end_numel, beta1, beta2, epsilon, \ - max_global_grad_norm, numel, rescale_grad); \ +#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL \ + do { \ + UpdateLambMomentAndTrustRatioDivCUDAKernel \ + <<>>( \ + param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p, \ + beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, step, \ + weight_decay, weight_decay_end_numel, beta1, beta2, epsilon, \ + max_global_grad_norm, numel, rescale_grad); \ } while (0) PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL); @@ -705,8 +706,9 @@ static void MultiTensorUpdateLambParamAndBetaPows( PADDLE_ENFORCE_NOT_NULL(beta2pow, platform::errors::InvalidArgument( "Beta2Pow should not be nullptr.")); } else { - PADDLE_ENFORCE_EQ(beta2pow, nullptr, platform::errors::InvalidArgument( - "Beta2Pow should be nullptr.")); + PADDLE_ENFORCE_EQ( + beta2pow, nullptr, + platform::errors::InvalidArgument("Beta2Pow should be nullptr.")); } const int block_dim = 512; @@ -744,21 +746,21 @@ static void MultiTensorUpdateLambParamAndBetaPows( betapow_helper); \ } while (0) -#define PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE \ - do { \ - auto callback = [&]( \ - const MultiTensorLauncher &launcher, \ - int launch_n) { \ - if (has_beta_pow && launch_n == 0) { \ - PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(true); \ - beta1pow = nullptr; \ - beta2pow = nullptr; \ - } else { \ - PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(false); \ - } \ - }; \ - MultiTensorApplyWithCallback( \ - stream, offsets, n, chunk_size, block_dim, callback); \ +#define PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE \ + do { \ + auto callback = \ + [&](const MultiTensorLauncher &launcher, \ + int launch_n) { \ + if (has_beta_pow && launch_n == 0) { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(true); \ + beta1pow = nullptr; \ + beta2pow = nullptr; \ + } else { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(false); \ + } \ + }; \ + MultiTensorApplyWithCallback( \ + stream, offsets, n, chunk_size, block_dim, callback); \ } while (0) PD_VEC_LAUNCH_KERNEL(vec_size, @@ -793,11 +795,11 @@ static void LaunchScaleKernel(const platform::CUDADeviceContext &dev_ctx, int vec_size = std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0)); auto config = platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size); -#define PD_LAMB_VEC_SCALE_KERNEL_CASE \ - do { \ - ScaleCUDAKernel<<>>( \ - x, scale, y, n); \ +#define PD_LAMB_VEC_SCALE_KERNEL_CASE \ + do { \ + ScaleCUDAKernel \ + <<>>( \ + x, scale, y, n); \ } while (0) PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAMB_VEC_SCALE_KERNEL_CASE); @@ -1015,7 +1017,7 @@ static void CheckHasNanInfGrad(const float *fp32_grad, int fp32_numel, if (fp32_numel > 0) { fp32_has_nan_inf = reinterpret_cast(nan_inf_flag + 1); cub::TransformInputIterator, const float *> - iter(fp32_grad, IsNanInfFunctor()); + iter(fp32_grad, IsNanInfFunctor()); CubDeviceReduce(iter, fp32_has_nan_inf, fp32_numel, OrFunctor(), false, stream, cub_tmp_buffer); } @@ -1082,11 +1084,11 @@ static void LaunchElementwiseAddWithCastKernel( GetChunkedVecSize(z, 0)); auto config = platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size); -#define PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL \ - do { \ - ElementwiseAddWithCastCUDAKernel<<< \ - config.block_per_grid, config.thread_per_block, 0, stream>>>(x, y, z, \ - n); \ +#define PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL \ + do { \ + ElementwiseAddWithCastCUDAKernel \ + <<>>(x, y, \ + z, n); \ } while (0) PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL); @@ -1445,10 +1447,10 @@ class DistributedFusedLambOpKernel if (is_grad_scaled_by_nranks) { clip_scale *= num_devices; } - CalcGradNormClipBeforeAllReduceScale< - float, platform::float16><<<1, 1, 0, stream>>>( - global_scale, max_global_grad_norm, fp32_square_grad_norm, - fp32_scale, fp16_scale, clip_scale); + CalcGradNormClipBeforeAllReduceScale + <<<1, 1, 0, stream>>>(global_scale, max_global_grad_norm, + fp32_square_grad_norm, fp32_scale, fp16_scale, + clip_scale); if (fp32_scale) { VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); } else { @@ -1567,11 +1569,12 @@ class DistributedFusedLambOpKernel fp16_partial_fused_offsets, fp16_local_param_num, param_square_norm + fp16_local_start_idx); } else { - MultiTensorL2Norm( - place, stream, fp16_param + fused_offsets[fp16_local_start_idx] - - fused_offsets[fp32_global_param_num], - fused_offsets + fp16_local_start_idx, fp16_local_param_num, - param_square_norm + fp16_local_start_idx); + MultiTensorL2Norm(place, stream, + fp16_param + fused_offsets[fp16_local_start_idx] - + fused_offsets[fp32_global_param_num], + fused_offsets + fp16_local_start_idx, + fp16_local_param_num, + param_square_norm + fp16_local_start_idx); } MultiTensorL2Norm(place, stream, trust_ratio_div, diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.h b/paddle/fluid/operators/optimizers/dpsgd_op.h index 688a7f1ad8435..69a853c5d1846 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.h +++ b/paddle/fluid/operators/optimizers/dpsgd_op.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include #include + #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -108,9 +110,8 @@ class DpsgdOpKernel : public framework::OpKernel { // update parameters for (int64_t i = 0; i < grad->numel(); ++i) { - out_data[i] = - param_data[i] - - lr[0] * (grad_data[i] / scale + gaussian_noise / batch_size); + out_data[i] = param_data[i] - lr[0] * (grad_data[i] / scale + + gaussian_noise / batch_size); } // CCS16 - Deep Learning with Differential Privacy. // [https://arxiv.org/abs/1607.00133] diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index 596ed05df3ffd..73fd7ceb67b0e 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -74,9 +74,8 @@ class SparseFTRLFunctor { l_acc_out_[j] += g - (std::sqrt(new_acc) - std::sqrt(s_acc)) / lr * p; } else { l_acc_out_[j] += - g - - (std::pow(new_acc, -lr_power_) - std::pow(s_acc, -lr_power_)) / lr * - p; + g - (std::pow(new_acc, -lr_power_) - std::pow(s_acc, -lr_power_)) / + lr * p; } auto l_acc = l_acc_out_[j]; diff --git a/paddle/fluid/operators/optimizers/lamb_op.cc b/paddle/fluid/operators/optimizers/lamb_op.cc index 48ceba3695f83..fb2a78d28edfc 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.cc +++ b/paddle/fluid/operators/optimizers/lamb_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/lamb_op.h" + #include + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -227,13 +229,12 @@ REGISTER_OP_CPU_KERNEL( ops::LambOpKernel); /* ========================== register checkpoint ===========================*/ -REGISTER_OP_VERSION(lamb) - .AddCheckpoint( - R"ROC(Upgrade lamb, add two new outputs [Beta1PowOut] and [Beta2PowOut].)ROC", - paddle::framework::compatible::OpVersionDesc() - .NewInput("Beta1PowOut", - "The Output beta1 power accumulator. 'Beta1PowOut' is " - "dispensable.") - .NewInput("Beta2PowOut", - "The Output beta2 power accumulator. 'Beta2PowOut' is " - "dispensable.")); +REGISTER_OP_VERSION(lamb).AddCheckpoint( + R"ROC(Upgrade lamb, add two new outputs [Beta1PowOut] and [Beta2PowOut].)ROC", + paddle::framework::compatible::OpVersionDesc() + .NewInput("Beta1PowOut", + "The Output beta1 power accumulator. 'Beta1PowOut' is " + "dispensable.") + .NewInput("Beta2PowOut", + "The Output beta2 power accumulator. 'Beta2PowOut' is " + "dispensable.")); diff --git a/paddle/fluid/operators/optimizers/lamb_op.cu b/paddle/fluid/operators/optimizers/lamb_op.cu index b46fa19ea1352..a9f880fdbb67d 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.cu +++ b/paddle/fluid/operators/optimizers/lamb_op.cu @@ -16,7 +16,8 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - lamb, ops::LambOpKernel, + lamb, + ops::LambOpKernel, ops::LambOpKernel, ops::LambOpKernel); diff --git a/paddle/fluid/operators/optimizers/lamb_op.h b/paddle/fluid/operators/optimizers/lamb_op.h index 45acf2b3e4834..2956ff204679e 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.h +++ b/paddle/fluid/operators/optimizers/lamb_op.h @@ -14,8 +14,10 @@ limitations under the License. */ #pragma once #include // for sqrt in CPU and CUDA + #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/buffer.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" @@ -655,10 +657,10 @@ class LambOpKernel : public framework::OpKernel { // TODO(zengjinle): remove the following Eigen operations when // *skip_update == true. memory::Buffer buffer(dev_ctx.GetPlace()); - math::SquaredL2Norm( - dev_ctx, reinterpret_cast(IsMultiPrecision ? master_param_ptr - : param_ptr), - p_norm_ptr, numel, &buffer); + math::SquaredL2Norm(dev_ctx, + reinterpret_cast( + IsMultiPrecision ? master_param_ptr : param_ptr), + p_norm_ptr, numel, &buffer); math::SquaredL2Norm(dev_ctx, trust_ratio_div_ptr, trust_ratio_div_norm_ptr, numel, &buffer); @@ -675,12 +677,12 @@ class LambOpKernel : public framework::OpKernel { #define CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(__should_update_beta_pow) \ do { \ LambParamUpateFunctor \ - param_update_functor( \ - lr.template data(), static_cast(param_ptr), \ - static_cast(master_param_ptr), p_norm_ptr, \ - trust_ratio_div_ptr, trust_ratio_div_norm_ptr, \ - static_cast(param_out_ptr), \ - static_cast(master_param_out_ptr), skip_update_flag); \ + param_update_functor( \ + lr.template data(), static_cast(param_ptr), \ + static_cast(master_param_ptr), p_norm_ptr, \ + trust_ratio_div_ptr, trust_ratio_div_norm_ptr, \ + static_cast(param_out_ptr), \ + static_cast(master_param_out_ptr), skip_update_flag); \ if (__should_update_beta_pow) { \ param_update_functor.SetBetaPows(beta1_pow_ptr, beta2_pow_ptr, \ beta1_pow_out_ptr, beta2_pow_out_ptr, \ diff --git a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc index 7aa5783a01bfd..ef224382cd091 100644 --- a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/optimizers/lamb_op.h" #include "gflags/gflags.h" +#include "paddle/fluid/operators/optimizers/lamb_op.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index 5b883a11e5733..553ac69edcac7 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -129,8 +129,9 @@ __device__ inline void VectorizeLarsUpdate( for (int i = tid + tail_offset; i < numel; i += grid_stride) { MT grad_val = static_cast(grad[i]) * rescale_grad; MT param_val = param[i]; - MT velocity_tmp = Fma(velocity[i], mu, local_lr * Fma(lars_weight_decay, - param_val, grad_val)); + MT velocity_tmp = + Fma(velocity[i], mu, + local_lr * Fma(lars_weight_decay, param_val, grad_val)); MT param_tmp = param_val - velocity_tmp; param_out[i] = static_cast(param_tmp); velocity_out[i] = velocity_tmp; @@ -314,10 +315,10 @@ inline void SeparatedLarsMomentumOpCUDAKernel( const MT rescale_grad, const int64_t numel, const MT* master_param_data, MT* master_out_data, const bool is_amp) { LarsThreadConfig lars_thread_config(numel); - L2NormKernel<<>>( - param_data, grad_data, p_buffer, g_buffer, numel, - lars_thread_config.repeat_times, rescale_grad); + L2NormKernel + <<>>(param_data, grad_data, p_buffer, g_buffer, numel, + lars_thread_config.repeat_times, rescale_grad); MomentumLarsKernel<<>>( diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc index 54ead6d3df7f0..280c0930e91d5 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc index 5fad5eca9affc..d405500d60768 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/operators/optimizers/merged_momentum_op.h" - #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/impl/momentum_kernel_impl.h" @@ -151,10 +150,11 @@ class NPUMergedMomentumOpKernel : public framework::OpKernel { framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out); framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out); // NOTE: ApplyMomentum will change the input - const auto& runner = NpuOpRunner( - "ApplyMomentum", {*param_out, *velocity_out, *learning_rate, - regularized_grad, mu_tensor}, - {*param_out}, {{"use_nesterov", use_nesterov}}); + const auto& runner = + NpuOpRunner("ApplyMomentum", + {*param_out, *velocity_out, *learning_rate, + regularized_grad, mu_tensor}, + {*param_out}, {{"use_nesterov", use_nesterov}}); runner.Run(dev_ctx.stream()); } } diff --git a/paddle/fluid/operators/optimizers/momentum_op.cc b/paddle/fluid/operators/optimizers/momentum_op.cc index 50d2c946f3afe..94fb4c156ef5f 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.cc +++ b/paddle/fluid/operators/optimizers/momentum_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/momentum_op.h" + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -109,28 +110,26 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, ops::MomentumOpInferVarType); -REGISTER_OP_VERSION(momentum) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(momentum).AddCheckpoint( + R"ROC( Upgrade momentum add 4 attributes [regularization_method, regularization_coeff, multi_precision, rescale_grad]. )ROC", - paddle::framework::compatible::OpVersionDesc() - .NewInput("MasterParam", "FP32 master weight for AMP.") - .NewOutput("MasterParamOut", - "The updated FP32 master weight for AMP. " - "It shared memory with Input(MasterParam).") - .NewAttr("regularization_method", - "(string) regularization_method, right now only support " - "l2decay or none", - std::string("")) - .NewAttr("regularization_coeff", "(float) regularization_coeff", - 0.0f) - .NewAttr( - "multi_precision", - "(bool) Whether to use multi-precision during weight updating.", - false) - .NewAttr("rescale_grad", - "(float) Multiply the gradient with `rescale_grad`" - "before updating. Often choose to be `1.0/batch_size`.", - 1.0f)); + paddle::framework::compatible::OpVersionDesc() + .NewInput("MasterParam", "FP32 master weight for AMP.") + .NewOutput("MasterParamOut", + "The updated FP32 master weight for AMP. " + "It shared memory with Input(MasterParam).") + .NewAttr("regularization_method", + "(string) regularization_method, right now only support " + "l2decay or none", + std::string("")) + .NewAttr("regularization_coeff", "(float) regularization_coeff", 0.0f) + .NewAttr( + "multi_precision", + "(bool) Whether to use multi-precision during weight updating.", + false) + .NewAttr("rescale_grad", + "(float) Multiply the gradient with `rescale_grad`" + "before updating. Often choose to be `1.0/batch_size`.", + 1.0f)); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 017f33d7458fc..2f6a9758a2cf5 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" diff --git a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc index b8fa81b2e7123..417f89410cf88 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/optimizers/momentum_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/optimizers/momentum_op.h" #include "paddle/phi/kernels/impl/momentum_kernel_impl.h" namespace paddle { @@ -77,8 +77,9 @@ class MLUMomentumOpKernel : public framework::OpKernel { GetBasePtr(learning_rate), GetBasePtr(&mu_tensor), GetBasePtr(param_out), GetBasePtr(velocity_out)); } else if (grad_var->IsType()) { - PADDLE_ENFORCE_EQ(false, true, platform::errors::PermissionDenied( - "Unsupport SparseMomentum")); + PADDLE_ENFORCE_EQ( + false, true, + platform::errors::PermissionDenied("Unsupport SparseMomentum")); } else { PADDLE_ENFORCE_EQ(false, true, platform::errors::PermissionDenied( diff --git a/paddle/fluid/operators/optimizers/momentum_op_npu.cc b/paddle/fluid/operators/optimizers/momentum_op_npu.cc index 2d73766b97364..d3ffeb18be7b9 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_npu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_npu.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/momentum_op.h" - #include "paddle/fluid/operators/optimizers/sgd_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/impl/momentum_kernel_impl.h" @@ -70,14 +69,16 @@ class NPUMomentumOpKernel : public framework::OpKernel { framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out); framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out); // NOTE: ApplyMomentum will change the input - const auto& runner = NpuOpRunner( - "ApplyMomentum", {*param_out, *velocity_out, *learning_rate, - regularized_grad, mu_tensor}, - {*param_out}, {{"use_nesterov", use_nesterov}}); + const auto& runner = + NpuOpRunner("ApplyMomentum", + {*param_out, *velocity_out, *learning_rate, + regularized_grad, mu_tensor}, + {*param_out}, {{"use_nesterov", use_nesterov}}); runner.Run(dev_ctx.stream()); } else if (grad_var->IsType()) { - PADDLE_ENFORCE_EQ(false, true, platform::errors::PermissionDenied( - "Unsupport SparseMomentum")); + PADDLE_ENFORCE_EQ( + false, true, + platform::errors::PermissionDenied("Unsupport SparseMomentum")); } else { PADDLE_ENFORCE_EQ(false, true, platform::errors::PermissionDenied( diff --git a/paddle/fluid/operators/optimizers/momentum_op_xpu.cc b/paddle/fluid/operators/optimizers/momentum_op_xpu.cc index 6897213c91a34..749d38f315e00 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_xpu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU #include + #include "paddle/fluid/operators/optimizers/sgd_op.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/multi_tensor_apply.h b/paddle/fluid/operators/optimizers/multi_tensor_apply.h index 179e8f452545c..98850aa816bdc 100644 --- a/paddle/fluid/operators/optimizers/multi_tensor_apply.h +++ b/paddle/fluid/operators/optimizers/multi_tensor_apply.h @@ -15,6 +15,7 @@ #pragma once #include + #include "math.h" // NOLINT namespace paddle { @@ -108,11 +109,11 @@ class MultiTensorLauncher { stream_(stream) {} template - void Launch(Functor &&functor, Args &&... args) const { - MultiTensorApplyCUDAKernel< - Functor, MaxTensorNumPerLaunch, - MaxChunkNumPerLaunch><<>>( - functor, meta_, chunk_size_, args...); + void Launch(Functor &&functor, Args &&...args) const { + MultiTensorApplyCUDAKernel + <<>>(functor, meta_, chunk_size_, + args...); } private: @@ -189,7 +190,7 @@ template static void MultiTensorApply(Functor functor, gpuStream_t stream, const int *offsets, int n, int chunk_size, - int block_dim, Args &&... args) { + int block_dim, Args &&...args) { auto callback = [&](const MultiTensorLauncher &launcher, int i) { launcher.Launch(functor, args...); }; diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc index 6893e5d6b9b2c..5eeeb7353072e 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h" + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h index 74cf762745077..353d8777a84ab 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h @@ -47,9 +47,8 @@ struct Pow2DecayWithLinearWarmupFunctor { auto new_lr = static_cast(step) / warmup_steps_ * base_lr_; *lr_ = static_cast(new_lr); } else if (step < total_steps_) { - auto factor = 1 - - static_cast(step - warmup_steps_) / - (total_steps_ - warmup_steps_); + auto factor = 1 - static_cast(step - warmup_steps_) / + (total_steps_ - warmup_steps_); auto new_lr = static_cast(base_lr_ - end_lr_) * (factor * factor) + end_lr_; *lr_ = static_cast(new_lr); @@ -76,9 +75,10 @@ class Pow2DecayWithLinearWarmupOpKernel : public framework::OpKernel { auto *lr_out = ctx.Output("LearningRateOut"); auto *step_out = ctx.Output("StepOut"); PADDLE_ENFORCE_EQ( - lr, lr_out, platform::errors::InvalidArgument("Input(LearningRate) and " - "Output(LearningRateOut) " - "must be the same.")); + lr, lr_out, + platform::errors::InvalidArgument("Input(LearningRate) and " + "Output(LearningRateOut) " + "must be the same.")); PADDLE_ENFORCE_NOT_NULL(lr, platform::errors::InvalidArgument( "Input(LearingRate) should not be nullptr.")); diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cc b/paddle/fluid/operators/optimizers/rmsprop_op.cc index b3458724482e9..874e21cc6ccbf 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/op_registry.h" - #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/multiary.h" diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc index b53d51686cfd7..7f4810ea4207a 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc @@ -15,7 +15,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index a2af131cb505e..b5822fd5c446e 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "paddle/fluid/operators/optimizers/sgd_op.h" + +#include #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -76,10 +76,11 @@ class SGDOpInferVarType : public framework::VarTypeInference { auto in_var_type = ctx->GetInputType("Param"); PADDLE_ENFORCE_EQ(in_var_type == framework::proto::VarType::SELECTED_ROWS || in_var_type == framework::proto::VarType::LOD_TENSOR, - true, platform::errors::InvalidArgument( - "The input Var's type should be LoDtensor or " - "SelectedRows, but the received type is %s", - in_var_type)); + true, + platform::errors::InvalidArgument( + "The input Var's type should be LoDtensor or " + "SelectedRows, but the received type is %s", + in_var_type)); ctx->SetOutputType("ParamOut", in_var_type, framework::ALL_ELEMENTS); } diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 222244a2fd1e3..ba2e84a6a789d 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/optimizers/sgd_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -106,11 +107,11 @@ class SGDOpKernel int block = 512; int grid = (param->numel() + block - 1) / block; - SGDKernelMT< - T, MPDType><<>>( - param->data(), grad->data(), learning_rate->data(), - param->numel(), param_out->mutable_data(ctx.GetPlace()), - master_in_data, master_out_data); + SGDKernelMT + <<>>( + param->data(), grad->data(), learning_rate->data(), + param->numel(), param_out->mutable_data(ctx.GetPlace()), + master_in_data, master_out_data); } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. diff --git a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc index e7c03be95cae1..7203357db10e3 100644 --- a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/optimizers/sgd_op.h" #include + +#include "paddle/fluid/operators/optimizers/sgd_op.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc index c38545df17311..0c4fa916f4331 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/optimizers/sparse_momentum_op.h" + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index 08b2d3764feba..296a3d5b88975 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/optimizers/unity_build_rule.cmake b/paddle/fluid/operators/optimizers/unity_build_rule.cmake index 769bb781d6e72..61e63ad9a6e61 100644 --- a/paddle/fluid/operators/optimizers/unity_build_rule.cmake +++ b/paddle/fluid/operators/optimizers/unity_build_rule.cmake @@ -4,32 +4,34 @@ # Generally, the combination rules in this file do not need to be modified. # If there are some redefined error in compiling with the source file which # in combination rule, you can remove the source file from the following rules. -register_unity_group(cc - ftrl_op.cc - lars_momentum_op.cc - momentum_op.cc - sgd_op.cc - proximal_adagrad_op.cc - adagrad_op.cc - adam_op.cc - adamax_op.cc - dgc_momentum_op.cc - proximal_gd_op.cc - decayed_adagrad_op.cc - adadelta_op.cc - lamb_op.cc - dpsgd_op.cc - rmsprop_op.cc) -register_unity_group(cu - ftrl_op.cu - lars_momentum_op.cu - momentum_op.cu - sgd_op.cu - proximal_adagrad_op.cu - adagrad_op.cu - adam_op.cu - adamax_op.cu - decayed_adagrad_op.cu - adadelta_op.cu - lamb_op.cu - rmsprop_op.cu) +register_unity_group( + cc + ftrl_op.cc + lars_momentum_op.cc + momentum_op.cc + sgd_op.cc + proximal_adagrad_op.cc + adagrad_op.cc + adam_op.cc + adamax_op.cc + dgc_momentum_op.cc + proximal_gd_op.cc + decayed_adagrad_op.cc + adadelta_op.cc + lamb_op.cc + dpsgd_op.cc + rmsprop_op.cc) +register_unity_group( + cu + ftrl_op.cu + lars_momentum_op.cu + momentum_op.cu + sgd_op.cu + proximal_adagrad_op.cu + adagrad_op.cu + adam_op.cu + adamax_op.cu + decayed_adagrad_op.cu + adadelta_op.cu + lamb_op.cu + rmsprop_op.cu) diff --git a/paddle/fluid/operators/p_norm_op.cc b/paddle/fluid/operators/p_norm_op.cc index c7c8ebf562b4d..21254521fa912 100644 --- a/paddle/fluid/operators/p_norm_op.cc +++ b/paddle/fluid/operators/p_norm_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -124,11 +125,10 @@ REGISTER_OPERATOR(p_norm, ops::PnormOp, ops::PnormOpMaker, PNormInferShapeFunctor); REGISTER_OPERATOR(p_norm_grad, ops::PnormOpGrad, PNormGradInferShapeFunctor); -REGISTER_OP_VERSION(p_norm) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(p_norm).AddCheckpoint( + R"ROC( Upgrade p_norm, add 1 attribute [asvector]. )ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "asvector", - "Compute as vector when axis is None and input is matrix", false)); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "asvector", "Compute as vector when axis is None and input is matrix", + false)); diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 38fa3316a6e27..6d27433512e90 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index 80931fea90f9c..b7f9977f3edb7 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index e4952a243262b..b7a638d7ce930 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 087b8ecba6e1f..61a2120e1e43e 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/pad_constant_like_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h index 0aedd800e1a23..cc7c39d12cd1a 100644 --- a/paddle/fluid/operators/pad_constant_like_op.h +++ b/paddle/fluid/operators/pad_constant_like_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index dc162ae5782f2..eaf343dde0f0f 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/complex.h" diff --git a/paddle/fluid/operators/partial_concat_op.cc b/paddle/fluid/operators/partial_concat_op.cc index fedadc7581e71..e0e6ec31e41e0 100644 --- a/paddle/fluid/operators/partial_concat_op.cc +++ b/paddle/fluid/operators/partial_concat_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/partial_concat_op.h" + #include #include #include @@ -93,8 +94,9 @@ class PartialConcatOp : public framework::OperatorWithKernel { break; } } - PADDLE_ENFORCE_EQ(flag, 1, platform::errors::InvalidArgument( - "All Inputs of PartialSum OP are Empty!")); + PADDLE_ENFORCE_EQ(flag, 1, + platform::errors::InvalidArgument( + "All Inputs of PartialSum OP are Empty!")); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index 322e84ae8b9c2..d36a73037151d 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/partial_concat_op.h" diff --git a/paddle/fluid/operators/partial_concat_op.h b/paddle/fluid/operators/partial_concat_op.h index 20a6639e23301..b12cb0a0293e7 100644 --- a/paddle/fluid/operators/partial_concat_op.h +++ b/paddle/fluid/operators/partial_concat_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index 72630998d4337..a3ce78054acde 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/partial_sum_op.h" + #include #include #include @@ -96,8 +97,9 @@ class PartialSumOp : public framework::OperatorWithKernel { } } - PADDLE_ENFORCE_EQ(flag, 1, platform::errors::InvalidArgument( - "All Inputs of PartialSum OP are Empty!")); + PADDLE_ENFORCE_EQ(flag, 1, + platform::errors::InvalidArgument( + "All Inputs of PartialSum OP are Empty!")); return framework::OpKernelType(input_data_type, platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index 63d140d6769b8..b363483fe6945 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/partial_sum_op.h" diff --git a/paddle/fluid/operators/partial_sum_op.h b/paddle/fluid/operators/partial_sum_op.h index d9c6fd758f44c..21c16ed2f6227 100644 --- a/paddle/fluid/operators/partial_sum_op.h +++ b/paddle/fluid/operators/partial_sum_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index 1724aedbe9b24..026a1749c39d0 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/operators/poisson_op.cc b/paddle/fluid/operators/poisson_op.cc index d5896c4105932..b964d8fe116e9 100644 --- a/paddle/fluid/operators/poisson_op.cc +++ b/paddle/fluid/operators/poisson_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 44f3d8090e565..30ead84d1a987 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -15,13 +15,13 @@ limitations under the License. */ #include "paddle/fluid/operators/pool_op.h" #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/unary.h" - -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc index f178a966e1e08..d2ec4089f9da9 100644 --- a/paddle/fluid/operators/pool_op_xpu.cc +++ b/paddle/fluid/operators/pool_op_xpu.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" @@ -112,11 +113,12 @@ class PoolGradXPUKernel : public framework::OpKernel { bool exclusive = context.Attr("exclusive"); bool adaptive = context.Attr("adaptive"); const int* index_data = nullptr; - PADDLE_ENFORCE_EQ(ksize.size(), 2, platform::errors::InvalidArgument( - "The Pool2d XPU OP only support 2 " - "dimension pooling!, but received " - "%d-dimension pool kernel size", - ksize.size())); + PADDLE_ENFORCE_EQ( + ksize.size(), 2, + platform::errors::InvalidArgument("The Pool2d XPU OP only support 2 " + "dimension pooling!, but received " + "%d-dimension pool kernel size", + ksize.size())); PADDLE_ENFORCE_EQ(!adaptive || (ksize[0] * ksize[1] == 1), true, platform::errors::InvalidArgument( "The Pool2d XPU OP does not support (adaptive == " diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index e0341f4a4b471..8619cc28d50d3 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc index cbe58644f5381..02273b7943ae2 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.cc +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/positive_negative_pair_op.h" + #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -41,11 +42,12 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { ctx->HasInput("AccumulatePositivePair") && ctx->HasInput("AccumulateNegativePair") && ctx->HasInput("AccumulateNeutralPair"), - true, platform::errors::InvalidArgument( - "All optional inputs(AccumulatePositivePair, " - "AccumulateNegativePair, AccumulateNeutralPair) of " - "PositiveNegativePairOp are required if one of them " - "is specified.")); + true, + platform::errors::InvalidArgument( + "All optional inputs(AccumulatePositivePair, " + "AccumulateNegativePair, AccumulateNeutralPair) of " + "PositiveNegativePairOp are required if one of them " + "is specified.")); PADDLE_ENFORCE_EQ( ctx->GetInputDim("AccumulatePositivePair"), scalar_dim, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h index a47deb18b6fcc..972258350bf19 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.h +++ b/paddle/fluid/operators/positive_negative_pair_op.h @@ -12,6 +12,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index de35f67405810..50dc9d6429af0 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -11,6 +11,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/operators/prim_ops/CMakeLists.txt b/paddle/fluid/operators/prim_ops/CMakeLists.txt index a58ee6dc1f7ba..d29933bc1964a 100644 --- a/paddle/fluid/operators/prim_ops/CMakeLists.txt +++ b/paddle/fluid/operators/prim_ops/CMakeLists.txt @@ -1,11 +1,11 @@ include(operators) if(WITH_UNITY_BUILD) - # Load Unity Build rules for operators in paddle/fluid/operators/prim_ops. - include(unity_build_rule.cmake) + # Load Unity Build rules for operators in paddle/fluid/operators/prim_ops. + include(unity_build_rule.cmake) endif() register_operators() -SET(PRIM_OP_SRCS +set(PRIM_OP_SRCS reshape_p_op.cc broadcast_p_op.cc reduce_p_op.cc @@ -25,4 +25,7 @@ SET(PRIM_OP_SRCS matmul_p_op.cc fill_constant_p_op.cc) -cc_test(prim_op_test SRCS prim_op_test.cc ${PRIM_OP_SRCS} DEPS op_registry) +cc_test( + prim_op_test + SRCS prim_op_test.cc ${PRIM_OP_SRCS} + DEPS op_registry) diff --git a/paddle/fluid/operators/prim_ops/prim_op_test.cc b/paddle/fluid/operators/prim_ops/prim_op_test.cc index 2d65149d130bb..e5b84d00f1f28 100644 --- a/paddle/fluid/operators/prim_ops/prim_op_test.cc +++ b/paddle/fluid/operators/prim_ops/prim_op_test.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/operators/prim_ops/unity_build_rule.cmake b/paddle/fluid/operators/prim_ops/unity_build_rule.cmake index 5d6a732272b9b..74b04d234fcde 100644 --- a/paddle/fluid/operators/prim_ops/unity_build_rule.cmake +++ b/paddle/fluid/operators/prim_ops/unity_build_rule.cmake @@ -1,20 +1,20 @@ -register_unity_group(cc - reshape_p_op.cc - broadcast_p_op.cc - reduce_p_op.cc - transpose_p_op.cc - split_p_op.cc - concat_p_op.cc - slice_select_p_op.cc - slice_assign_p_op.cc - gather_p_op.cc - scatter_add_p_op.cc - add_p_op.cc - sub_p_op.cc - mul_p_op.cc - div_p_op.cc - sqrt_p_op.cc - tanh_p_op.cc - matmul_p_op.cc - fill_constant_p_op.cc - ) +register_unity_group( + cc + reshape_p_op.cc + broadcast_p_op.cc + reduce_p_op.cc + transpose_p_op.cc + split_p_op.cc + concat_p_op.cc + slice_select_p_op.cc + slice_assign_p_op.cc + gather_p_op.cc + scatter_add_p_op.cc + add_p_op.cc + sub_p_op.cc + mul_p_op.cc + div_p_op.cc + sqrt_p_op.cc + tanh_p_op.cc + matmul_p_op.cc + fill_constant_p_op.cc) diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index 4dd4114d378e9..16d6185e87e15 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -178,10 +178,8 @@ REGISTER_OPERATOR(print, ops::PrintOp, ops::PrintOpProtoAndCheckMaker, ops::PrintOpGradientMaker, ops::PrintOpInferShape, ops::PrintOpVarTypeInference); -REGISTER_OP_VERSION(print) - .AddCheckpoint( - R"ROC(Upgrade print add a new attribute [print_tensor_layout] to " +REGISTER_OP_VERSION(print).AddCheckpoint( + R"ROC(Upgrade print add a new attribute [print_tensor_layout] to " "contorl whether to print tensor's layout.)ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "print_tensor_layout", "Whether to print the tensor's layout.", - true)); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "print_tensor_layout", "Whether to print the tensor's layout.", true)); diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc index f03a392bfc736..51bd079849a52 100644 --- a/paddle/fluid/operators/prroi_pool_op.cc +++ b/paddle/fluid/operators/prroi_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/prroi_pool_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index 0fdccc729adde..8431d945749f3 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 6a2ed6592e7fe..2e729f94dc8f3 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -79,9 +79,10 @@ class PruneGateByCapacityFunctor { int blocks = NumBlocks(batch_size); int threads = kNumCUDAThreads; - prune_gate_by_capacity_kernel<<>>( - gate_idx_data, new_gate_idx_data_, expert_count_out_data, batch_size); + prune_gate_by_capacity_kernel + <<>>( + gate_idx_data, new_gate_idx_data_, expert_count_out_data, + batch_size); } private: diff --git a/paddle/fluid/operators/pscore/CMakeLists.txt b/paddle/fluid/operators/pscore/CMakeLists.txt index de0ee481aa6e7..04407ea117d17 100755 --- a/paddle/fluid/operators/pscore/CMakeLists.txt +++ b/paddle/fluid/operators/pscore/CMakeLists.txt @@ -1,49 +1,152 @@ -if (WITH_PSLIB) - return() +if(WITH_PSLIB) + return() endif() include(operators) set(DISTRIBUTE_DEPS "") -if (WITH_ARM_BRPC) - list(APPEND DISTRIBUTE_DEPS executor fleet ps_service brpc_utils heter_server heter_client ps_framework_proto framework_proto sendrecv_rpc arm_brpc gflags glog snappy device_context) +if(WITH_ARM_BRPC) + list( + APPEND + DISTRIBUTE_DEPS + executor + fleet + ps_service + brpc_utils + heter_server + heter_client + ps_framework_proto + framework_proto + sendrecv_rpc + arm_brpc + gflags + glog + snappy + device_context) else() - list(APPEND DISTRIBUTE_DEPS executor fleet ps_service brpc_utils heter_server heter_client ps_framework_proto framework_proto sendrecv_rpc brpc leveldb ssl crypto protobuf gflags glog zlib snappy device_context) + list( + APPEND + DISTRIBUTE_DEPS + executor + fleet + ps_service + brpc_utils + heter_server + heter_client + ps_framework_proto + framework_proto + sendrecv_rpc + brpc + leveldb + ssl + crypto + protobuf + gflags + glog + zlib + snappy + device_context) endif() -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses") +set(DISTRIBUTE_COMPILE_FLAGS + "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses" +) -if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) - set(DISTRIBUTE_COMPILE_FLAGS - "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") +if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") endif() -file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") +file( + GLOB OPS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "*_op.cc") list(REMOVE_DUPLICATES OPS) -foreach (src ${OPS}) - set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -endforeach () +foreach(src ${OPS}) + set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +endforeach() register_operators(DEPS ${DISTRIBUTE_DEPS}) -set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE) +set(OPERATOR_DEPS + ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} + PARENT_SCOPE) -set_source_files_properties(heter_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(heter_server_test SRCS heter_server_test.cc DEPS ${RPC_DEPS} ${DISTRIBUTE_DEPS} executor scope proto_desc scale_op eigen_function) +set_source_files_properties( + heter_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + heter_server_test + SRCS heter_server_test.cc + DEPS ${RPC_DEPS} + ${DISTRIBUTE_DEPS} + executor + scope + proto_desc + scale_op + eigen_function) -set_source_files_properties(send_and_recv_op_cpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(send_and_recv_cpu_test SRCS send_and_recv_op_cpu_test.cc DEPS executor scope proto_desc scale_op send_and_recv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function) +set_source_files_properties( + send_and_recv_op_cpu_test.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + send_and_recv_cpu_test + SRCS send_and_recv_op_cpu_test.cc + DEPS executor + scope + proto_desc + scale_op + send_and_recv_op + ${RPC_DEPS} + ${DISTRIBUTE_DEPS} + eigen_function) -set_source_files_properties(send_and_recv_op_gpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(send_and_recv_gpu_test SRCS send_and_recv_op_gpu_test.cc DEPS executor scope proto_desc scale_op send_and_recv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function) +set_source_files_properties( + send_and_recv_op_gpu_test.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + send_and_recv_gpu_test + SRCS send_and_recv_op_gpu_test.cc + DEPS executor + scope + proto_desc + scale_op + send_and_recv_op + ${RPC_DEPS} + ${DISTRIBUTE_DEPS} + eigen_function) -set_source_files_properties(heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(heter_listen_and_server_test SRCS heter_listen_and_server_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function) +set_source_files_properties( + heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test( + heter_listen_and_server_test + SRCS heter_listen_and_server_test.cc + DEPS executor + scope + proto_desc + scale_op + heter_listen_and_serv_op + ${RPC_DEPS} + ${DISTRIBUTE_DEPS} + eigen_function) #set_source_files_properties(heter_cloud_comm_cpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #cc_test(heter_cloud_comm_cpu_test SRCS heter_cloud_comm_cpu_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function) -set_source_files_properties(switch_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_binary(switch_server_test SRCS switch_server_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function) +set_source_files_properties( + switch_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_binary( + switch_server_test + SRCS + switch_server_test.cc + DEPS + executor + scope + proto_desc + scale_op + heter_listen_and_serv_op + ${RPC_DEPS} + ${DISTRIBUTE_DEPS} + eigen_function) diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc index f101e509d936f..d09b1c7aa068e 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc @@ -9,11 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/pscore/distributed_lookup_table_op.h" + #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/pscore/distributed_lookup_table_op.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h index c2717c19b2d8e..c9390aa42a656 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h @@ -13,6 +13,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc index 9868a6257924e..701b6250445bd 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc @@ -9,11 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/pscore/distributed_push_sparse_op.h" + #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/pscore/distributed_push_sparse_op.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.h b/paddle/fluid/operators/pscore/distributed_push_sparse_op.h index 6d3faae6a2d09..7c361dfd1a7dc 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.h +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.h @@ -13,6 +13,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc index 457e37744d316..5d77851b72a24 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h" + #include "paddle/fluid/framework/op_registry.h" PADDLE_DEFINE_EXPORTED_int32(rpc_send_thread_num, 12, @@ -92,8 +93,9 @@ void HeterListenAndServOp::RunAsyncLoop(framework::ProgramDesc *program) const { auto blkid = block_list[i]; auto it = message_to_block_id.find_value(blkid); heter_server_->RegisterServiceHandler( - it->first, [&](const MultiVarMsg *request, MultiVarMsg *response, - brpc::Controller *cntl) -> int { + it->first, + [&](const MultiVarMsg *request, MultiVarMsg *response, + brpc::Controller *cntl) -> int { return send_and_recv_variable_handler_->Handle(request, response, cntl); }); diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h old mode 100755 new mode 100644 index 3ecff083b00c7..29cc041d68216 --- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h +++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc index ab2fcba51062f..da57660a74d39 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc @@ -14,11 +14,11 @@ limitations under the License. */ #include #include -#include -#include // NOLINT #include #include +#include +#include // NOLINT #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/heter_client.h" diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc index d4ee00d10a50b..db647dfaf238b 100644 --- a/paddle/fluid/operators/pscore/heter_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_server_test.cc @@ -12,17 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/distributed/ps/service/heter_server.h" + #include -#include -#include -#include // NOLINT +#include #include #include +#include +#include // NOLINT #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/heter_client.h" -#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/op_registry.h" namespace framework = paddle::framework; @@ -181,13 +182,15 @@ void StartSendAndRecvServer(std::string endpoint) { heter_server_ptr_->SetEndPoint(endpoint); LOG(INFO) << "before HeterServer::RegisterServiceHandler"; heter_server_ptr_->RegisterServiceHandler( - in_var_name, [&](const MultiVarMsg* request, MultiVarMsg* response, - brpc::Controller* cntl) -> int { + in_var_name, + [&](const MultiVarMsg* request, MultiVarMsg* response, + brpc::Controller* cntl) -> int { return b_req_handler->Handle(request, response, cntl); }); heter_server_ptr_->RegisterServiceHandler( - in_var_name2, [&](const MultiVarMsg* request, MultiVarMsg* response, - brpc::Controller* cntl) -> int { + in_var_name2, + [&](const MultiVarMsg* request, MultiVarMsg* response, + brpc::Controller* cntl) -> int { return b_req_handler->Handle(request, response, cntl); }); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc old mode 100755 new mode 100644 index 7c25d38d1ebad..a21d11ee1b19e --- a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc @@ -14,12 +14,13 @@ limitations under the License. */ #if defined PADDLE_WITH_PSCORE #include + #include +#include +#include #include #include // NOLINT -#include -#include #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/distributed/ps/service/heter_server.h" @@ -158,8 +159,9 @@ void StartSendAndRecvServer(std::string endpoint) { b_rpc_service->SetEndPoint(endpoint); LOG(INFO) << "before HeterServer::RegisterServiceHandler"; b_rpc_service->RegisterServiceHandler( - in_var_name, [&](const MultiVarMsg* request, MultiVarMsg* response, - brpc::Controller* cntl) -> int { + in_var_name, + [&](const MultiVarMsg* request, MultiVarMsg* response, + brpc::Controller* cntl) -> int { return b_req_handler->Handle(request, response, cntl); }); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index 4054846460b07..c8e24c77734f8 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -15,12 +15,12 @@ limitations under the License. */ #if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSCORE) #include -#include -#include -#include // NOLINT +#include #include #include +#include +#include // NOLINT #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/heter_client.h" @@ -178,8 +178,9 @@ void StartSendAndRecvServer(std::string endpoint) { b_rpc_service2->SetEndPoint(endpoint); LOG(INFO) << "before HeterServer::RegisterServiceHandler"; b_rpc_service2->RegisterServiceHandler( - in_var_name, [&](const MultiVarMsg* request, MultiVarMsg* response, - brpc::Controller* cntl) -> int { + in_var_name, + [&](const MultiVarMsg* request, MultiVarMsg* response, + brpc::Controller* cntl) -> int { return b_req_handler->Handle(request, response, cntl); }); diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h index 559c7eed84e6f..f803b57b187f8 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.h +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/framework/fleet/box_wrapper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.h b/paddle/fluid/operators/pull_gpups_sparse_op.h index abfdb62ec34ac..58e1172552135 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.h +++ b/paddle/fluid/operators/pull_gpups_sparse_op.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/pull_sparse_op.cc b/paddle/fluid/operators/pull_sparse_op.cc index fb83746de19ec..57d361b7a77bb 100644 --- a/paddle/fluid/operators/pull_sparse_op.cc +++ b/paddle/fluid/operators/pull_sparse_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/pull_sparse_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/pull_sparse_op.h b/paddle/fluid/operators/pull_sparse_op.h index 2498adc141cd7..e3f0f88ce5552 100644 --- a/paddle/fluid/operators/pull_sparse_op.h +++ b/paddle/fluid/operators/pull_sparse_op.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/pull_sparse_v2_op.cc b/paddle/fluid/operators/pull_sparse_v2_op.cc index f5f2e728e38c0..a8fc84b9c2b73 100644 --- a/paddle/fluid/operators/pull_sparse_v2_op.cc +++ b/paddle/fluid/operators/pull_sparse_v2_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/pull_sparse_v2_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/pull_sparse_v2_op.h b/paddle/fluid/operators/pull_sparse_v2_op.h index 29337cc2d94b4..c24d0a4f338e7 100644 --- a/paddle/fluid/operators/pull_sparse_v2_op.h +++ b/paddle/fluid/operators/pull_sparse_v2_op.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/push_dense_op.cc b/paddle/fluid/operators/push_dense_op.cc index 5b9f05bd126b8..5284a1a61e5ef 100644 --- a/paddle/fluid/operators/push_dense_op.cc +++ b/paddle/fluid/operators/push_dense_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/push_dense_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/push_dense_op.h b/paddle/fluid/operators/push_dense_op.h index 592ef5ff72a65..c8f98a1ea9e5d 100644 --- a/paddle/fluid/operators/push_dense_op.h +++ b/paddle/fluid/operators/push_dense_op.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index f676348bc0af2..de46357e497fd 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc index 14c9e8b0c260f..db8f315366a7b 100644 --- a/paddle/fluid/operators/py_layer_op.cc +++ b/paddle/fluid/operators/py_layer_op.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include - #include "paddle/fluid/operators/py_layer_op.h" +#include + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/py_layer_op.h b/paddle/fluid/operators/py_layer_op.h index 6625a4a1a753c..ea048ee9e5948 100644 --- a/paddle/fluid/operators/py_layer_op.h +++ b/paddle/fluid/operators/py_layer_op.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/python_headers.h" diff --git a/paddle/fluid/operators/pyramid_hash_op.cc b/paddle/fluid/operators/pyramid_hash_op.cc index 4b0ade99154a1..6650037e4d2f4 100644 --- a/paddle/fluid/operators/pyramid_hash_op.cc +++ b/paddle/fluid/operators/pyramid_hash_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/search_compute.h" @@ -216,9 +218,8 @@ class CPUPyramidHashOPKernel : public framework::OpKernel { bool should_use_term(math::bloomfilter* _filter, math::bloomfilter* _black_filter, const float* word_repr, int len) const { - return (!_filter || - 1 == math::bloomfilter_get(_filter, word_repr, - len * sizeof(float))) && + return (!_filter || 1 == math::bloomfilter_get(_filter, word_repr, + len * sizeof(float))) && (!_black_filter || 0 == math::bloomfilter_get(_black_filter, word_repr, len * sizeof(float))); diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc index 02d5e5f03f02e..55cab539c4d4e 100644 --- a/paddle/fluid/operators/qr_op.cc +++ b/paddle/fluid/operators/qr_op.cc @@ -13,10 +13,12 @@ // limitations under the License. #include "paddle/fluid/operators/qr_op.h" + #include #include #include #include + #include "paddle/phi/core/ddim.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index a57a8d5cf8b7f..695b90e9319e4 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -16,8 +16,10 @@ limitations under the License. */ // HIP not support cusolver #include + #include #include + #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/qr_op.h" #include "paddle/fluid/platform/dynload/cusolver.h" @@ -43,8 +45,9 @@ class QrGPUKernel : public framework::OpKernel { std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); auto numel = x.numel(); - PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet( - "The input of QR is empty.")); + PADDLE_ENFORCE_GT( + numel, 0, + platform::errors::PreconditionNotMet("The input of QR is empty.")); auto x_dims = x.dims(); int x_rank = x_dims.size(); int m = x_dims[x_rank - 2]; diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index 5ef02d8942797..760b2efd21f6b 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/svd_helper.h" @@ -89,11 +90,11 @@ class QrGradKernel : public framework::OpKernel { } // m >= n case - auto m_gt_n_case = []( - const framework::ExecutionContext& ctx, - math::DeviceIndependenceTensorOperations& dito, - const Tensor& dQ, const Tensor& dR, const Tensor& A, const Tensor& Q, - const Tensor& R) -> framework::Tensor { + auto m_gt_n_case = + [](const framework::ExecutionContext& ctx, + math::DeviceIndependenceTensorOperations& dito, + const Tensor& dQ, const Tensor& dR, const Tensor& A, const Tensor& Q, + const Tensor& R) -> framework::Tensor { // Hai-Jun Liao, Jin-Guo Liu, Lei Wang, Tao Xiang (2019). Differentiable // Programming Tensor Networks. // https://arxiv.org/abs/1903.09650 Section 3. QR factorization diff --git a/paddle/fluid/operators/quantize_linear_op.cc b/paddle/fluid/operators/quantize_linear_op.cc index 4039f0e9d07e1..edd2a06a50001 100644 --- a/paddle/fluid/operators/quantize_linear_op.cc +++ b/paddle/fluid/operators/quantize_linear_op.cc @@ -10,9 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/quantize_linear_op.h" + #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/transform.h" diff --git a/paddle/fluid/operators/quantize_linear_op.cu b/paddle/fluid/operators/quantize_linear_op.cu index 6c7e430f51126..6e3e39562c719 100644 --- a/paddle/fluid/operators/quantize_linear_op.cu +++ b/paddle/fluid/operators/quantize_linear_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/fake_dequantize_op.cu.h" #include "paddle/fluid/operators/fake_quantize_op.cu.h" @@ -46,10 +47,10 @@ struct ChannelDequantizeFunctorV2 { quant_stride *= in_dims[i]; } - DequantizeOneScaleQuantAxisN< - T><<>>( - in_data, scale_factor, max_range, num, in_dims[quant_axis], - quant_stride, out_data); + DequantizeOneScaleQuantAxisN + <<>>( + in_data, scale_factor, max_range, num, in_dims[quant_axis], + quant_stride, out_data); } }; diff --git a/paddle/fluid/operators/quantize_linear_op.h b/paddle/fluid/operators/quantize_linear_op.h index e20b99e85f0b3..df1a93ba638ae 100644 --- a/paddle/fluid/operators/quantize_linear_op.h +++ b/paddle/fluid/operators/quantize_linear_op.h @@ -13,6 +13,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/memory/malloc.h" diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc index 951951253c47a..62ec77bc2240f 100644 --- a/paddle/fluid/operators/quantize_op.cc +++ b/paddle/fluid/operators/quantize_op.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/quantize_op.h" + #include "paddle/fluid/framework/op_version_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -57,13 +58,13 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(quantize, ops::QuantOp, ops::QuantOpMaker); REGISTER_OP_VERSION(quantize) - .AddCheckpoint( - R"ROC( Add a new attribute [bfloat16])ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "bfloat16", "If true, float32 input is converted to bfloat16", - false)) - .AddCheckpoint( - R"ROC( Add a new attribute [Shift])ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "Shift", "Quantize data to uint8 if provided non-zero value.", - 0.0f)); + .AddCheckpoint(R"ROC( Add a new attribute [bfloat16])ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "bfloat16", + "If true, float32 input is converted to bfloat16", + false)) + .AddCheckpoint(R"ROC( Add a new attribute [Shift])ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "Shift", + "Quantize data to uint8 if provided non-zero value.", + 0.0f)); diff --git a/paddle/fluid/operators/quantize_op.h b/paddle/fluid/operators/quantize_op.h index 091306e4637c7..dd1b3c42fb5f9 100644 --- a/paddle/fluid/operators/quantize_op.h +++ b/paddle/fluid/operators/quantize_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/queue_generator_op.cc b/paddle/fluid/operators/queue_generator_op.cc index e2174b9346e1e..3683fbd075db2 100644 --- a/paddle/fluid/operators/queue_generator_op.cc +++ b/paddle/fluid/operators/queue_generator_op.cc @@ -43,9 +43,10 @@ class QueueGeneratorOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { std::vector names = Attr>("names"); - PADDLE_ENFORCE_GT(names.size(), 0, platform::errors::InvalidArgument( - "The attribute 'names' for " - "Op(queue_generator) must be set.")); + PADDLE_ENFORCE_GT( + names.size(), 0, + platform::errors::InvalidArgument("The attribute 'names' for " + "Op(queue_generator) must be set.")); int capacity = Attr("capacity"); PADDLE_ENFORCE_GT(capacity, 0, diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index 2928c3b502781..cfda710bd7745 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" diff --git a/paddle/fluid/operators/random_routing_op.cu b/paddle/fluid/operators/random_routing_op.cu index fec65518a9d48..471cfb40e6167 100644 --- a/paddle/fluid/operators/random_routing_op.cu +++ b/paddle/fluid/operators/random_routing_op.cu @@ -71,9 +71,9 @@ class RandomRoutingOpCUDAKernel : public framework::OpKernel { auto topk_idx_data = topk_idx->data(); auto out_data = out->data(); - random_routing_kernel< - T><<>>( - out_data, num_idx, N, D, prob_data, topk_idx_data, topk_value_data); + random_routing_kernel + <<>>( + out_data, num_idx, N, D, prob_data, topk_idx_data, topk_value_data); } }; diff --git a/paddle/fluid/operators/randperm_op.cc b/paddle/fluid/operators/randperm_op.cc index 1b28ab3c133f7..aed1f2b0ed102 100644 --- a/paddle/fluid/operators/randperm_op.cc +++ b/paddle/fluid/operators/randperm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -29,10 +30,11 @@ class RandpermOp : public framework::OperatorWithKernel { "The output(Out) of randperm op must not be null.")); int n = ctx->Attrs().Get("n"); PADDLE_ENFORCE_GT( - n, 0, platform::errors::InvalidArgument( - "The input 'n' of randperm op should be greater than 0. " - "But received %d.", - n)); + n, 0, + platform::errors::InvalidArgument( + "The input 'n' of randperm op should be greater than 0. " + "But received %d.", + n)); ctx->SetOutputDim("Out", phi::make_ddim({n})); } diff --git a/paddle/fluid/operators/randperm_op_npu.cc b/paddle/fluid/operators/randperm_op_npu.cc index a16c0d905a555..c9f6121101601 100644 --- a/paddle/fluid/operators/randperm_op_npu.cc +++ b/paddle/fluid/operators/randperm_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/randperm_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/randperm_op.h" template using kernel = diff --git a/paddle/fluid/operators/range_op.cc b/paddle/fluid/operators/range_op.cc index 80fdb2ce6c345..215f83698186c 100644 --- a/paddle/fluid/operators/range_op.cc +++ b/paddle/fluid/operators/range_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/range_op.h" + #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h index 8924b23ce5cf8..e2fd16dd629ad 100644 --- a/paddle/fluid/operators/range_op.h +++ b/paddle/fluid/operators/range_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -22,13 +23,15 @@ namespace operators { template void GetSize(T start, T end, T step, int64_t* size) { - PADDLE_ENFORCE_NE(step, 0, platform::errors::InvalidArgument( - "The step of range op should not be 0.")); + PADDLE_ENFORCE_NE(step, 0, + platform::errors::InvalidArgument( + "The step of range op should not be 0.")); if (start < end) { PADDLE_ENFORCE_GT( - step, 0, platform::errors::InvalidArgument( - "The step should be greater than 0 while start < end.")); + step, 0, + platform::errors::InvalidArgument( + "The step should be greater than 0 while start < end.")); } if (start > end) { diff --git a/paddle/fluid/operators/range_op_xpu.cc b/paddle/fluid/operators/range_op_xpu.cc index 6672968de3a02..bfc0d27f7ca26 100644 --- a/paddle/fluid/operators/range_op_xpu.cc +++ b/paddle/fluid/operators/range_op_xpu.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/range_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/range_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/rank_attention_op.cc b/paddle/fluid/operators/rank_attention_op.cc index e5332da6475d7..89bdeb57b5fdf 100644 --- a/paddle/fluid/operators/rank_attention_op.cc +++ b/paddle/fluid/operators/rank_attention_op.cc @@ -10,9 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/rank_attention_op.h" + #include #include #include + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index 9b3a1e5637115..61d723c27f7e5 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/rank_attention.cu.h" #include "paddle/fluid/operators/rank_attention_op.h" diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 7e06b45943cdc..9dd59de98d553 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -1,26 +1,36 @@ include(operators) -cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader) +cc_library( + reader_op_registry + SRCS reader_op_registry.cc + DEPS operator op_registry reader) set(LOCAL_READER_LIBS) function(reader_library TARGET_NAME) - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - set(options "") - set(common_deps reader_op_registry) - cmake_parse_arguments(reader_library "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - op_library(${TARGET_NAME} SRCS ${reader_library_SRCS} DEPS ${common_deps} ${reader_library_DEPS}) - set(LOCAL_READER_LIBS - ${TARGET_NAME} - ${LOCAL_READER_LIBS} - PARENT_SCOPE) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + set(options "") + set(common_deps reader_op_registry) + cmake_parse_arguments(reader_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + op_library(${TARGET_NAME} SRCS ${reader_library_SRCS} DEPS ${common_deps} + ${reader_library_DEPS}) + set(LOCAL_READER_LIBS + ${TARGET_NAME} ${LOCAL_READER_LIBS} + PARENT_SCOPE) endfunction() -cc_library(py_reader SRCS py_reader.cc DEPS reader) -cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool) +cc_library( + py_reader + SRCS py_reader.cc + DEPS reader) +cc_library( + buffered_reader + SRCS buffered_reader.cc + DEPS reader simple_threadpool) -reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader) +reader_library(create_double_buffer_reader_op SRCS + create_double_buffer_reader_op.cc DEPS buffered_reader) reader_library(create_py_reader_op SRCS create_py_reader_op.cc DEPS py_reader) op_library(read_op DEPS py_reader buffered_reader) diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index f126070a7eb96..38c45ca2803ff 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -161,9 +161,10 @@ class BlockingQueue { private: inline void EnforceNotKilled() { - PADDLE_ENFORCE_NE(killed_, true, platform::errors::Fatal( - "Blocking queue is killed because the " - "data reader raises an exception.")); + PADDLE_ENFORCE_NE( + killed_, true, + platform::errors::Fatal("Blocking queue is killed because the " + "data reader raises an exception.")); } private: diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index db0f5758d2f53..193f6c29724b7 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/reader/buffered_reader.h" + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc index 86fbddc0ec2cf..b83d085284175 100644 --- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc +++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/operators/reader/ctr_reader.h" - #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 8557ef950b3e9..8b2809b286cfe 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -322,9 +322,10 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope, framework::Executor executor(place); auto *block = Attr(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare( - *program, block->ID(), Attr>( - kSkipEagerDeletionVars) /*skip_ref_cnt_vars*/); + auto ctx = + executor.Prepare(*program, block->ID(), + Attr>( + kSkipEagerDeletionVars) /*skip_ref_cnt_vars*/); for (size_t step_id = 0; step_id < seq_len; ++step_id) { size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; @@ -387,19 +388,19 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope, // outside::output[seq_offset: seq_offset + 1] = inside::output executor.CreateVariables(ctx->prog_, &cur_scope, ctx->block_id_); if (step_id > 0) { - LinkTensorWithCallback(scope, Outputs(kInputGrads), cur_scope, - GradVarLists(Inputs(kInputs)), - [&](const framework::LoDTensor &src_tensor, - framework::LoDTensor *dst_tensor) { - if (src_tensor.memory_size() == - 0) { // Inside Gradient is not created. - return; - } - framework::Tensor src_slice = - src_tensor.Slice(seq_offset, seq_offset + 1); - dst_tensor->ShareDataWith(src_slice); - }, - true /*is_backward*/); + LinkTensorWithCallback( + scope, Outputs(kInputGrads), cur_scope, GradVarLists(Inputs(kInputs)), + [&](const framework::LoDTensor &src_tensor, + framework::LoDTensor *dst_tensor) { + if (src_tensor.memory_size() == + 0) { // Inside Gradient is not created. + return; + } + framework::Tensor src_slice = + src_tensor.Slice(seq_offset, seq_offset + 1); + dst_tensor->ShareDataWith(src_slice); + }, + true /*is_backward*/); } VLOG(5) << "Recurrent memory linking finished "; @@ -604,7 +605,8 @@ if reverse is True | | | | v v v v o o o o -)DOC").SetDefault(false); +)DOC") + .SetDefault(false); AddAttr(RecurrentBase::kIsTrain, "").SetDefault(true); AddAttr>(RecurrentBase::kSkipEagerDeletionVars, "Vars that would skip eager deletion." @@ -663,14 +665,16 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase { ctx->Attrs() .Get>(RecurrentBase::kExStates) .size(), - 0, platform::errors::InvalidArgument("The Attr(%s) should be empty.", - RecurrentBase::kExStates)); + 0, + platform::errors::InvalidArgument("The Attr(%s) should be empty.", + RecurrentBase::kExStates)); PADDLE_ENFORCE_EQ( ctx->Attrs() .Get>(RecurrentBase::kStates) .size(), - 0, platform::errors::InvalidArgument("The Attr(%s) should be empty.", - RecurrentBase::kStates)); + 0, + platform::errors::InvalidArgument("The Attr(%s) should be empty.", + RecurrentBase::kStates)); } PADDLE_ENFORCE_EQ( @@ -702,9 +706,10 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase { if (ctx->HasInputs(RecurrentBase::kParameters)) { PADDLE_ENFORCE_EQ( ctx->HasOutputs(framework::GradVarName(RecurrentBase::kParameters)), - true, platform::errors::InvalidArgument( - "The output of(%s) should not be empty.", - framework::GradVarName(RecurrentBase::kParameters))); + true, + platform::errors::InvalidArgument( + "The output of(%s) should not be empty.", + framework::GradVarName(RecurrentBase::kParameters))); ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kParameters), ctx->GetInputsDim(RecurrentBase::kParameters)); } diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt index 9a2abfd93d066..7c2f91999e964 100644 --- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt +++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt @@ -1,30 +1,42 @@ include(operators) if(WITH_UNITY_BUILD) - # Load Unity Build rules for operators in paddle/fluid/operators/reduce_ops. - include(unity_build_rule.cmake) + # Load Unity Build rules for operators in paddle/fluid/operators/reduce_ops. + include(unity_build_rule.cmake) endif() if(WITH_GPU) - if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) - register_operators(DEPS cub) - else() - register_operators() - endif() -else() + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + register_operators(DEPS cub) + else() register_operators() + endif() +else() + register_operators() endif() if(WITH_GPU) - if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) - nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub) - else() - nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor) - endif() + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + nv_test( + check_reduce_rank_test + SRCS check_reduce_rank_test.cu + DEPS tensor cub) + else() + nv_test( + check_reduce_rank_test + SRCS check_reduce_rank_test.cu + DEPS tensor) + endif() endif() if(WITH_ROCM) - hip_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor) + hip_test( + check_reduce_rank_test + SRCS check_reduce_rank_test.cu + DEPS tensor) endif() if(WITH_ASCEND_CL) - cc_test(reduce_any_op_npu_test SRCS reduce_any_op_npu_test.cc DEPS op_registry reduce_any_op scope device_context enforce executor) + cc_test( + reduce_any_op_npu_test + SRCS reduce_any_op_npu_test.cc + DEPS op_registry reduce_any_op scope device_context enforce executor) endif() diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc index 83a21a919dcaa..063f7ca041a86 100644 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc +++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc index 0602c73db6bbc..4128d51559c30 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc @@ -15,6 +15,7 @@ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc index c5bc66e23ce8a..29587faa48005 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc @@ -16,16 +16,18 @@ REGISTER_REDUCE_OP(reduce_amax); REGISTER_OP_CPU_KERNEL( - reduce_amax, ops::ReduceKernel, + reduce_amax, + ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel); REGISTER_OP_CPU_KERNEL( - reduce_amax_grad, ops::ReduceGradKernel, + reduce_amax_grad, + ops::ReduceGradKernel, ops::ReduceGradKernel, ops::ReduceGradKernel, + reduce_amax_grad, + ops::ReduceGradKernel, ops::ReduceGradKernel, ops::ReduceGradKernel, + reduce_amin, + ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel); REGISTER_OP_CPU_KERNEL( - reduce_amin_grad, ops::ReduceGradKernel, + reduce_amin_grad, + ops::ReduceGradKernel, ops::ReduceGradKernel, ops::ReduceGradKernel, + reduce_amin_grad, + ops::ReduceGradKernel, ops::ReduceGradKernel, ops::ReduceGradKernel #include + #include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index dc41979defb93..8ce115ce66921 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" + #include #include #include diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc index 715dcb25c209f..111537f64558c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc @@ -11,8 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" #include "paddle/fluid/operators/elementwise/elementwise_npu.h" +#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc index 5e5b04d57b002..f6d8aa1318234 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index b21e41c5b8548..a2048004615b7 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -21,7 +21,6 @@ #include #include "paddle/fluid/framework/tensor.h" - #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/reduce_function.h" namespace paddle { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 76641698ead67..322ef1fdff67a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/cast_op.h" @@ -484,8 +485,9 @@ class ReduceOp : public framework::OperatorWithKernel { platform::is_gpu_place(ctx.GetPlace()) || platform::is_npu_place(ctx.GetPlace()) || platform::is_mlu_place(ctx.GetPlace()), - true, platform::errors::InvalidArgument( - "float16 can only be used on GPU or NPU or MLU place")); + true, + platform::errors::InvalidArgument( + "float16 can only be used on GPU or NPU or MLU place")); } return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_function.h b/paddle/fluid/operators/reduce_ops/reduce_op_function.h index c144e65cbf647..a9d5863558cf7 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_function.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_function.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h b/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h index 95dda354cae7d..96e496217d04f 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h @@ -17,6 +17,7 @@ #ifdef PADDLE_WITH_MLU #include #include + #include "paddle/fluid/operators/mlu/mlu_baseop.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h index 324fd369e82b5..f9ae575e801b9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc index 7a5c86c35c6a2..f50cfd0417aaf 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc @@ -15,6 +15,7 @@ #ifdef PADDLE_WITH_XPU #include #include + #include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" diff --git a/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake b/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake index c4f32a8d25764..f5c1af004f34f 100644 --- a/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake +++ b/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake @@ -4,18 +4,16 @@ # Generally, the combination rules in this file do not need to be modified. # If there are some redefined error in compiling with the source file which # in combination rule, you can remove the source file from the following rules. -register_unity_group(cc - reduce_all_op.cc - reduce_any_op.cc - reduce_prod_op.cc - reduce_sum_op.cc) -register_unity_group(cu - reduce_all_op.cu - reduce_any_op.cu - reduce_prod_op.cu - reduce_prod_op.part.cu - reduce_sum_op.cu - reduce_sum_op.part.cu) +register_unity_group(cc reduce_all_op.cc reduce_any_op.cc reduce_prod_op.cc + reduce_sum_op.cc) +register_unity_group( + cu + reduce_all_op.cu + reduce_any_op.cu + reduce_prod_op.cu + reduce_prod_op.part.cu + reduce_sum_op.cu + reduce_sum_op.part.cu) # The following groups are to make better use of `/MP` which MSVC's parallel # compilation instruction when compiling in Unity Build. register_unity_group(cu frobenius_norm_op.cu) diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu index e40bd147b9925..028f5a7f51567 100644 --- a/paddle/fluid/operators/renorm_op.cu +++ b/paddle/fluid/operators/renorm_op.cu @@ -12,14 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/renorm_op.h" - #include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/fluid/operators/renorm_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -107,10 +106,10 @@ __global__ void RenormGradKernelFunc2(const T* x_data, const T* dout_data, __syncthreads(); if (i < size) { dx_data[i] = dim_value[dim_index] * dout_data[i]; - dx_data[i] = dx_data[i] + - weight_derivative[dim_index] * dim_power_sum[dim_index] * - pow(abs(x_data[i]), T(p - 1.0)) * - (x_data[i] >= 0 ? 1 : -1); + dx_data[i] = dx_data[i] + weight_derivative[dim_index] * + dim_power_sum[dim_index] * + pow(abs(x_data[i]), T(p - 1.0)) * + (x_data[i] >= 0 ? 1 : -1); } } diff --git a/paddle/fluid/operators/repeat_interleave_op.cc b/paddle/fluid/operators/repeat_interleave_op.cc index d6f9df5d79e60..daa45bf78f27d 100644 --- a/paddle/fluid/operators/repeat_interleave_op.cc +++ b/paddle/fluid/operators/repeat_interleave_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/repeat_interleave_op.h" + #include namespace paddle { @@ -51,11 +52,12 @@ class RepeatInterleaveOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( repeats_dim.size() == 1 || (repeats_dim.size() == 2 && repeats_dim[1] == 1), - true, platform::errors::InvalidArgument( - "The 'shape' of Input(RepeatsTensor) must be 1-D tensor. " - "But received: the 'shape' of Input(Index) is [%s], " - "the dimension of Input(Index) is [%d].", - repeats_dim, repeats_dim.size())); + true, + platform::errors::InvalidArgument( + "The 'shape' of Input(RepeatsTensor) must be 1-D tensor. " + "But received: the 'shape' of Input(Index) is [%s], " + "the dimension of Input(Index) is [%d].", + repeats_dim, repeats_dim.size())); PADDLE_ENFORCE_EQ(repeats_dim[0] != 0, true, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/repeat_interleave_op.cu b/paddle/fluid/operators/repeat_interleave_op.cu index 5f48a4a94ac99..2b8464d5bf6bf 100644 --- a/paddle/fluid/operators/repeat_interleave_op.cu +++ b/paddle/fluid/operators/repeat_interleave_op.cu @@ -127,10 +127,10 @@ class RepeatInterleaveCUDAKernel : public framework::OpKernel { int64_t size = output_dim[dim]; int64_t delta = input_dim[dim] - size; - index_select_cuda_kernel<<< - (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data, - numel, stride, size, delta); + index_select_cuda_kernel + <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + in_data, out_data, index_data, numel, stride, size, delta); } else { RepeatsTensor2IndexTensor(*repeats_tensor, &index); @@ -143,10 +143,10 @@ class RepeatInterleaveCUDAKernel : public framework::OpKernel { int64_t size = output_dim[dim]; int64_t delta = input_dim[dim] - size; - index_select_cuda_kernel<<< - (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data, - numel, stride, size, delta); + index_select_cuda_kernel + <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + in_data, out_data, index_data, numel, stride, size, delta); } } else if (repeats > 0) { int64_t index_size = in->dims()[dim] * repeats; @@ -169,10 +169,10 @@ class RepeatInterleaveCUDAKernel : public framework::OpKernel { int64_t delta = input_dim[dim] - size; const int* index_data = index.data(); - index_select_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, index_data, numel, stride, size, delta); + index_select_cuda_kernel + <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + in_data, out_data, index_data, numel, stride, size, delta); platform::GpuStreamSync(stream); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -206,9 +206,9 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel { auto stream = context.template device_context().stream(); - index_select_grad_init< - T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel); + index_select_grad_init + <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel); int repeats = context.Attr("Repeats"); framework::LoDTensor index; @@ -237,22 +237,24 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel { int64_t index_nums = index.numel(); const int64_t* index_data = index.data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - output_grad_data, in_grad_data, index_data, index_nums, out_nums, - stride, size, delta); + index_select_grad_cuda_kernel + <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + output_grad_data, in_grad_data, index_data, index_nums, + out_nums, stride, size, delta); platform::GpuStreamSync(stream); } else { RepeatsTensor2IndexTensor(*repeats_tensor, &index); int64_t index_nums = index.numel(); const int* index_data = index.data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - output_grad_data, in_grad_data, index_data, index_nums, out_nums, - stride, size, delta); + index_select_grad_cuda_kernel + <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + output_grad_data, in_grad_data, index_data, index_nums, + out_nums, stride, size, delta); platform::GpuStreamSync(stream); } } else if (repeats > 0) { @@ -268,11 +270,11 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel { const int* index_data = index.data(); int64_t index_nums = index.numel(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, - index_data, index_nums, - out_nums, stride, size, delta); + index_select_grad_cuda_kernel + <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + output_grad_data, in_grad_data, index_data, index_nums, out_nums, + stride, size, delta); platform::GpuStreamSync(stream); } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/repeat_interleave_op.h b/paddle/fluid/operators/repeat_interleave_op.h index 68b66bd534ca8..f8e39fdc90762 100644 --- a/paddle/fluid/operators/repeat_interleave_op.h +++ b/paddle/fluid/operators/repeat_interleave_op.h @@ -14,11 +14,11 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/index_select_op.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" - -#include "paddle/fluid/operators/index_select_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/requantize_op.cc b/paddle/fluid/operators/requantize_op.cc index 2d87ae91fbe60..d9345c1145ba6 100644 --- a/paddle/fluid/operators/requantize_op.cc +++ b/paddle/fluid/operators/requantize_op.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/requantize_op.h" + #include "paddle/fluid/framework/op_version_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" diff --git a/paddle/fluid/operators/requantize_op.h b/paddle/fluid/operators/requantize_op.h index c2b154db11dc7..8166aa98f076f 100644 --- a/paddle/fluid/operators/requantize_op.h +++ b/paddle/fluid/operators/requantize_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/rnn_op.cc b/paddle/fluid/operators/rnn_op.cc index caf90219935de..d3c6ee7c1e1a8 100644 --- a/paddle/fluid/operators/rnn_op.cc +++ b/paddle/fluid/operators/rnn_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index bf78b6a696559..db84387e6cfa7 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 7be1c19012099..18938d7183200 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 12e33d56c0020..e47145535a389 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index 07a6117d71119..9c66566fdfd89 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -12,9 +12,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/row_conv_op.h" + #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index c5794948aaec6..b1cabb018b9e0 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -344,9 +344,9 @@ class RowConvKernel dim3 block_dim = dim3(32, 32); dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); int mem_per_block = (future_context * block_dim.x) * sizeof(T); - RowConvForwardSharedMemory< - T><<>>( - in, weight, num_sequence, input_dim, future_context, idx, out); + RowConvForwardSharedMemory + <<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); } else { dim3 block_dim = dim3(32, 32); dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); @@ -413,10 +413,10 @@ class RowConvGradKernel (block_y * block_x + block_y * (block_x + future_context - 1) + future_context * block_y) * sizeof(T); - RowConvGradFilterImproved< - T><<>>( - in, dout, num_sequence, input_dim, future_context, block_x, block_y, - idx, dfilter); + RowConvGradFilterImproved + <<>>( + in, dout, num_sequence, input_dim, future_context, block_x, + block_y, idx, dfilter); } else { dim3 block_dim = dim3(32, 32); dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); @@ -424,10 +424,10 @@ class RowConvGradKernel int block_y = block_dim.y; int mem_per_block = (block_x * block_y * 2) * sizeof(T); // For 2 arrays of size 32x32 - RowConvGradFilter< - T><<>>( - in, dout, num_sequence, input_dim, future_context, block_x, block_y, - idx, dfilter); + RowConvGradFilter + <<>>( + in, dout, num_sequence, input_dim, future_context, block_x, + block_y, idx, dfilter); } } @@ -437,9 +437,10 @@ class RowConvGradKernel dim3 block_dim = dim3(32, 32); dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); int mem_per_block = (future_context * block_dim.x) * sizeof(T); - RowConvGradInputSharedMemory< - T><<>>( - dout, weights, num_sequence, input_dim, future_context, idx, din); + RowConvGradInputSharedMemory + <<>>( + dout, weights, num_sequence, input_dim, future_context, idx, + din); } else { dim3 block_dim = dim3(32, 32); dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); diff --git a/paddle/fluid/operators/rrelu_op.cc b/paddle/fluid/operators/rrelu_op.cc index c543a088e9d7f..558c77b5b9220 100644 --- a/paddle/fluid/operators/rrelu_op.cc +++ b/paddle/fluid/operators/rrelu_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index ec62feb07bc80..38c92de4523d5 100644 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -90,6 +90,8 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker { "computes double grad.") .AsDuplicable() .AsDispensable(); + AddOutput("CUDAGraph", "The output CUDA Graph when use_cuda_graph=True.") + .AsDispensable(); AddAttr("global_block", "(BlockDesc *)" "The global block of executed program desc."); @@ -107,6 +109,13 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker { "program_id", "(int64_t)" "The unique hash id used as cache key for ExecutorInfoCache."); + AddAttr("cuda_graph_capture_mode", + "(str, default '') The CUDA Graph capture mode. " + "Default '' means no CUDA Graph capturing.") + .SetDefault(""); + AddAttr("cuda_graph_pool_id", + "(int64_t, default 0) The CUDA Graph memory pool ID.") + .SetDefault(0); AddComment(R"DOC( RunProgram operator. @@ -191,6 +200,9 @@ class RunProgramGradOpMaker : public framework::SingleGradOpMaker { grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); grad_op->SetInput("OutScope", this->Output("OutScope")); grad_op->SetInput("DOut", this->Output("DOut")); + if (this->HasOutput("CUDAGraph")) { + grad_op->SetInput("CUDAGraph", this->Output("CUDAGraph")); + } grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); auto block_desc = diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index fbc52480c8266..bfd33efe833d2 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -34,6 +34,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/operators/cuda_graph_with_in_out.h" +#endif DECLARE_bool(use_mkldnn); @@ -96,11 +99,12 @@ static void CheckOutputVarStatus(const Variable &src_var, var_name, platform::demangle(framework::ToTypeName(src_var.Type())))); PADDLE_ENFORCE_EQ(src_var.Get().value().IsInitialized(), - true, platform::errors::InvalidArgument( - "The tensor in output variable %s get from " - "RunProgram(Grad)Op's " - "internal scope is not initialized.", - var_name)); + true, + platform::errors::InvalidArgument( + "The tensor in output variable %s get from " + "RunProgram(Grad)Op's " + "internal scope is not initialized.", + var_name)); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -167,13 +171,84 @@ static void ShareVarsFromScope(const std::vector &vars, } } +#ifdef PADDLE_WITH_CUDA +static cudaStreamCaptureMode StringToCUDAGraphCaptureMode( + const std::string &mode) { + if (mode == "global") { + return cudaStreamCaptureModeGlobal; + } else if (mode == "thread_local") { + return cudaStreamCaptureModeThreadLocal; + } else if (mode == "relaxed") { + return cudaStreamCaptureModeRelaxed; + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Unsupported CUDA Graph capture mode %s", mode)); + } +} +#endif + } // namespace details template class RunProgramOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { + const auto &capture_mode = ctx.Attr("cuda_graph_capture_mode"); + auto is_test = ctx.Attr("is_test"); + if (capture_mode.empty()) { + ComputeImpl(ctx, is_test, false); + return; + } + +#ifdef PADDLE_WITH_CUDA + auto mode = details::StringToCUDAGraphCaptureMode(capture_mode); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + phi::errors::InvalidArgument("The cuda_graph_capture_mode is only " + "valid when using NVIDIA GPU.")); + auto *graph_var = ctx.OutputVar("CUDAGraph"); + PADDLE_ENFORCE_NOT_NULL( + graph_var, + phi::errors::InvalidArgument("Output(CUDAGraph) must exist when " + "cuda_graph_capture_mode is valid.")); + using GraphVecType = std::vector>; + auto &inner_graphs = *(graph_var->GetMutable()); + inner_graphs.resize(std::max(3, inner_graphs.size())); + size_t graph_idx = is_test ? 0 : 1; + if (inner_graphs[graph_idx].get() == nullptr) { + int64_t pool_id; + if (inner_graphs[1 - graph_idx].get() != nullptr) { + pool_id = inner_graphs[1 - graph_idx]->PoolID(); + } else { + pool_id = ctx.Attr("cuda_graph_pool_id"); + } + + framework::PEAndGraphPair pe_and_graph; + auto callable = [this, is_test, &pe_and_graph]( + const framework::ExecutionContext &exe_ctx) { + pe_and_graph = ComputeImpl(exe_ctx, is_test, true); + }; + inner_graphs[graph_idx] = CaptureCUDAGraph( + callable, ctx, {"X"}, {"Out", "DOut"}, mode, pool_id); + VLOG(10) << "Capture Forward CUDA Graph"; + } else { + VLOG(10) << "Run Forward CUDA Graph directly"; + ExecuteCUDAGraph(ctx, {"X"}, {"Out", "DOut"}, + inner_graphs[graph_idx].get()); + } +#else + PADDLE_THROW( + phi::errors::InvalidArgument("The cuda_graph_capture_mode is only " + "valid when using NVIDIA GPU.")); +#endif + } + + private: + framework::PEAndGraphPair ComputeImpl(const framework::ExecutionContext &ctx, + bool is_test, + bool use_cuda_graph) const { VLOG(2) << "RunProgramOpKernel Compute"; + framework::PEAndGraphPair pe_and_graph; // Step 1. prepare inputs, outputs, attrs auto &input_vars = ctx.MultiInputVar("X"); auto ¶m_vars = ctx.MultiInputVar("Params"); @@ -192,7 +267,6 @@ class RunProgramOpKernel : public framework::OpKernel { auto start_op_index = ctx.Attr("start_op_index"); auto end_op_index = ctx.Attr("end_op_index"); - auto is_test = ctx.Attr("is_test"); auto program_id = ctx.Attr("program_id"); // NOTE(chenweihang): In order not to add new variable type, use vector @@ -223,15 +297,29 @@ class RunProgramOpKernel : public framework::OpKernel { if (end_op_index > start_op_index) { auto *program = global_block->Program(); - auto cache_info = framework::GetExecutorInfoFromCache( - *program, ctx.GetPlace(), start_op_index, end_op_index, - /*is_grad=*/false, program_id, &scope); - auto ¶llel_executor = cache_info.first; + bool is_new_created; + if (use_cuda_graph) { + pe_and_graph = framework::CreateFixOrderExecutorInfo( + *program, ctx.GetPlace(), start_op_index, end_op_index, &scope); + is_new_created = true; + } else { + auto cache_info = framework::GetExecutorInfoFromCache( + *program, ctx.GetPlace(), start_op_index, end_op_index, + /*is_grad=*/false, program_id, &scope); + pe_and_graph.first = cache_info.first; + is_new_created = cache_info.second; + } + + auto ¶llel_executor = pe_and_graph.first; + // all out_vars are skip_eager_var + std::vector tmp_vars; auto &skip_eager_delete_vars = - framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( - program_id, false); - if (cache_info.second /*is_new_created*/) { + use_cuda_graph + ? tmp_vars + : framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, false); + if (is_new_created) { parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_var_names); skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), output_var_names.begin(), @@ -263,6 +351,7 @@ class RunProgramOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_MKLDNN if (FLAGS_use_mkldnn) platform::DontClearMKLDNNCache(ctx.GetPlace()); #endif + return pe_and_graph; } }; @@ -270,14 +359,68 @@ template class RunProgramGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { + const auto &capture_mode = ctx.Attr("cuda_graph_capture_mode"); + if (capture_mode.empty()) { + ComputeImpl(ctx, false); + return; + } + +#ifdef PADDLE_WITH_CUDA + auto mode = details::StringToCUDAGraphCaptureMode(capture_mode); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + phi::errors::InvalidArgument("The cuda_graph_capture_mode is only " + "valid when using NVIDIA GPU.")); + auto *graph_var = + const_cast(ctx.InputVar("CUDAGraph")); + PADDLE_ENFORCE_NOT_NULL( + graph_var, + phi::errors::InvalidArgument("Output(CUDAGraph) must exist when " + "cuda_graph_capture_mode is valid.")); + auto &inner_graphs = *( + graph_var + ->GetMutable>>()); + const size_t graph_idx = 2; + if (inner_graphs[graph_idx].get() == nullptr) { + framework::PEAndGraphPair pe_and_graph; + auto callable = + [this, &pe_and_graph](const framework::ExecutionContext &exe_ctx) { + pe_and_graph = ComputeImpl(exe_ctx, true); + }; + int64_t pool_id = inner_graphs[0].get() != nullptr + ? inner_graphs[0]->PoolID() + : inner_graphs[1]->PoolID(); + inner_graphs[graph_idx] = + CaptureCUDAGraph(callable, ctx, {framework::GradVarName("Out")}, + {framework::GradVarName("X")}, mode, pool_id); + VLOG(10) << "Capture Backward CUDA Graph"; + } else { + ExecuteCUDAGraph(ctx, {framework::GradVarName("Out")}, + {framework::GradVarName("X")}, + inner_graphs[graph_idx].get()); + VLOG(10) << "Run Backward CUDA Graph directly"; + } +#else + PADDLE_THROW( + phi::errors::InvalidArgument("The cuda_graph_capture_mode is only " + "valid when using NVIDIA GPU.")); +#endif + } + + private: + framework::PEAndGraphPair ComputeImpl(const framework::ExecutionContext &ctx, + bool use_cuda_graph) const { VLOG(2) << "RunProgramGradOpKernel Compute"; + framework::PEAndGraphPair pe_and_graph; // Step 1. prepare inputs and outputs auto &output_grad_vars = ctx.MultiInputVar(framework::GradVarName("Out")); auto input_grad_vars = ctx.MultiOutputVar(framework::GradVarName("X")); auto param_grad_vars = ctx.MultiOutputVar(framework::GradVarName("Params")); // if all output vars are set to stop_gradient, grad op no need to executed - if (input_grad_vars.empty() && param_grad_vars.empty()) return; + if (input_grad_vars.empty() && param_grad_vars.empty()) { + return pe_and_graph; + } auto output_grad_var_names = ctx.InputNames(framework::GradVarName("Out")); // NOTE: after PR22939 [Add double grad] merged, the grad op maker's @@ -321,15 +464,27 @@ class RunProgramGradOpKernel : public framework::OpKernel { if (end_op_index > start_op_index) { // Step 2. prepare executor and scope auto *program = global_block->Program(); - auto cache_info = framework::GetExecutorInfoFromCache( - *program, ctx.GetPlace(), start_op_index, end_op_index, - /*is_grad*/ true, program_id, &scope); - auto ¶llel_executor = cache_info.first; + bool is_new_created; + if (use_cuda_graph) { + pe_and_graph = framework::CreateFixOrderExecutorInfo( + *program, ctx.GetPlace(), start_op_index, end_op_index, &scope); + is_new_created = true; + } else { + auto cache_info = framework::GetExecutorInfoFromCache( + *program, ctx.GetPlace(), start_op_index, end_op_index, + /*is_grad*/ true, program_id, &scope); + pe_and_graph.first = cache_info.first; + is_new_created = cache_info.second; + } + auto ¶llel_executor = pe_and_graph.first; + std::vector tmp_vars; auto &skip_eager_delete_vars = - framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( - program_id, true); - if (cache_info.second /*is_new_created*/) { + use_cuda_graph + ? tmp_vars + : framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, true); + if (is_new_created) { parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, output_grad_var_names); @@ -360,6 +515,7 @@ class RunProgramGradOpKernel : public framework::OpKernel { global_inner_scope->DeleteScope(&scope); VLOG(2) << "The number of sub scopes after backward: " << global_inner_scope->kids().size(); + return pe_and_graph; } }; diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc index e02c7ade9a11a..a80d527fd5c38 100644 --- a/paddle/fluid/operators/sample_logits_op.cc +++ b/paddle/fluid/operators/sample_logits_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sample_logits_op.h" + #include + #include "paddle/fluid/operators/math/sample_prob.h" namespace paddle { diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu index 273010e5443f8..7eff9429244fc 100644 --- a/paddle/fluid/operators/sample_logits_op.cu +++ b/paddle/fluid/operators/sample_logits_op.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" @@ -146,9 +147,9 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { int threads = 512; size_t size = batch_size * num_true; int grid = (size + threads - 1) / threads; - GPUSetLabel< - T><<>>( - size, num_true, sampled_labels_data); + GPUSetLabel + <<>>( + size, num_true, sampled_labels_data); if (use_customized_samples) { const Tensor* customized_samples = @@ -190,17 +191,17 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { size = batch_size * num_take; grid = (size + threads - 1) / threads; - GPUTakeAlongD1< - T><<>>( - size, batch_size, array_slice_size, idx_slice_size, p_array, p_index, - p_value); + GPUTakeAlongD1 + <<>>( + size, batch_size, array_slice_size, idx_slice_size, p_array, + p_index, p_value); if (remove_accidental_hits) { const size_t size = batch_size * (num_true + num_samples); int grid = (size + threads - 1) / threads; - gpu_compute_remove_accidental_hits< - T><<>>( - size, num_true, idx_slice_size, p_index, p_value); + gpu_compute_remove_accidental_hits + <<>>( + size, num_true, idx_slice_size, p_index, p_value); } // subtracted sampled logits with logQ(y|x) @@ -246,10 +247,10 @@ class SampleLogitsGradCUDAKernel : public framework::OpKernel { const size_t size = batch_size; int grid = (size + threads - 1) / threads; - GPUPutAlongD1< - T><<>>( - size, batch_size, array_slice_size, idx_slice_size, p_array, p_index, - p_value); + GPUPutAlongD1 + <<>>( + size, batch_size, array_slice_size, idx_slice_size, p_array, + p_index, p_value); } }; diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h index ae741ae321292..815a2897d5d20 100644 --- a/paddle/fluid/operators/sample_logits_op.h +++ b/paddle/fluid/operators/sample_logits_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index 7fe6623dcca14..23aa88459cec1 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "paddle/fluid/operators/save_combine_op.h" +#include + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h index 8b8e27b79b96b..a419e862501f6 100644 --- a/paddle/fluid/operators/save_combine_op.h +++ b/paddle/fluid/operators/save_combine_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc index 493f5081ee42b..797321efd6c45 100644 --- a/paddle/fluid/operators/save_load_combine_op_test.cc +++ b/paddle/fluid/operators/save_load_combine_op_test.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/bfloat16.h" diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index d819c172e4a9d..02774c6b72aca 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -12,14 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/save_op.h" + #include + #include #include #include #include -#include "paddle/fluid/operators/save_op.h" - namespace paddle { namespace operators { class SaveOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h index e4ca1423afaea..64aca1ab6b71f 100644 --- a/paddle/fluid/operators/save_op.h +++ b/paddle/fluid/operators/save_op.h @@ -12,6 +12,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index cbf2b9152079e..ebc4c6441489c 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc index 40f5699a29b35..fdc98d084ed07 100644 --- a/paddle/fluid/operators/scale_op_xpu.cc +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/scale_kernel.h" diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc index 0ae0e1500c166..0cfc3a77aadb2 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cc +++ b/paddle/fluid/operators/scatter_nd_add_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 5f6b04cf59e0e..a2e8071e01353 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/fluid/operators/scatter_op_xpu.cc b/paddle/fluid/operators/scatter_op_xpu.cc index 07dd2f2d85fe9..3ab084b660a0a 100644 --- a/paddle/fluid/operators/scatter_op_xpu.cc +++ b/paddle/fluid/operators/scatter_op_xpu.cc @@ -56,11 +56,12 @@ class ScatterOpXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( index->dims().size() == 1 || (index->dims().size() == 2 && index->dims()[1] == 1), - true, platform::errors::InvalidArgument( - "index's shape is error, " - "expect index'dims shape is 1 or 2 and index.dims[1] is 1" - "but got index'dims shape is %d", - index->dims().size())); + true, + platform::errors::InvalidArgument( + "index's shape is error, " + "expect index'dims shape is 1 or 2 and index.dims[1] is 1" + "but got index'dims shape is %d", + index->dims().size())); int index_size = static_cast(index->dims()[0]); auto x_dims = x->dims(); diff --git a/paddle/fluid/operators/seed_op.cc b/paddle/fluid/operators/seed_op.cc index 837ccae0284f5..7cad6dcab7ca3 100644 --- a/paddle/fluid/operators/seed_op.cc +++ b/paddle/fluid/operators/seed_op.cc @@ -74,13 +74,12 @@ REGISTER_OP_CPU_KERNEL( seed, ops::CPUSeedKernel); /* ========================== register checkpoint ===========================*/ -REGISTER_OP_VERSION(seed) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(seed).AddCheckpoint( + R"ROC( Upgrade seed add a new attribute [force_cpu])ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "force_cpu", - "If true, Force fill output variable to cpu." - "memory. Otherwise, fill output variable to the running " - "device", - false)); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "force_cpu", + "If true, Force fill output variable to cpu." + "memory. Otherwise, fill output variable to the running " + "device", + false)); diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc index 9d4c8532a82c0..92010e8afc058 100644 --- a/paddle/fluid/operators/segment_pool_op.cc +++ b/paddle/fluid/operators/segment_pool_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/sequence_ops/CMakeLists.txt b/paddle/fluid/operators/sequence_ops/CMakeLists.txt index 0ca88409f4126..fe36afd96c5e8 100644 --- a/paddle/fluid/operators/sequence_ops/CMakeLists.txt +++ b/paddle/fluid/operators/sequence_ops/CMakeLists.txt @@ -1,6 +1,6 @@ include(operators) if(WITH_UNITY_BUILD) - # Load Unity Build rules for operators in paddle/fluid/operators/sequence_ops. - include(unity_build_rule.cmake) + # Load Unity Build rules for operators in paddle/fluid/operators/sequence_ops. + include(unity_build_rule.cmake) endif() register_operators() diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc index f6523255e2438..0f17ff1e1b7bc 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h" + #include #include diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc index d58a2da29c941..4856e38011bae 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h" + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h index 1b8525febe2d4..f27e6535d3199 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h @@ -16,6 +16,7 @@ #include #include + #include "boost/optional.hpp" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" @@ -49,7 +50,7 @@ inline framework::LoD ConcatLoD(const Container &xs, template inline std::vector> GetDataVectorSafely( - const std::vector &vec, ARGS &&... args) { + const std::vector &vec, ARGS &&...args) { std::vector> result; result.reserve(vec.size()); for (auto *ptr : vec) { diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h index 62fa5bc26aca2..1935a62621de4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/context_project.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc index 23c6a0133e1ed..ef440a580f913 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc @@ -54,10 +54,12 @@ class SequenceConvXPUKernel : public framework::OpKernel { int up_pad = std::max(0, -context_start); int down_pad = std::max(0, context_start + context_length - 1); - PADDLE_ENFORCE_EQ(up_pad, 2, platform::errors::InvalidArgument( - "Only support up_pad equal 2.")); - PADDLE_ENFORCE_EQ(down_pad, 2, platform::errors::InvalidArgument( - "Only support down_pad equal 2.")); + PADDLE_ENFORCE_EQ( + up_pad, 2, + platform::errors::InvalidArgument("Only support up_pad equal 2.")); + PADDLE_ENFORCE_EQ( + down_pad, 2, + platform::errors::InvalidArgument("Only support down_pad equal 2.")); auto xpu_context = context.template device_context().x_context(); @@ -75,8 +77,9 @@ class SequenceConvXPUKernel : public framework::OpKernel { // If batch size set to 256, the lod is {0, batch[0] - 0, // batch[1] - batch [0], ..., batch[255] - batch[254]}, // so the lod_size will be 257. - PADDLE_ENFORCE_LE(lod_size, 257, platform::errors::InvalidArgument( - "Only support batch size <= 256.")); + PADDLE_ENFORCE_LE( + lod_size, 257, + platform::errors::InvalidArgument("Only support batch size <= 256.")); std::vector cpu_lodx(lod_size); for (int i = 0; i < lod_size; i++) { @@ -155,15 +158,18 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { int up_pad = std::max(0, -context_start); int down_pad = std::max(0, context_start + context_length - 1); - PADDLE_ENFORCE_EQ(up_pad, 2, platform::errors::InvalidArgument( - "Only support up_pad equal 2.")); - PADDLE_ENFORCE_EQ(down_pad, 2, platform::errors::InvalidArgument( - "Only support down_pad equal 2.")); + PADDLE_ENFORCE_EQ( + up_pad, 2, + platform::errors::InvalidArgument("Only support up_pad equal 2.")); + PADDLE_ENFORCE_EQ( + down_pad, 2, + platform::errors::InvalidArgument("Only support down_pad equal 2.")); auto lod_level_0 = in->lod()[0]; int lod_size = lod_level_0.size(); - PADDLE_ENFORCE_LE(lod_size, 257, platform::errors::InvalidArgument( - "Only support batch size <= 256.")); + PADDLE_ENFORCE_LE( + lod_size, 257, + platform::errors::InvalidArgument("Only support batch size <= 256.")); std::vector cpu_lodx(lod_size); for (int i = 0; i < lod_size; i++) { diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu index 9591f3e8b5bbf..0f47e8a9c2a98 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc index 79503d9714f5b..552a8283b3671 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu index 12d3eee65da70..a87c327922425 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.h b/paddle/fluid/operators/sequence_ops/sequence_erase_op.h index ed98b694b2754..8d10ee508a22d 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc index 494c8e3ab74a0..01e9835270cac 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" + #include #include diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index 7e1a06b9eca5b..5cc4ecdd12aa3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h index 117fa504ff354..5abe6df09e52d 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include // std::iota #include #include + #include "glog/logging.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc index e4f2c1b2b8fd1..4817b003a2870 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" + #include namespace paddle { @@ -64,10 +65,11 @@ class SequenceExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ref_level == -1 || (ref_level >= 0 && ref_level < static_cast(y_lod.size())), - true, platform::errors::InvalidArgument( - "Invlid `ref_level`, which should be either equal to -1 " - "or in [0, %d), but received `ref_level` = %u.", - y_lod.size(), ref_level)); + true, + platform::errors::InvalidArgument( + "Invlid `ref_level`, which should be either equal to -1 " + "or in [0, %d), but received `ref_level` = %u.", + y_lod.size(), ref_level)); if (ref_level == -1) ref_level = y_lod.size() - 1; diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index 7b7bc5183bf1f..90f911c438bc9 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc index f22b424b30735..060a3e7cab332 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc index 5d0e1d0194edd..7d018e764bdc9 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h" + #include #include diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h index 3aaa2828d5bfb..d4022e80d8000 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/math/sequence_padding.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index 01990ebb73291..af42285158bcb 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h" + #include #include @@ -30,11 +31,12 @@ class SequencePoolOp : public framework::OperatorWithKernel { if (!ctx->IsRuntime()) { // Check the lod_level for compile-time. auto in_lod_level = ctx->GetLoDLevel("X"); - PADDLE_ENFORCE_GT(in_lod_level, 0, platform::errors::InvalidArgument( - "The LoD level of Input(X) should " - "be larger than 0, but received: " - "lod level %u.", - in_lod_level)); + PADDLE_ENFORCE_GT( + in_lod_level, 0, + platform::errors::InvalidArgument("The LoD level of Input(X) should " + "be larger than 0, but received: " + "lod level %u.", + in_lod_level)); ctx->SetLoDLevel("Out", in_lod_level - 1); } diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h index 4d981e0187aca..96d02e6d2e542 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/sequence_pooling.h" @@ -38,9 +39,10 @@ class SequencePoolKernel : public framework::OpKernel { auto lod = in->lod(); auto lod_level = lod.size(); // InferShape by lod - PADDLE_ENFORCE_GT(lod_level, 0, platform::errors::InvalidArgument( - "Input(X) Tensor of SequencePoolOp " - "does not contain LoD information.")); + PADDLE_ENFORCE_GT( + lod_level, 0, + platform::errors::InvalidArgument("Input(X) Tensor of SequencePoolOp " + "does not contain LoD information.")); PADDLE_ENFORCE_LE(lod_level, 2UL, platform::errors::InvalidArgument( "The lod level of input shall be no more than 2." diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc index 980879db4d06e..3a62bc554df2c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/operators/sequence_ops/sequence_reshape_op.h" + #include + #include "paddle/phi/core/ddim.h" namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h index 90a17d713cf29..85282bf23b48c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/algorithm.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index 25c12ab565a14..6fa151af4e117 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_scatter_op.h" + #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc index 06fb444740fee..fdb24892e09a2 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index e7585f7ab0487..e3f8d16a7ade9 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index c91c59dbfee99..0d91832948dc8 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -134,10 +134,10 @@ struct SequenceSoftmaxFunctor { dim3 block_size(thread_x); dim3 grid_size(max_blocks); paddle::framework::MixVector mixv_ref_lod(&ref_lod); - sequence_softmax_kernel< - T, kThreadsPerBlock><<>>( - x.data(), mixv_ref_lod.CUDAData(context.GetPlace()), height, - out->mutable_data(context.GetPlace())); + sequence_softmax_kernel + <<>>( + x.data(), mixv_ref_lod.CUDAData(context.GetPlace()), height, + out->mutable_data(context.GetPlace())); } }; @@ -158,11 +158,11 @@ struct SequenceSoftmaxGradFunctor { dim3 grid_size(max_blocks); paddle::framework::MixVector mixv_ref_lod(&ref_lod); - sequence_softmax_grad_kernel< - T, kThreadsPerBlock><<>>( - dout.data(), out.data(), - mixv_ref_lod.CUDAData(context.GetPlace()), height, - dx->mutable_data(context.GetPlace())); + sequence_softmax_grad_kernel + <<>>( + dout.data(), out.data(), + mixv_ref_lod.CUDAData(context.GetPlace()), height, + dx->mutable_data(context.GetPlace())); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc index bacdd7e4ccb74..b1d5ec8e9c65e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h" + #include #include @@ -44,8 +45,9 @@ class SequenceTopkAvgPoolingOp : public framework::OperatorWithKernel { auto topks = attr.Get>("topks"); auto num_k = topks.size(); PADDLE_ENFORCE_GT( - num_k, 0, platform::errors::InvalidArgument( - "Expected topks.size() > 0, but received %zu.", num_k)); + num_k, 0, + platform::errors::InvalidArgument( + "Expected topks.size() > 0, but received %zu.", num_k)); auto row_dim = ctx->GetInputDim("ROW"); auto row_shape_0 = row_dim[0]; diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h index 47180f123fa78..b5ee43387b35e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc index 180d14cfada31..636be3b2f6ca7 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h" + #include #include diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h index d96dc91f3bc16..d643ef860c3ca 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/math/sequence_padding.h" diff --git a/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake b/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake index 9ccc4432df5cd..9a87e27b24197 100644 --- a/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake +++ b/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake @@ -4,36 +4,38 @@ # Generally, the combination rules in this file do not need to be modified. # If there are some redefined error in compiling with the source file which # in combination rule, you can remove the source file from the following rules. -register_unity_group(cc - sequence_concat_op.cc - sequence_conv_op.cc - sequence_enumerate_op.cc - sequence_erase_op.cc - sequence_expand_op.cc - sequence_mask_op.cc - sequence_pad_op.cc - sequence_pool_op.cc - sequence_expand_as_op.cc - sequence_reshape_op.cc - sequence_reverse_op.cc - sequence_scatter_op.cc - sequence_slice_op.cc - sequence_softmax_op.cc - sequence_topk_avg_pooling_op.cc - sequence_unpad_op.cc - sequence_concat_op.cu.cc - sequence_conv_op.cu.cc) -register_unity_group(cu - sequence_enumerate_op.cu - sequence_erase_op.cu - sequence_expand_op.cu - sequence_mask_op.cu - sequence_pad_op.cu - sequence_pool_op.cu - sequence_expand_as_op.cu - sequence_reshape_op.cu - sequence_reverse_op.cu - sequence_slice_op.cu - sequence_softmax_cudnn_op.cu.cc - sequence_softmax_op.cu - sequence_unpad_op.cu) +register_unity_group( + cc + sequence_concat_op.cc + sequence_conv_op.cc + sequence_enumerate_op.cc + sequence_erase_op.cc + sequence_expand_op.cc + sequence_mask_op.cc + sequence_pad_op.cc + sequence_pool_op.cc + sequence_expand_as_op.cc + sequence_reshape_op.cc + sequence_reverse_op.cc + sequence_scatter_op.cc + sequence_slice_op.cc + sequence_softmax_op.cc + sequence_topk_avg_pooling_op.cc + sequence_unpad_op.cc + sequence_concat_op.cu.cc + sequence_conv_op.cu.cc) +register_unity_group( + cu + sequence_enumerate_op.cu + sequence_erase_op.cu + sequence_expand_op.cu + sequence_mask_op.cu + sequence_pad_op.cu + sequence_pool_op.cu + sequence_expand_as_op.cu + sequence_reshape_op.cu + sequence_reverse_op.cu + sequence_slice_op.cu + sequence_softmax_cudnn_op.cu.cc + sequence_softmax_op.cu + sequence_unpad_op.cu) diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 73655bcb18500..4adedf09aa354 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -18,7 +18,6 @@ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" - #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index daa033f9dc66d..2231eb212a2bc 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" - #include "paddle/phi/kernels/funcs/slice_utils.h" namespace paddle { diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index 9001ce5d51dec..38482f7b55edf 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc index a62d1b434e764..d4c7d937d4b7b 100644 --- a/paddle/fluid/operators/shape_op_xpu.cc +++ b/paddle/fluid/operators/shape_op_xpu.cc @@ -11,6 +11,7 @@ #ifdef PADDLE_WITH_XPU #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/share_buffer_op.h b/paddle/fluid/operators/share_buffer_op.h index 1d0abf14f577e..1b564c3bef09f 100644 --- a/paddle/fluid/operators/share_buffer_op.h +++ b/paddle/fluid/operators/share_buffer_op.h @@ -27,8 +27,9 @@ class ShareBufferOpKernel : public framework::OpKernel { const auto inputs = ctx.MultiInput("X"); auto outputs = ctx.MultiOutput("Out"); size_t n = inputs.size(); - PADDLE_ENFORCE_EQ(n, outputs.size(), platform::errors::PermissionDenied( - "Variable number not match.")); + PADDLE_ENFORCE_EQ( + n, outputs.size(), + platform::errors::PermissionDenied("Variable number not match.")); const auto &share_dims_and_dtype = ctx.Attr>("share_dims_and_dtype"); if (!share_dims_and_dtype.empty()) { diff --git a/paddle/fluid/operators/share_data_op.cc b/paddle/fluid/operators/share_data_op.cc index 6fcc29e900261..63e8cb648e84b 100644 --- a/paddle/fluid/operators/share_data_op.cc +++ b/paddle/fluid/operators/share_data_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/share_data_op.h" + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -31,8 +32,9 @@ class ShareDataOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( in_type == framework::proto::VarType::LOD_TENSOR || in_type == framework::proto::VarType::SELECTED_ROWS, - true, platform::errors::InvalidArgument( - "Type of Variable[X] must be LoDTensor or SelectedRows!")); + true, + platform::errors::InvalidArgument( + "Type of Variable[X] must be LoDTensor or SelectedRows!")); PADDLE_ENFORCE_EQ( in_type, out_type, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc index 1a3666ad82368..7388144dda320 100644 --- a/paddle/fluid/operators/shrink_rnn_memory_op.cc +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/array_operator.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/phi/core/lod_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/shuffle_batch_op.cc b/paddle/fluid/operators/shuffle_batch_op.cc index 45f7ab278a3c1..e338b48a4ccaa 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cc +++ b/paddle/fluid/operators/shuffle_batch_op.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/operators/shuffle_batch_op.h" + #include + #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/var_type_inference.h" diff --git a/paddle/fluid/operators/shuffle_batch_op.h b/paddle/fluid/operators/shuffle_batch_op.h index 2708b4a392d17..f56832f959919 100644 --- a/paddle/fluid/operators/shuffle_batch_op.h +++ b/paddle/fluid/operators/shuffle_batch_op.h @@ -21,6 +21,7 @@ #include #include #include + #include "glog/logging.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 70fddc9b04712..c43d456e94e47 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/shuffle_channel_op.h" + #include #include @@ -61,8 +62,9 @@ class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("group", "the number of groups.") .SetDefault(1) .AddCustomChecker([](const int& group) { - PADDLE_ENFORCE_GE(group, 1, platform::errors::InvalidArgument( - "group should be larger than 0.")); + PADDLE_ENFORCE_GE(group, 1, + platform::errors::InvalidArgument( + "group should be larger than 0.")); }); AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index 582d1ea0f26af..d3f6224594be3 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -67,10 +67,10 @@ class ShuffleChannelOpCUDAKernel : public framework::OpKernel { const T* input_data = input->data(); T* output_data = output->mutable_data(ctx.GetPlace()); - ShuffleChannel< - T><<>>( - count, feature_map_size, output_data, input_data, group_row, - group_column, sp_sz); + ShuffleChannel + <<>>( + count, feature_map_size, output_data, input_data, group_row, + group_column, sp_sz); } }; @@ -103,10 +103,10 @@ class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { int threads = kNumCUDAThreads; int count = num * group_column * group_row * sp_sz; - ShuffleChannel< - T><<>>( - count, feature_map_size, input_grad_data, output_grad_data, group_row, - group_column, sp_sz); + ShuffleChannel + <<>>( + count, feature_map_size, input_grad_data, output_grad_data, + group_row, group_column, sp_sz); } }; } // namespace operators diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h index aeaac486f3f2b..409acdfdff7ba 100644 --- a/paddle/fluid/operators/shuffle_channel_op.h +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -12,6 +12,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 016ff54645b02..0cf1296fce650 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/similarity_focus_op.h b/paddle/fluid/operators/similarity_focus_op.h index 4fa4d772aa3a9..17ea30277b85d 100644 --- a/paddle/fluid/operators/similarity_focus_op.h +++ b/paddle/fluid/operators/similarity_focus_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -67,16 +68,16 @@ class SimilarityFocusKernel : public framework::OpKernel { std::vector> array(array_size); - bool (*cmp)(std::pair, std::pair) = []( - std::pair x, std::pair y) { - return x.first > y.first; - }; + bool (*cmp)(std::pair, std::pair) = + [](std::pair x, std::pair y) { + return x.first > y.first; + }; - int64_t (*compute_index)(int64_t*, int, int, int, int) = []( - int64_t* dim, int d1, int d2, int d3, int d4) { - return d1 * dim[1] * dim[2] * dim[3] + d2 * dim[2] * dim[3] + - d3 * dim[3] + d4; - }; + int64_t (*compute_index)(int64_t*, int, int, int, int) = + [](int64_t* dim, int d1, int d2, int d3, int d4) { + return d1 * dim[1] * dim[2] * dim[3] + d2 * dim[2] * dim[3] + + d3 * dim[3] + d4; + }; PADDLE_ENFORCE_GT( axis, 0, diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index c6432d00e9de1..a815e12d061cf 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/slice_op.h" + #include #include #include #include + #include "paddle/phi/kernels/funcs/slice_utils.h" namespace paddle { @@ -85,8 +87,9 @@ class SliceOp : public framework::OperatorWithKernel { } if (ctx->HasInputs("EndsTensorList")) { ends_size = ctx->Inputs("EndsTensorList").size(); - PADDLE_ENFORCE_GT(ends_size, 0, platform::errors::InvalidArgument( - "EndsTensorList size can't be zero")); + PADDLE_ENFORCE_GT(ends_size, 0, + platform::errors::InvalidArgument( + "EndsTensorList size can't be zero")); } if (!ctx->HasInput("StartsTensor")) { diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index a9a98b46d5eb7..f18ffef3f5834 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/utils.h" diff --git a/paddle/fluid/operators/slice_op_mlu.cc b/paddle/fluid/operators/slice_op_mlu.cc index 43322e4b2e75b..7645232ec0cbc 100644 --- a/paddle/fluid/operators/slice_op_mlu.cc +++ b/paddle/fluid/operators/slice_op_mlu.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/slice_op.h" - #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/slice_op.h" #include "paddle/phi/kernels/funcs/slice_utils.h" namespace paddle { diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index 0d0d9ab19df30..3441453430e5c 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/slice_op.h" - #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/funcs/slice_utils.h" diff --git a/paddle/fluid/operators/slice_op_xpu.cc b/paddle/fluid/operators/slice_op_xpu.cc index 6ac1027b0ce19..8f2dfd38d491b 100644 --- a/paddle/fluid/operators/slice_op_xpu.cc +++ b/paddle/fluid/operators/slice_op_xpu.cc @@ -13,11 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/slice_op.h" #include #include #include #include + +#include "paddle/fluid/operators/slice_op.h" #include "xpu/refactor/math.h" namespace paddle { @@ -53,8 +54,9 @@ class SliceXPUKernel : public framework::OpKernel { start = std::max(start, 0); end = std::max(end, 0); end = std::min(end, dim_value); - PADDLE_ENFORCE_GT(end, start, platform::errors::InvalidArgument( - "end should greater than start")); + PADDLE_ENFORCE_GT( + end, start, + platform::errors::InvalidArgument("end should greater than start")); starts[i] = start; ends[i] = end; } diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc index c0318d344aef3..05204354d0912 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.cc +++ b/paddle/fluid/operators/smooth_l1_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/smooth_l1_loss_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc index 136ea68ac9efe..bdc46abff2ad2 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc +++ b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/smooth_l1_loss_op.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/smooth_l1_loss_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 3840b99dd176d..7304467833a90 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -61,8 +61,9 @@ class SoftmaxOp : public framework::OperatorWithKernel { if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()) || platform::is_xpu_place(ctx.GetPlace()), - true, platform::errors::InvalidArgument( - "float16 can only be used on GPU/XPU place")); + true, + platform::errors::InvalidArgument( + "float16 can only be used on GPU/XPU place")); } #endif diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc index c07467a9b0ba3..4b55f5af09dc6 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc @@ -44,8 +44,9 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel { Tensor* loss = context.Output("Loss"); const int rank = logits->dims().size(); const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); - PADDLE_ENFORCE_EQ(axis, rank - 1, platform::errors::InvalidArgument( - "axis should == rank - 1")); + PADDLE_ENFORCE_EQ( + axis, rank - 1, + platform::errors::InvalidArgument("axis should == rank - 1")); softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); const int n = phi::funcs::SizeToAxis(axis, logits->dims()); @@ -140,8 +141,9 @@ class SoftmaxWithCrossEntropyGradXPUKernel : public framework::OpKernel { const int rank = logit_grad->dims().size(); const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); - PADDLE_ENFORCE_EQ(axis, rank - 1, platform::errors::InvalidArgument( - "axis should == rank - 1")); + PADDLE_ENFORCE_EQ( + axis, rank - 1, + platform::errors::InvalidArgument("axis should == rank - 1")); const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims()); const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims()); diff --git a/paddle/fluid/operators/solve_op.cc b/paddle/fluid/operators/solve_op.cc index 57302ae034271..4d23f1ce20945 100644 --- a/paddle/fluid/operators/solve_op.cc +++ b/paddle/fluid/operators/solve_op.cc @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/solve_op.h" + #include #include #include #include + #include "paddle/phi/core/ddim.h" namespace paddle { diff --git a/paddle/fluid/operators/solve_op.h b/paddle/fluid/operators/solve_op.h index 7f3a574866604..928fbf755d7f7 100644 --- a/paddle/fluid/operators/solve_op.h +++ b/paddle/fluid/operators/solve_op.h @@ -92,9 +92,10 @@ static framework::DDim GetOutputShapeUnsqueeze( for (int axis : unsqz_dims) { int cur = axis < 0 ? axis + cur_output_size + 1 : axis; // Vaildity Check: the axis bound - PADDLE_ENFORCE_GE(cur, 0, platform::errors::InvalidArgument( - "The insert dimension value should " - "not be less than 0")); + PADDLE_ENFORCE_GE( + cur, 0, + platform::errors::InvalidArgument("The insert dimension value should " + "not be less than 0")); PADDLE_ENFORCE_LE(cur, cur_output_size, platform::errors::InvalidArgument( "The insert dimension value shoule not be larger " diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc index 013467396b3a6..6a6972f3293e4 100644 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ b/paddle/fluid/operators/space_to_depth_op.cc @@ -38,8 +38,9 @@ class SpaceToDepthOp : public framework::OperatorWithKernel { "Output(Out) of SpaceToDepthOp should not be null.")); auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, platform::errors::InvalidArgument( - "input should be a 4D tensor")); + PADDLE_ENFORCE_EQ( + x_dims.size(), 4, + platform::errors::InvalidArgument("input should be a 4D tensor")); auto blocksize = ctx->Attrs().Get("blocksize"); PADDLE_ENFORCE_GT(blocksize, 1, diff --git a/paddle/fluid/operators/sparse_attention_op.cc b/paddle/fluid/operators/sparse_attention_op.cc index a6534543a6515..14d1ffe3f11b0 100644 --- a/paddle/fluid/operators/sparse_attention_op.cc +++ b/paddle/fluid/operators/sparse_attention_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu index 49f8263ab289a..2949642d2f3dd 100644 --- a/paddle/fluid/operators/sparse_attention_op.cu +++ b/paddle/fluid/operators/sparse_attention_op.cu @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #if defined(PADDLE_WITH_CUDA) @@ -90,17 +92,15 @@ __global__ void BlockSparseSoftmaxForward(T* softmax, const T* src, T scale, if (cur_block_col < cur_block_nnz) { // read kp mask T cur_kp_mask; - if ((kp_mask != nullptr) && - std::abs(kp_mask[colindex[cur_block_col]]) < - std::numeric_limits::epsilon()) { + if ((kp_mask != nullptr) && std::abs(kp_mask[colindex[cur_block_col]]) < + std::numeric_limits::epsilon()) { cur_kp_mask = -std::numeric_limits::infinity(); } else { cur_kp_mask = 0; } // do mask operation - if ((attnptr != nullptr) && - std::abs(attnptr[colindex[cur_block_col]]) < - std::numeric_limits::epsilon()) { + if ((attnptr != nullptr) && std::abs(attnptr[colindex[cur_block_col]]) < + std::numeric_limits::epsilon()) { srcdata[cur_reg_index] = -std::numeric_limits::infinity() * scale + cur_kp_mask; } else { @@ -280,37 +280,37 @@ void SparseSoftmaxBackward(const platform::CUDADeviceContext& ctx, T scaling = static_cast(1.0) / sqrt(static_cast(num_cols)); if (num_cols <= 4) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else if (num_cols > 4 && num_cols <= 8) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else if (num_cols > 8 && num_cols <= 16) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else if (num_cols > 16 && num_cols <= 32) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else if (num_cols > 32 && num_cols <= 64) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else if (num_cols > 64 && num_cols <= 128) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else if (num_cols > 128 && num_cols <= 256) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else if (num_cols > 256 && num_cols <= 512) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else { PADDLE_THROW(platform::errors::InvalidArgument( "The head_dim of query in sparse_attention op should less or equal " diff --git a/paddle/fluid/operators/spectral_norm_op.h b/paddle/fluid/operators/spectral_norm_op.h index ee75c96c23a9f..765b9a4dbfae6 100644 --- a/paddle/fluid/operators/spectral_norm_op.h +++ b/paddle/fluid/operators/spectral_norm_op.h @@ -11,6 +11,7 @@ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc index 0270f7e0576c8..cd2053b4ef083 100644 --- a/paddle/fluid/operators/spectral_op.cc +++ b/paddle/fluid/operators/spectral_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/spectral_op.h" + #include "paddle/fluid/operators/spectral_helper.h" namespace paddle { diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h index 71b54caf5ee79..4900e88fbe18f 100644 --- a/paddle/fluid/operators/spectral_op.h +++ b/paddle/fluid/operators/spectral_op.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index dc20952903ab2..6c60c1a17e017 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/split_op.h" + #include #include "paddle/fluid/framework/infershape_utils.h" diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h index cf44f341b2b64..143e1d72868a1 100644 --- a/paddle/fluid/operators/split_op.h +++ b/paddle/fluid/operators/split_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/utils.h" #include "paddle/phi/kernels/split_kernel.h" diff --git a/paddle/fluid/operators/split_op_mlu.cc b/paddle/fluid/operators/split_op_mlu.cc index adc3ea14e32d6..0d438854673cb 100644 --- a/paddle/fluid/operators/split_op_mlu.cc +++ b/paddle/fluid/operators/split_op_mlu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/split_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/split_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/split_op_xpu.cc b/paddle/fluid/operators/split_op_xpu.cc index 8f02d8157b202..b24d0a70b05b0 100644 --- a/paddle/fluid/operators/split_op_xpu.cc +++ b/paddle/fluid/operators/split_op_xpu.cc @@ -12,9 +12,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/split_op.h" #include #include + +#include "paddle/fluid/operators/split_op.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc index b1e0127f4cf91..05230399b300a 100644 --- a/paddle/fluid/operators/spp_op.cc +++ b/paddle/fluid/operators/spp_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/spp_op.h" + #include #include namespace paddle { diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index aa944cfcfbb17..cd81ade1f9d81 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/strided_memcpy.h" diff --git a/paddle/fluid/operators/squeeze_op_mlu.cc b/paddle/fluid/operators/squeeze_op_mlu.cc new file mode 100644 index 0000000000000..d492846b41c11 --- /dev/null +++ b/paddle/fluid/operators/squeeze_op_mlu.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_MLU +#include +#include + +#include "paddle/fluid/operators/squeeze_op.h" +#include "paddle/fluid/platform/device/mlu/device_context.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL( + squeeze, ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel); + +REGISTER_OP_MLU_KERNEL( + squeeze_grad, ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel); + +REGISTER_OP_MLU_KERNEL( + squeeze2, ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel); + +REGISTER_OP_MLU_KERNEL( + squeeze2_grad, ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel); +#endif diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc index 6fc80ca379f3f..6b0a0657afba8 100644 --- a/paddle/fluid/operators/stack_op.cc +++ b/paddle/fluid/operators/stack_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc index 9d4ef0ffa20e2..c3e6e333e4cf4 100644 --- a/paddle/fluid/operators/stack_op_npu.cc +++ b/paddle/fluid/operators/stack_op_npu.cc @@ -30,8 +30,9 @@ class StackNPUKernel : public framework::OpKernel { if (axis < 0) axis += (x[0]->dims().size() + 1); int num = static_cast(x.size()); - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "number of input Tensor <= 0")); + PADDLE_ENFORCE_GT( + num, 0, + platform::errors::InvalidArgument("number of input Tensor <= 0")); auto stream = ctx.template device_context() @@ -59,8 +60,9 @@ class StackGradNPUKernel : public framework::OpKernel { if (axis < 0) axis += dy->dims().size(); int num = dy->dims()[axis]; - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "number of input Tensor <= 0")); + PADDLE_ENFORCE_GT( + num, 0, + platform::errors::InvalidArgument("number of input Tensor <= 0")); auto stream = ctx.template device_context() diff --git a/paddle/fluid/operators/stack_op_xpu.cc b/paddle/fluid/operators/stack_op_xpu.cc index baaa2b4884ce3..925fcc08615ac 100644 --- a/paddle/fluid/operators/stack_op_xpu.cc +++ b/paddle/fluid/operators/stack_op_xpu.cc @@ -15,6 +15,7 @@ #ifdef PADDLE_WITH_XPU #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" diff --git a/paddle/fluid/operators/stft_op.cc b/paddle/fluid/operators/stft_op.cc index 7d4103ddf3859..36e867417291c 100644 --- a/paddle/fluid/operators/stft_op.cc +++ b/paddle/fluid/operators/stft_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/stft_op.h" + #include "paddle/fluid/operators/spectral_helper.h" namespace paddle { diff --git a/paddle/fluid/operators/stft_op.h b/paddle/fluid/operators/stft_op.h index e75c59232bcae..cc17ed9a43cc1 100644 --- a/paddle/fluid/operators/stft_op.h +++ b/paddle/fluid/operators/stft_op.h @@ -17,7 +17,6 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" - #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/frame_op.h" #include "paddle/fluid/operators/spectral_op.h" diff --git a/paddle/fluid/operators/strided_slice_op_npu.cc b/paddle/fluid/operators/strided_slice_op_npu.cc index b142b8f099b89..80952e9b5560c 100644 --- a/paddle/fluid/operators/strided_slice_op_npu.cc +++ b/paddle/fluid/operators/strided_slice_op_npu.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/funcs/strided_slice.h" #include "paddle/fluid/operators/slice_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/strided_slice.h" namespace paddle { namespace operators { @@ -186,14 +186,16 @@ class StridedSliceNPUKernel : public framework::OpKernel { out->Resize(out_dims); out->mutable_data(place); - const auto& runner = NpuOpRunner( - "StridedSlice", {*in, starts_indices_tensor, ends_indices_tensor, - strides_indices_tensor}, - {*out}, {{"begin_mask", 0}, - {"end_mask", 0}, - {"ellipsis_mask", 0}, - {"new_axis_mask", 0}, - {"shrink_axis_mask", 0}}); + const auto& runner = + NpuOpRunner("StridedSlice", + {*in, starts_indices_tensor, ends_indices_tensor, + strides_indices_tensor}, + {*out}, + {{"begin_mask", 0}, + {"end_mask", 0}, + {"ellipsis_mask", 0}, + {"new_axis_mask", 0}, + {"shrink_axis_mask", 0}}); runner.Run(stream); if (need_reverse) { diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc index 42047021b408a..9e4089680f420 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.cc +++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc @@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/string/faster_tokenizer_op.h" + #include #include +#include #include #include #include @@ -22,10 +25,7 @@ limitations under the License. */ #include #include -#include - #include "paddle/fluid/framework/string_array.h" -#include "paddle/fluid/operators/string/faster_tokenizer_op.h" namespace paddle { namespace operators { @@ -38,12 +38,11 @@ using std::ifstream; using std::int64_t; using std::min; using std::runtime_error; -using std::unordered_map; -using std::unordered_set; using std::shared_ptr; using std::size_t; -using std::int64_t; using std::string; +using std::unordered_map; +using std::unordered_set; using std::vector; using std::wstring; diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.h b/paddle/fluid/operators/string/faster_tokenizer_op.h index 446be3a1999fc..a6b8bfea59c47 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.h +++ b/paddle/fluid/operators/string/faster_tokenizer_op.h @@ -26,15 +26,14 @@ namespace operators { using std::endl; using std::int64_t; +using std::shared_ptr; using std::size_t; using std::string; -using std::shared_ptr; -using std::vector; using std::unordered_map; using std::unordered_set; using std::vector; -using std::wstring; using std::wcout; +using std::wstring; inline bool IsControl(const wchar_t& ch); inline bool IsChineseChar(const wchar_t& ch); diff --git a/paddle/fluid/operators/string/unity_build_rule.cmake b/paddle/fluid/operators/string/unity_build_rule.cmake index a4b209d2df13e..90922407ec712 100644 --- a/paddle/fluid/operators/string/unity_build_rule.cmake +++ b/paddle/fluid/operators/string/unity_build_rule.cmake @@ -4,5 +4,4 @@ # Generally, the combination rules in this file do not need to be modified. # If there are some redefined error in compiling with the source file which # in combination rule, you can remove the source file from the following rules. -register_unity_group(cc - faster_tokenizer_op.cc) \ No newline at end of file +register_unity_group(cc faster_tokenizer_op.cc) diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 51040544fac34..bc6997e36ebf7 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -36,9 +36,8 @@ class SumOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "sum"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "sum"); - if (ctx->IsRuntime() && - ctx->GetOutputsVarType("Out")[0] == - framework::proto::VarType::LOD_TENSOR_ARRAY) { + if (ctx->IsRuntime() && ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR_ARRAY) { return; // skip runtime infershape when is tensor array; } @@ -47,11 +46,12 @@ class SumOp : public framework::OperatorWithKernel { auto N = x_dims.size(); PADDLE_ENFORCE_GT( - N, 0, platform::errors::InvalidArgument( - "The input tensor X's dimensions of SumOp " - "should be larger than 0. But received X's dimensions %d, " - "X's shape = [%s].", - N, &x_dims)); + N, 0, + platform::errors::InvalidArgument( + "The input tensor X's dimensions of SumOp " + "should be larger than 0. But received X's dimensions %d, " + "X's shape = [%s].", + N, &x_dims)); if (N == 1) { VLOG(3) << "Warning: SumOp have only one input, may waste memory"; } @@ -115,8 +115,9 @@ class SumOp : public framework::OperatorWithKernel { framework::LibraryType library{framework::LibraryType::kPlain}; framework::DataLayout layout{framework::DataLayout::kAnyLayout}; - PADDLE_ENFORCE_GT(x_vars.size(), 0, platform::errors::InvalidArgument( - "Input[X] should not be empty")); + PADDLE_ENFORCE_GT( + x_vars.size(), 0, + platform::errors::InvalidArgument("Input[X] should not be empty")); PADDLE_ENFORCE_NOT_NULL( x_vars[0], platform::errors::NotFound( diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 8c6c083cde880..3bf249425c2ce 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/sum_op.h" @@ -205,8 +206,8 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { reinterpret_cast(tmp_sr_in_out_array->ptr()); ComputeKernelParameter(length); - SumSelectedRowsCUDAKernel<<>>( - sr_in_out_array_data, length, rows); + SumSelectedRowsCUDAKernel + <<>>(sr_in_out_array_data, length, rows); dst_write = true; } } diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 3c51b3398be4e..8c1e3a3dbf191 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -11,6 +11,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/sum_op_mlu.cc b/paddle/fluid/operators/sum_op_mlu.cc index 179c038e83716..68e31c364b64b 100644 --- a/paddle/fluid/operators/sum_op_mlu.cc +++ b/paddle/fluid/operators/sum_op_mlu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/sum_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sum_op_xpu.cc b/paddle/fluid/operators/sum_op_xpu.cc index 5899591549eac..a1cdaddd11b42 100644 --- a/paddle/fluid/operators/sum_op_xpu.cc +++ b/paddle/fluid/operators/sum_op_xpu.cc @@ -11,8 +11,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/sum_op.h" #include + +#include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index 166f49999d552..468c658e5e640 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -15,9 +15,11 @@ #pragma once #include + #include #include #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/diag_op.h" @@ -101,20 +103,22 @@ struct RealMulComplexFunctor { // y: complex number (c+0j) pretend to be a real number // out: complex number (ac+bcj) inline HOSTDEVICE T operator()(T x, T y) { - PADDLE_ENFORCE_LT(y.imag, 1e-6, platform::errors::InvalidArgument( - "The image part of y must to be 0" - "but got [%d]", - y.imag)); + PADDLE_ENFORCE_LT( + y.imag, 1e-6, + platform::errors::InvalidArgument("The image part of y must to be 0" + "but got [%d]", + y.imag)); return platform::complex>(x.real * y.real, x.imag * y.real); } }; static std::vector GetBroadcastShape(InTensors ins) { - PADDLE_ENFORCE_EQ(ins.size(), 2, platform::errors::InvalidArgument( - "GetBroadcastShape Receive 2 tensors" - "but got [%d]", - ins.size())); + PADDLE_ENFORCE_EQ( + ins.size(), 2, + platform::errors::InvalidArgument("GetBroadcastShape Receive 2 tensors" + "but got [%d]", + ins.size())); auto x_dim = ins[0]->dims(); auto y_dim = ins[1]->dims(); std::vector broadcast_shape = @@ -596,8 +600,9 @@ struct DeviceIndependenceTensorOperations { attrs["lower"] = lower; NameInTensorMap inputs({{"X", {&x}}}); int x_rank = x.dims().size(); - PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument( - "Rank must be at least 2.")); + PADDLE_ENFORCE_GE( + x_rank, 2, + platform::errors::InvalidArgument("Rank must be at least 2.")); std::vector out_shape = phi::vectorize(x.dims()); return CreateOpRunAndReturnTensor("tril_triu", inputs, attrs, out_shape); } diff --git a/paddle/fluid/operators/svd_op.cc b/paddle/fluid/operators/svd_op.cc index 3ca7320114a8a..e68b013d2fb62 100644 --- a/paddle/fluid/operators/svd_op.cc +++ b/paddle/fluid/operators/svd_op.cc @@ -13,10 +13,12 @@ // limitations under the License. #include "paddle/fluid/operators/svd_op.h" + #include #include #include #include + #include "paddle/phi/core/ddim.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" diff --git a/paddle/fluid/operators/svd_op.cu b/paddle/fluid/operators/svd_op.cu index e987589e83c19..317ea7c5363b9 100644 --- a/paddle/fluid/operators/svd_op.cu +++ b/paddle/fluid/operators/svd_op.cu @@ -16,8 +16,10 @@ limitations under the License. */ // HIP not support cusolver #include + #include #include + #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/svd_op.h" #include "paddle/fluid/platform/dynload/cusolver.h" diff --git a/paddle/fluid/operators/svd_op.h b/paddle/fluid/operators/svd_op.h index 42a847206a3cb..1008a69e6de0f 100644 --- a/paddle/fluid/operators/svd_op.h +++ b/paddle/fluid/operators/svd_op.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/svd_helper.h" diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu.h b/paddle/fluid/operators/sync_batch_norm_op.cu.h index 17c96544988b6..9818aa3651baf 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu.h +++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h @@ -137,7 +137,7 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, const float momentum, const bool is_test, const bool use_global_stats - ) { +) { const auto &x_dims = x->dims(); PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::InvalidArgument( @@ -178,13 +178,11 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, const int threads = 256; int grid = std::min(C, (max_threads + threads - 1) / threads); if (layout == framework::DataLayout::kNCHW) { - KeLocalStats<<>>( - x_d, N, H * W * D, C, stats); + KeLocalStats + <<>>(x_d, N, H * W * D, C, stats); } else { - KeLocalStats<<>>( - x_d, N, H * W * D, C, stats); + KeLocalStats + <<>>(x_d, N, H * W * D, C, stats); } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -221,13 +219,13 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; if (layout == framework::DataLayout::kNCHW) { - KeNormAffine<<>>( - x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel, - y_d); + KeNormAffine + <<>>(x_d, s_d, b_d, mean_data, var_data, + epsilon, C, H * W * D, x_numel, y_d); } else { - KeNormAffine<<>>( - x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel, - y_d); + KeNormAffine + <<>>(x_d, s_d, b_d, mean_data, var_data, + epsilon, C, H * W * D, x_numel, y_d); } } @@ -436,30 +434,30 @@ void SyncBatchNormGradFunctor( if (is_inplace) { if (layout == framework::DataLayout::kNCHW) { - KeBNRestoreData< - T, framework::DataLayout::kNCHW><<>>( - px.mutable_data(ctx.GetPlace()), - scale->data>(), - bias->data>(), saved_mean, saved_inv_var, - epsilon, C, H * W * D, x_numel, x->data()); + KeBNRestoreData + <<>>(px.mutable_data(ctx.GetPlace()), + scale->data>(), + bias->data>(), + saved_mean, saved_inv_var, epsilon, C, + H * W * D, x_numel, x->data()); } else { - KeBNRestoreData< - T, framework::DataLayout::kNHWC><<>>( - px.mutable_data(ctx.GetPlace()), - scale->data>(), - bias->data>(), saved_mean, saved_inv_var, - epsilon, C, H * W * D, x_numel, x->data()); + KeBNRestoreData + <<>>(px.mutable_data(ctx.GetPlace()), + scale->data>(), + bias->data>(), + saved_mean, saved_inv_var, epsilon, C, + H * W * D, x_numel, x->data()); } } if (layout == framework::DataLayout::kNCHW) { - KeBackwardLocalStats< - T, threads, framework::DataLayout::kNCHW><<>>( - dy_d, x_d, saved_mean, N, fsize, C, stats); + KeBackwardLocalStats + <<>>(dy_d, x_d, saved_mean, N, fsize, C, + stats); } else { - KeBackwardLocalStats< - T, threads, framework::DataLayout::kNHWC><<>>( - dy_d, x_d, saved_mean, N, fsize, C, stats); + KeBackwardLocalStats + <<>>(dy_d, x_d, saved_mean, N, fsize, C, + stats); } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -476,35 +474,33 @@ void SyncBatchNormGradFunctor( if (layout == framework::DataLayout::kNCHW) { if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, threads, - framework::DataLayout::kNCHW><<>>( - dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize, - d_scale->data>(), - d_bias->data>()); + KeBNBackwardScaleBias + <<>>(dy_d, x_d, saved_mean, saved_inv_var, + epsilon, N, C, fsize, + d_scale->data>(), + d_bias->data>()); } if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNCHW><<>>( - dy_d, x_d, scale->data>(), saved_mean, - saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C, fsize, - x->numel(), d_x->data()); + KeBNBackwardData + <<>>( + dy_d, x_d, scale->data>(), saved_mean, + saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C, fsize, + x->numel(), d_x->data()); } } else { if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, threads, - framework::DataLayout::kNHWC><<>>( - dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize, - d_scale->data>(), - d_bias->data>()); + KeBNBackwardScaleBias + <<>>(dy_d, x_d, saved_mean, saved_inv_var, + epsilon, N, C, fsize, + d_scale->data>(), + d_bias->data>()); } if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNHWC><<>>( - dy_d, x_d, scale->data>(), saved_mean, - saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C, fsize, - x->numel(), d_x->data()); + KeBNBackwardData + <<>>( + dy_d, x_d, scale->data>(), saved_mean, + saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C, fsize, + x->numel(), d_x->data()); } } } diff --git a/paddle/fluid/operators/sync_batch_norm_op_npu.cc b/paddle/fluid/operators/sync_batch_norm_op_npu.cc index b5632f4fe4a84..604f8f97a3f41 100644 --- a/paddle/fluid/operators/sync_batch_norm_op_npu.cc +++ b/paddle/fluid/operators/sync_batch_norm_op_npu.cc @@ -566,8 +566,9 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { paddle::framework::TensorToVector( device_count_tensor, ctx.device_context(), &device_count_vec); device_counts = device_count_vec[0]; - PADDLE_ENFORCE_GE(device_counts, 2, platform::errors::PreconditionNotMet( - "device_counts should >= 2.")); + PADDLE_ENFORCE_GE( + device_counts, 2, + platform::errors::PreconditionNotMet("device_counts should >= 2.")); } // cacl var_ref diff --git a/paddle/fluid/operators/tdm_child_op.cc b/paddle/fluid/operators/tdm_child_op.cc index a60fc537e3216..a7a218972ecf9 100644 --- a/paddle/fluid/operators/tdm_child_op.cc +++ b/paddle/fluid/operators/tdm_child_op.cc @@ -13,7 +13,9 @@ limitations under the License. */ #include "paddle/fluid/operators/tdm_child_op.h" + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/sampler.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h index e437975320cc5..c39d8260a8b36 100644 --- a/paddle/fluid/operators/tdm_child_op.h +++ b/paddle/fluid/operators/tdm_child_op.h @@ -20,6 +20,7 @@ #include #include #include + #include "gflags/gflags.h" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/tdm_sampler_op.cc b/paddle/fluid/operators/tdm_sampler_op.cc index 6aad72a0d9cbe..68d079e679302 100644 --- a/paddle/fluid/operators/tdm_sampler_op.cc +++ b/paddle/fluid/operators/tdm_sampler_op.cc @@ -13,7 +13,9 @@ limitations under the License. */ #include "paddle/fluid/operators/tdm_sampler_op.h" + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/sampler.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/tdm_sampler_op.h b/paddle/fluid/operators/tdm_sampler_op.h index bf752a9c8ad78..c3ed90ae68ebd 100644 --- a/paddle/fluid/operators/tdm_sampler_op.h +++ b/paddle/fluid/operators/tdm_sampler_op.h @@ -20,6 +20,7 @@ #include #include #include + #include "gflags/gflags.h" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc index 3bdb9cb972fc6..12d0f288d97c9 100644 --- a/paddle/fluid/operators/temporal_shift_op.cc +++ b/paddle/fluid/operators/temporal_shift_op.cc @@ -10,12 +10,13 @@ limitations under the License. */ #include "paddle/fluid/operators/temporal_shift_op.h" + #include #include #include -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu index 1d7aeec142ff0..f8e642cdb897c 100644 --- a/paddle/fluid/operators/temporal_shift_op.cu +++ b/paddle/fluid/operators/temporal_shift_op.cu @@ -179,13 +179,13 @@ class TemporalShiftOpCUDAKernel : public framework::OpKernel { grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid); if (data_layout == DataLayout::kNCHW) { - KeTemporalShiftFwNCHW< - T><<>>( - input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2); + KeTemporalShiftFwNCHW + <<>>( + input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2); } else { - KeTemporalShiftFwNHWC< - T><<>>( - input_data, output_data, ntchw, tchw, chw, t, c, c1, c2); + KeTemporalShiftFwNHWC + <<>>( + input_data, output_data, ntchw, tchw, chw, t, c, c1, c2); } } }; @@ -233,13 +233,15 @@ class TemporalShiftGradOpCUDAKernel : public framework::OpKernel { grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid); if (data_layout == DataLayout::kNCHW) { - KeTemporalShiftBwNCHW< - T><<>>( - output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, c2); + KeTemporalShiftBwNCHW + <<>>( + output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, + c2); } else { - KeTemporalShiftBwNHWC< - T><<>>( - output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, c2); + KeTemporalShiftBwNHWC + <<>>( + output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, + c2); } } }; diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc index 95ae32fa2ea6f..41d1fc2356e4b 100644 --- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc +++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc @@ -96,10 +96,11 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase { *scope.FindVar(Output("OutIndex"))->GetMutable(); const size_t n = inx.size(); - PADDLE_ENFORCE_GT(n, 0, platform::errors::InvalidArgument( - "Input tensorarray size should > 0," - "but the received is %d", - n)); + PADDLE_ENFORCE_GT( + n, 0, + platform::errors::InvalidArgument("Input tensorarray size should > 0," + "but the received is %d", + n)); std::string base_name = Inputs("X")[0]; std::vector names; @@ -235,10 +236,11 @@ class LoDTensorArray2TensorGradOp : public framework::OperatorBase { auto &inx = scope.FindVar(Input("X"))->Get(); const size_t n = inx.size(); - PADDLE_ENFORCE_GT(n, 0, platform::errors::InvalidArgument( - "Input tensorarray size should > 0, " - "but the received is: %d. ", - n)); + PADDLE_ENFORCE_GT( + n, 0, + platform::errors::InvalidArgument("Input tensorarray size should > 0, " + "but the received is: %d. ", + n)); std::string base_name = Inputs("X")[0]; std::vector names; diff --git a/paddle/fluid/operators/tensor_formatter.cc b/paddle/fluid/operators/tensor_formatter.cc index ef46ee25156e5..8f02bc870e2fb 100644 --- a/paddle/fluid/operators/tensor_formatter.cc +++ b/paddle/fluid/operators/tensor_formatter.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/operators/tensor_formatter.h" #include + #include "paddle/fluid/framework/convert_utils.h" namespace paddle { diff --git a/paddle/fluid/operators/tensor_to_string.h b/paddle/fluid/operators/tensor_to_string.h index bd9e7f6219b4a..c1ca1dff9ffe7 100644 --- a/paddle/fluid/operators/tensor_to_string.h +++ b/paddle/fluid/operators/tensor_to_string.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" @@ -56,7 +57,7 @@ static std::vector ToVector(const framework::Tensor &src) { } template -static std::string FlattenToString(Args &&... args) { +static std::string FlattenToString(Args &&...args) { const auto &vec = ToVector(std::forward(args)...); return "[" + string::join_strings(vec, ',') + "]"; } diff --git a/paddle/fluid/operators/tensorrt/CMakeLists.txt b/paddle/fluid/operators/tensorrt/CMakeLists.txt index a7f18245ab9e9..e0fed2804a9b7 100644 --- a/paddle/fluid/operators/tensorrt/CMakeLists.txt +++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt @@ -1,4 +1,6 @@ -op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter infer_io_utils analysis_helper) -nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc - DEPS tensorrt_engine_op - analysis) +op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter + infer_io_utils analysis_helper) +nv_test( + test_tensorrt_engine_op + SRCS tensorrt_engine_op_test.cc + DEPS tensorrt_engine_op analysis) diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 0a71875d8931e..1e5ce6fa3e80c 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -52,9 +52,9 @@ namespace operators { using inference::Singleton; using inference::tensorrt::TensorRTEngine; -using inference::tensorrt::TRTInt8Calibrator; using inference::tensorrt::TRTCalibratorEngine; using inference::tensorrt::TRTCalibratorEngineManager; +using inference::tensorrt::TRTInt8Calibrator; static void RuntimeStaticShapeCheck(std::vector runtime_input_shape, std::vector model_input_shape) { @@ -111,10 +111,10 @@ static void RuntimeDynamicShapeCheck( // "TRT engine runtime input %s dims size(%d) inconsistent " // "with the dynamic shape size(%d)", // x, runtime_input_shape.size(), min_input_shape.size())); - auto is_input_shape_valid = [&]( - const std::vector &runtime_input_shape, - const std::vector &min_input_shape, - const std::vector &max_input_shape) -> bool { + auto is_input_shape_valid = + [&](const std::vector &runtime_input_shape, + const std::vector &min_input_shape, + const std::vector &max_input_shape) -> bool { for (size_t i = 0; i < runtime_input_shape.size(); i++) { if (runtime_input_shape[i] <= max_input_shape[i] && runtime_input_shape[i] >= min_input_shape[i]) { diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 243ae757277a8..c4278cfeb58c5 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h" + #include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_desc.h" diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc index cea6b458aec78..ee2d38fea7033 100644 --- a/paddle/fluid/operators/tile_op_npu.cc +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -27,10 +27,11 @@ class TileNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto rank = context.Input("X")->dims().size(); PADDLE_ENFORCE_GE( - rank, 1, platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); + rank, 1, + platform::errors::InvalidArgument( + "The rank of the input 'x' for tile op must be a positive " + "integer, but the value received is %d.", + rank)); PADDLE_ENFORCE_LE( rank, MAX_RANK_SUPPORTED, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/tile_op_xpu.cc b/paddle/fluid/operators/tile_op_xpu.cc index 598377587d6f7..a0ce4a2bebeb7 100644 --- a/paddle/fluid/operators/tile_op_xpu.cc +++ b/paddle/fluid/operators/tile_op_xpu.cc @@ -25,10 +25,11 @@ class TileXPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto rank = context.Input("X")->dims().size(); PADDLE_ENFORCE_GE( - rank, 1, platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); + rank, 1, + platform::errors::InvalidArgument( + "The rank of the input 'x' for tile op must be a positive " + "integer, but the value received is %d.", + rank)); PADDLE_ENFORCE_LE( rank, MAX_RANK_SUPPORTED, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index 848ab1cb556e0..a7981c86c450c 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include #include #ifdef __NVCC__ diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index cce5ad2631733..d8fc129588a03 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/top_k_op.h" + #include namespace paddle { @@ -39,8 +40,9 @@ class TopkOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE(k, 1, platform::errors::InvalidArgument( "Attribute k must be >= 1, but got k is %d.", k)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument( - "input must have >= 1d shape")); + PADDLE_ENFORCE_GE( + input_dims.size(), 1, + platform::errors::InvalidArgument("input must have >= 1d shape")); if (ctx->IsRuntime()) { PADDLE_ENFORCE_GE( diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 30a5a802a5360..fc8f08ca4805a 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -96,8 +96,8 @@ class TopkOpCUDAKernel : public framework::OpKernel { int gridx = input_height < kMaxHeight ? input_height : kMaxHeight; switch (GetDesiredBlockDim(input_width)) { FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( + KeMatrixTopK + <<>>( output_data, k, indices_data, input_data, input_width, input_width, static_cast(k), gridx, input_height)); default: @@ -133,8 +133,8 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { int gridx = row < kMaxHeight ? row : kMaxHeight; switch (GetDesiredBlockDim(col)) { FIXED_BLOCK_DIM( - AssignGrad<<>>( + AssignGrad + <<>>( x_grad_data, indices_data, out_grad_data, row, col, k)); default: PADDLE_THROW( diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index aad2f096a536e..9d933eb5c47ed 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/top_k_op_mlu.cc b/paddle/fluid/operators/top_k_op_mlu.cc index 102902bdaaaaf..16b2ac9807e83 100644 --- a/paddle/fluid/operators/top_k_op_mlu.cc +++ b/paddle/fluid/operators/top_k_op_mlu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/top_k_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc index 04e4d88b008e0..051cb9611bab1 100644 --- a/paddle/fluid/operators/top_k_v2_op_npu.cc +++ b/paddle/fluid/operators/top_k_v2_op_npu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc index c6c0fa3c0019e..36ad2d74869c6 100644 --- a/paddle/fluid/operators/trace_op.cc +++ b/paddle/fluid/operators/trace_op.cc @@ -118,19 +118,16 @@ REGISTER_OPERATOR(trace_grad, ops::TraceGradOp, ops::TraceGradNoNeedBufferVarsInferer); /* ========================== register checkpoint ===========================*/ -REGISTER_OP_VERSION(trace) - .AddCheckpoint( - R"ROC(Upgrade trace add a new attribute [axis2])ROC", - paddle::framework::compatible::OpVersionDesc() - .NewAttr("axis1", - "The added attribute 'axis1' is not yet registered.", - std::vector{0.0f}) - .NewAttr("axis2", - "The added attribute 'axis2' is not yet registered.", - std::vector{1.0f}) - .DeleteAttr("dim1", - "The attribute 'dim1' is not recommend according to " - "the specification 2.0.") - .DeleteAttr("dim2", - "The attribute 'dim2' is not recommend according to " - "the specification 2.0.")); +REGISTER_OP_VERSION(trace).AddCheckpoint( + R"ROC(Upgrade trace add a new attribute [axis2])ROC", + paddle::framework::compatible::OpVersionDesc() + .NewAttr("axis1", "The added attribute 'axis1' is not yet registered.", + std::vector{0.0f}) + .NewAttr("axis2", "The added attribute 'axis2' is not yet registered.", + std::vector{1.0f}) + .DeleteAttr("dim1", + "The attribute 'dim1' is not recommend according to " + "the specification 2.0.") + .DeleteAttr("dim2", + "The attribute 'dim2' is not recommend according to " + "the specification 2.0.")); diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc index f26bcdca4a7b3..3b55631900d30 100644 --- a/paddle/fluid/operators/transfer_layout_op.cc +++ b/paddle/fluid/operators/transfer_layout_op.cc @@ -146,7 +146,7 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL_FUNCTOR(transfer_layout, float, ops::TransferLayoutKernel); REGISTER_OP_VERSION(transfer_layout) - .AddCheckpoint( - R"ROC(refine transfer_layout, add src_layout attribute)ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "src_layout", "(int, the layout of the input tensor", -1)); + .AddCheckpoint(R"ROC(refine transfer_layout, add src_layout attribute)ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "src_layout", "(int, the layout of the input tensor", + -1)); diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index a45d32b34b983..4eceb69e8ce45 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/transpose_op.h" + #include #include #include diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index a31ac28c9910c..40a967b11f7a9 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -96,12 +96,15 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, Dim3 input_dims, int x = threadIdx.x; Dim3 output_dims = { - input_dims[0], input_dims[2], input_dims[1], + input_dims[0], + input_dims[2], + input_dims[1], }; // Align dim to Tiles Dim3 tile_aligned_input_dim = { - input_dims[0], (input_dims[1] + TileX - 1) / TileX, + input_dims[0], + (input_dims[1] + TileX - 1) / TileX, (input_dims[2] + TileY - 1) / TileY, }; @@ -111,7 +114,8 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, Dim3 input_dims, // Compute real index align to tile:0, 32, 64... Index3 block_tile_index_in_input = { - input_block_tile_index[0], input_block_tile_index[1] * TileX, + input_block_tile_index[0], + input_block_tile_index[1] * TileX, input_block_tile_index[2] * TileY, }; @@ -165,12 +169,14 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, Dim3 input_dims, // Store sm value back to out Index3 output_block_tile_index = { - input_block_tile_index[0], input_block_tile_index[2], + input_block_tile_index[0], + input_block_tile_index[2], input_block_tile_index[1], }; Index3 block_tile_index_in_output = { - output_block_tile_index[0], output_block_tile_index[1] * TileY, + output_block_tile_index[0], + output_block_tile_index[1] * TileY, output_block_tile_index[2] * TileX, }; @@ -265,15 +271,13 @@ void LaunchNarrowDims2TransposeKernel(const phi::GPUContext& d, int tile_size_i, T* output) { constexpr int NumThreads = tile_long; if (tile_size_i <= tile_long && tile_size_j <= tile_short) { - TilingSwapDim1And2< - T, NumThreads, tile_long, - tile_short><<>>( - input, input_dims, output); + TilingSwapDim1And2 + <<>>(input, input_dims, + output); } else { - TilingSwapDim1And2< - T, NumThreads, tile_short, - tile_long><<>>( - input, input_dims, output); + TilingSwapDim1And2 + <<>>(input, input_dims, + output); } } @@ -392,10 +396,10 @@ void SwapDim1And2InNarrow(const phi::GPUContext& d, const T* input, // data may not aligned to tile, so some threads wasted, we need // to find least wasted threads, which means we need to find tile // can split input properly, in another words: num_wasted_threads=0. - int num_wasted_threads = input_long_edge - - framework::CeilOrFloor( - input_long_edge, proposed_tile_long_edge) * - proposed_tile_long_edge; + int num_wasted_threads = + input_long_edge - framework::CeilOrFloor( + input_long_edge, proposed_tile_long_edge) * + proposed_tile_long_edge; int num_full_tiles = framework::CeilOrFloor( input_long_edge, proposed_tile_long_edge); @@ -499,10 +503,9 @@ void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input, int total_tiles_count = input_dims_aligned[0] * input_dims_aligned[1] * input_dims_aligned[2]; - TilingSwapDim1And2< - T, kNumThreads, kTileSize, - kTileSize><<>>( - input, input_dims, output); + TilingSwapDim1And2 + <<>>(input, input_dims, + output); } else if (narrow_tile) { // If input shape is like Rect, such as 2X100, use Narrow tile size. @@ -513,9 +516,9 @@ void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input, // If input shape is small, such as 8X8, just do simple copy int total_elements = input_dims[0] * input_dims[1] * input_dims[2]; auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_elements); - TransposeSimpleKernel<<< - config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( - total_elements, input, input_dims, output); + TransposeSimpleKernel + <<>>( + total_elements, input, input_dims, output); } } @@ -543,9 +546,9 @@ struct SwapDim0And2InTranspose { size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2]; auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_size); - TransposeSimpleKernel<<< - config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( - total_size, in, input_dims, out); + TransposeSimpleKernel + <<>>( + total_size, in, input_dims, out); } }; diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index a9e4876cc82a4..891aa312f69ff 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/transpose_op_mlu.cc b/paddle/fluid/operators/transpose_op_mlu.cc index 40cb22bab50ec..38f6114e48d3f 100644 --- a/paddle/fluid/operators/transpose_op_mlu.cc +++ b/paddle/fluid/operators/transpose_op_mlu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/transpose_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/transpose_op_xpu.cc b/paddle/fluid/operators/transpose_op_xpu.cc index 00a43c74d8736..32b303238ab81 100644 --- a/paddle/fluid/operators/transpose_op_xpu.cc +++ b/paddle/fluid/operators/transpose_op_xpu.cc @@ -13,10 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/transpose_op.h" #include #include #include + +#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { diff --git a/paddle/fluid/operators/tree_conv_op.h b/paddle/fluid/operators/tree_conv_op.h index afe5379dc3f2a..8c479076175dd 100644 --- a/paddle/fluid/operators/tree_conv_op.h +++ b/paddle/fluid/operators/tree_conv_op.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/tree2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/operators/tril_indices_op.cc b/paddle/fluid/operators/tril_indices_op.cc index be42f53dd2344..63b5c1a2431ce 100644 --- a/paddle/fluid/operators/tril_indices_op.cc +++ b/paddle/fluid/operators/tril_indices_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc index b941fa3d03ae1..8ca83ed881099 100644 --- a/paddle/fluid/operators/tril_triu_op.cc +++ b/paddle/fluid/operators/tril_triu_op.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" - #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index dc5a66dce16d6..21e2061e73b6c 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/truncated_gaussian_random_op.h" + #include #include #include @@ -19,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/truncated_gaussian_random_op.h" #include "paddle/phi/infermeta/nullary.h" namespace paddle { diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 261d9cee2d5cd..363d909d84dcf 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/truncated_gaussian_random_op.h" #include #include + #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/operators/truncated_gaussian_random_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc index 803b61fbe813f..45a4b6a3bab7e 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc @@ -14,11 +14,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/truncated_gaussian_random_op.h" #include #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/truncated_gaussian_random_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unbind_op.cc b/paddle/fluid/operators/unbind_op.cc index f2fc08308c6b3..739fc98f3f086 100644 --- a/paddle/fluid/operators/unbind_op.cc +++ b/paddle/fluid/operators/unbind_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/unbind_op.h" + #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/unbind_op.h b/paddle/fluid/operators/unbind_op.h index 6e35f262de420..8e6cd391578c7 100644 --- a/paddle/fluid/operators/unbind_op.h +++ b/paddle/fluid/operators/unbind_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 3e27402c86947..a988c6843893c 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -16,10 +16,12 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #if defined(__NVCC__) || defined(__HIPCC__) #include + #include "paddle/fluid/framework/generator.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" @@ -102,8 +104,9 @@ inline std::vector GetNewDataFromShapeTensorList( "Expected dtype of ShapeTensorList of %d-th must be int32, int64. " "But got " "unsupport dtype: %s.", - i, paddle::framework::DataTypeToString( - framework::TransToProtoVarType(tensor->dtype())))); + i, + paddle::framework::DataTypeToString( + framework::TransToProtoVarType(tensor->dtype())))); } } diff --git a/paddle/fluid/operators/uniform_random_op_mlu.cc b/paddle/fluid/operators/uniform_random_op_mlu.cc index 2c5f13f5a9307..fdf1252eb0ded 100644 --- a/paddle/fluid/operators/uniform_random_op_mlu.cc +++ b/paddle/fluid/operators/uniform_random_op_mlu.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/uniform_random_op.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/uniform_random_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/uniform_random_op_xpu.cc b/paddle/fluid/operators/uniform_random_op_xpu.cc index ae2adf834194d..23d0f61c2bd1d 100644 --- a/paddle/fluid/operators/uniform_random_op_xpu.cc +++ b/paddle/fluid/operators/uniform_random_op_xpu.cc @@ -14,11 +14,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/uniform_random_op.h" #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/uniform_random_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unique_consecutive_op.cc b/paddle/fluid/operators/unique_consecutive_op.cc index 24ef3a85ee2ce..567f7bac34be7 100644 --- a/paddle/fluid/operators/unique_consecutive_op.cc +++ b/paddle/fluid/operators/unique_consecutive_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/unique_consecutive_op.h" + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/unique_consecutive_op.cu b/paddle/fluid/operators/unique_consecutive_op.cu index fbffb01ed19b6..9db14e82b25b1 100644 --- a/paddle/fluid/operators/unique_consecutive_op.cu +++ b/paddle/fluid/operators/unique_consecutive_op.cu @@ -18,8 +18,10 @@ limitations under the License. */ #include #include #include + #include #include + #include "paddle/fluid/framework/tensor_util.h" // TensorToVector() #include "paddle/fluid/operators/unique_consecutive_op.h" // TransComute() diff --git a/paddle/fluid/operators/unique_consecutive_op.h b/paddle/fluid/operators/unique_consecutive_op.h index b31c2aa67a587..4dc1871b5d140 100644 --- a/paddle/fluid/operators/unique_consecutive_op.h +++ b/paddle/fluid/operators/unique_consecutive_op.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/transpose_op.h" diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc index 5c103e088b559..fbbd562c1b8a2 100644 --- a/paddle/fluid/operators/unique_op.cc +++ b/paddle/fluid/operators/unique_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/unique_op.h" + #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index 01439d2182464..d59e6590a88f3 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/transpose_op.h" diff --git a/paddle/fluid/operators/unique_with_counts_op.h b/paddle/fluid/operators/unique_with_counts_op.h index af8bfe813a6b0..227fdef222432 100644 --- a/paddle/fluid/operators/unique_with_counts_op.h +++ b/paddle/fluid/operators/unique_with_counts_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/unique_op.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 1be8f3387dbad..62aa990ca7bc8 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -4,533 +4,569 @@ # Generally, the combination rules in this file do not need to be modified. # If there are some redefined error in compiling with the source file which # in combination rule, you can remove the source file from the following rules. -register_unity_group(cc - abs_op.cc - add_position_encoding_op.cc - addmm_op.cc - affine_channel_op.cc - affine_grid_op.cc - allclose_op.cc - argsort_op.cc - array_to_lod_tensor_op.cc - assert_op.cc - assign_op.cc - assign_value_op.cc - attention_lstm_op.cc - average_accumulates_op.cc - batch_fc_op.cc - bce_loss_op.cc - beam_search_op.cc - beam_search_decode_op.cc - bernoulli_op.cc - bilateral_slice_op.cc) -register_unity_group(cc - mkldnn/batch_norm_mkldnn_op.cc - bilinear_tensor_product_op.cc - bmm_op.cc - bpr_loss_op.cc - cast_op.cc - mkldnn/cast_mkldnn_op.cc - cholesky_op.cc - chunk_eval_op.cc - clip_by_norm_op.cc - clip_op.cc - coalesce_tensor_op.cc - mkldnn/activation_mkldnn_op.cc - mkldnn/interpolate_mkldnn_op.cc - mkldnn/pool_mkldnn_op.cc - mkldnn/softmax_mkldnn_op.cc) -register_unity_group(cc - center_loss_op.cc - mkldnn/concat_mkldnn_op.cc - mkldnn/conv_mkldnn_op.cc - mkldnn/conv_transpose_mkldnn_op.cc - correlation_op.cc - cos_sim_op.cc - crf_decoding_op.cc - crop_op.cc - ascend_trigger_op.cc - conj_op.cc - imag_op.cc - kldiv_loss_op.cc - memcpy_op.cc) -register_unity_group(cc - cross_entropy_op.cc - cross_op.cc - ctc_align_op.cc - cudnn_lstm_op.cc - cumsum_op.cc - cvm_op.cc - data_norm_op.cc - deformable_conv_op.cc - deformable_conv_v1_op.cc - deformable_psroi_pooling_op.cc - delete_var_op.cc - dequantize_abs_max_op.cc - dequantize_op.cc - mkldnn/dequantize_mkldnn_op.cc) -register_unity_group(cc - dequeue_op.cc - detection_map_op.cc - dgc_clip_by_norm_op.cc - diag_embed_op.cc - diag_op.cc - diag_v2_op.cc - dot_op.cc - edit_distance_op.cc - empty_op.cc - enqueue_op.cc - erf_op.cc - py_func_op.cc - real_op.cc - sync_batch_norm_op.cc - top_k_op.cc - conv_op.cc - conv_transpose_op.cc - gru_unit_op.cc) -register_unity_group(cc - expand_v2_op.cc - fake_dequantize_op.cc - fc_op.cc - mkldnn/fc_mkldnn_op.cc - fill_any_like_op.cc - fill_constant_batch_size_like_op.cc - fill_constant_op.cc - fill_op.cc - fill_zeros_like_op.cc - filter_by_instag_op.cc) -register_unity_group(cc - flatten_op.cc - flip_op.cc - fsp_op.cc - gather_nd_op.cc - gather_op.cc - gather_tree_op.cc - gaussian_random_batch_size_like_op.cc - gaussian_random_op.cc - mkldnn/gaussian_random_mkldnn_op.cc - group_norm_op.cc gru_op.cc) -register_unity_group(cc - hash_op.cc - hierarchical_sigmoid_op.cc - hinge_loss_op.cc - histogram_op.cc - huber_loss_op.cc - im2sequence_op.cc - increment_op.cc - index_sample_op.cc - index_select_op.cc - interpolate_op.cc - isfinite_v2_op.cc) -register_unity_group(cc - inplace_abn_op.cc - interpolate_v2_op.cc - inverse_op.cc - is_empty_op.cc - isfinite_op.cc - kron_op.cc - l1_norm_op.cc - label_smooth_op.cc - layer_norm_op.cc - mkldnn/layer_norm_mkldnn_op.cc - mkldnn/layer_norm_mkldnn_op.cc - linspace_op.cc - load_combine_op.cc - load_op.cc) -register_unity_group(cc - lod_array_length_op.cc - lod_rank_table_op.cc - lod_reset_op.cc - lod_tensor_to_array_op.cc - log_softmax_op.cc - lookup_table_dequant_op.cc - lrn_op.cc - mkldnn/lrn_mkldnn_op.cc - lstm_unit_op.cc - lstmp_op.cc) -register_unity_group(cc - log_loss_op.cc - lookup_table_v2_op.cc - margin_rank_loss_op.cc - masked_select_op.cc - match_matrix_tensor_op.cc - matmul_op.cc - mkldnn/matmul_mkldnn_op.cc - max_sequence_len_op.cc - maxout_op.cc - merge_lod_tensor_op.cc - merge_selected_rows_op.cc - meshgrid_op.cc) -register_unity_group(cc - concat_op.cc - conv_shift_op.cc - dequantize_log_op.cc - dropout_op.cc - expand_op.cc - fake_quantize_op.cc - gelu_op.cc - get_tensor_from_selected_rows_op.cc - lookup_table_op.cc - matmul_v2_op.cc) -register_unity_group(cc - mean_iou_op.cc - mean_op.cc - minus_op.cc - mish_op.cc - mul_op.cc - multinomial_op.cc - multiplex_op.cc - mv_op.cc - nce_op.cc - nll_loss_op.cc - norm_op.cc - one_hot_op.cc - one_hot_v2_op.cc - pad2d_op.cc - pad3d_op.cc - pad_constant_like_op.cc - pad_op.cc) -register_unity_group(cc - modified_huber_loss_op.cc - partial_sum_op.cc - pixel_shuffle_op.cc - pool_op.cc - pool_with_index_op.cc - positive_negative_pair_op.cc - prelu_op.cc - print_op.cc - prroi_pool_op.cc - psroi_pool_op.cc - pull_box_extended_sparse_op.cc - pull_box_sparse_op.cc - pull_sparse_op.cc - pull_sparse_v2_op.cc) -register_unity_group(cc - push_dense_op.cc - quantize_op.cc - mkldnn/quantize_mkldnn_op.cc - queue_generator_op.cc - randint_op.cc - random_crop_op.cc - randperm_op.cc - range_op.cc - rank_attention_op.cc - rank_loss_op.cc - recurrent_op.cc - reorder_lod_tensor_by_rank_op.cc - requantize_op.cc - mkldnn/requantize_mkldnn_op.cc - reshape_op.cc - reverse_op.cc) -register_unity_group(cc - rnn_memory_helper_op.cc - roi_align_op.cc - roll_op.cc - run_program_op.cc - sample_logits_op.cc - sampling_id_op.cc - save_combine_op.cc - save_op.cc - scale_op.cc - mkldnn/scale_mkldnn_op.cc - scatter_nd_add_op.cc - scatter_op.cc - seed_op.cc - select_input_op.cc - select_output_op.cc) -register_unity_group(cc - roi_pool_op.cc - selu_op.cc - shape_op.cc - shard_index_op.cc - shrink_rnn_memory_op.cc - shuffle_batch_op.cc - shuffle_channel_op.cc - sigmoid_cross_entropy_with_logits_op.cc - sign_op.cc - similarity_focus_op.cc - size_op.cc - slice_op.cc - softmax_op.cc) -register_unity_group(cc - space_to_depth_op.cc - spectral_norm_op.cc - split_lod_tensor_op.cc - split_op.cc - split_selected_rows_op.cc - spp_op.cc - squared_l2_norm_op.cc - squeeze_op.cc - stack_op.cc - strided_slice_op.cc - sum_op.cc - mkldnn/sum_mkldnn_op.cc - tdm_child_op.cc - tdm_sampler_op.cc - teacher_student_sigmoid_loss_op.cc - temporal_shift_op.cc) -register_unity_group(cc - row_conv_op.cc - tensor_array_to_tensor_op.cc - tile_op.cc - top_k_v2_op.cc - trace_op.cc - transpose_op.cc - mkldnn/transpose_mkldnn_op.cc - tree_conv_op.cc - tril_triu_op.cc - truncated_gaussian_random_op.cc - unbind_op.cc - unfold_op.cc) -register_unity_group(cc - smooth_l1_loss_op.cc - uniform_random_batch_size_like_op.cc - uniform_random_op.cc - unique_op.cc - unique_with_counts_op.cc - unpool_op.cc - unsqueeze_op.cc - unstack_op.cc - var_conv_2d_op.cc - where_index_op.cc - where_op.cc) -register_unity_group(cc - affine_grid_cudnn_op.cu.cc - beam_search_op.cu.cc - cudnn_lstm_op.cu.cc - empty_op.cu.cc - fc_op.cu.cc - fill_constant_batch_size_like_op.cu.cc - fill_constant_op.cu.cc - fill_op.cu.cc - fill_zeros_like_op.cu.cc - flatten_op.cu.cc - grid_sampler_cudnn_op.cu.cc - gru_op.cu.cc - inverse_op.cu.cc - is_empty_op.cu.cc - maxout_op.cu.cc - mul_op.cu.cc - concat_op.cu.cc - mul_op.cu.cc - pool_op.cu.cc - pool_cudnn_op.cu.cc - pool_with_index_op.cu.cc - run_program_op.cu.cc - softmax_op.cu.cc - softmax_cudnn_op.cu.cc - spp_op.cu.cc - squeeze_op.cu.cc - unbind_op.cu.cc - unpool_op.cu.cc - unsqueeze_op.cu.cc) -register_unity_group(cc - arg_max_op.cc - arg_min_op.cc - squared_l2_distance_op.cc) -register_unity_group(cc - linear_chain_crf_op.cc - lstm_op.cc - partial_concat_op.cc - pyramid_hash_op.cc - recurrent_op.cc - run_program_op.cc - softmax_with_cross_entropy_op.cc - warpctc_op.cc) -register_unity_group(cc - conv_op.cu.cc - lstm_op.cu.cc - rnn_op.cu.cc - split_op.cu.cc - activation_cudnn_op.cu.cc - assign_value_op.cu.cc - merge_selected_rows_op.cu.cc - run_program_op.cu.cc - warpctc_op.cu.cc) -register_unity_group(cu - addmm_op.cu - affine_channel_op.cu - allclose_op.cu - assign_value_op.cu - bce_loss_op.cu - bernoulli_op.cu - bilateral_slice_op.cu - batch_norm_op.cu) -register_unity_group(cu - bilinear_tensor_product_op.cu - bmm_op.cu - cast_op.cu - cholesky_op.cu - clip_by_norm_op.cu - clip_op.cu - conv_cudnn_op.cu - affine_grid_op.cu) -register_unity_group(cu - center_loss_op.cu - conv_op.cu - conv_transpose_cudnn_op.cu - conv_transpose_op.cu - cos_sim_op.cu - crop_op.cu - average_accumulates_op.cu - conj_op.cu - correlation_op.cu) -register_unity_group(cu - cross_entropy_op.cu - cross_op.cu - ctc_align_op.cu - cumsum_op.cu - cvm_op.cu - data_norm_op.cu - deformable_conv_op.cu - deformable_conv_v1_op.cu - dequantize_abs_max_op.cu) -register_unity_group(cu - dgc_clip_by_norm_op.cu - diag_embed_op.cu - diag_op.cu - diag_v2_op.cu - edit_distance_op.cu - erf_op.cu - meshgrid_op.cu - imag_op.cu) -register_unity_group(cu - expand_v2_op.cu - fake_dequantize_op.cu - fill_any_like_op.cu) -register_unity_group(cu - flip_op.cu - fsp_op.cu - gather_nd_op.cu - gather_op.cu - gather_tree_op.cu - gaussian_random_op.cu - grid_sampler_op.cu - group_norm_op.cu) -register_unity_group(cu - hinge_loss_op.cu - histogram_op.cu - huber_loss_op.cu - im2sequence_op.cu - increment_op.cu - index_sample_op.cu - index_select_op.cu - interpolate_op.cu - isfinite_v2_op.cu) -register_unity_group(cu - inplace_abn_op.cu - interpolate_v2_op.cu - isfinite_op.cu - l1_norm_op.cu - label_smooth_op.cu - linspace_op.cu - load_combine_op.cu - load_op.cu) -register_unity_group(cu - lod_reset_op.cu - log_softmax_op.cu - lrn_op.cu - lstm_unit_op.cu - dot_op.cu - psroi_pool_op.cu - rank_loss_op.cu - real_op.cu) -register_unity_group(cu - log_loss_op.cu - lookup_table_v2_op.cu - margin_rank_loss_op.cu - masked_select_op.cu - merge_selected_rows_op.cu - lstmp_op.cu - shuffle_channel_op.cu - softmax_cudnn_op.cu - squared_l2_distance_op.cu) -register_unity_group(cu - conv_shift_op.cu - dequantize_log_op.cu - dropout_op.cu - fake_quantize_op.cu - gelu_op.cu - lookup_table_op.cu - sigmoid_cross_entropy_with_logits_op.cu - softmax_with_cross_entropy_op.cu) -register_unity_group(cu - mean_iou_op.cu - mean_op.cu - minus_op.cu - mish_op.cu - multinomial_op.cu - multiplex_op.cu - mv_op.cu - nll_loss_op.cu - norm_op.cu - one_hot_op.cu - pad2d_op.cu - pad3d_op.cu - pad_constant_like_op.cu - pad_op.cu) -register_unity_group(cu - partial_sum_op.cu - pixel_shuffle_op.cu - prelu_op.cu - prroi_pool_op.cu - pull_box_extended_sparse_op.cu - pull_box_sparse_op.cu) -register_unity_group(cu - randint_op.cu - random_crop_op.cu - randperm_op.cu - range_op.cu - reverse_op.cu - partial_concat_op.cu - kldiv_loss_op.cu - instance_norm_op.cu) -register_unity_group(cu - roi_align_op.cu - roll_op.cu - sample_logits_op.cu - sampling_id_op.cu - save_combine_op.cu - save_op.cu - scale_op.cu - scatter_nd_add_op.cu - scatter_op.cu - seed_op.cu) -register_unity_group(cu - roi_pool_op.cu - selu_op.cu - shape_op.cu - shard_index_op.cu - sign_op.cu - size_op.cu - slice_op.cu) -register_unity_group(cu - space_to_depth_op.cu - spectral_norm_op.cu - split_op.cu - split_selected_rows_op.cu - squared_l2_norm_op.cu - sum_op.cu - temporal_shift_op.cu - arg_max_op.cu) -register_unity_group(cu - row_conv_op.cu - tree_conv_op.cu - tril_triu_op.cu - truncated_gaussian_random_op.cu - unfold_op.cu - arg_min_op.cu - crop_tensor_op.cu) -register_unity_group(cu - smooth_l1_loss_op.cu - uniform_random_op.cu - unstack_op.cu - where_index_op.cu - where_op.cu - layer_norm_op.cu) -register_unity_group(cu - expand_as_op.cu - stack_op.cu) +register_unity_group( + cc + abs_op.cc + add_position_encoding_op.cc + addmm_op.cc + affine_channel_op.cc + affine_grid_op.cc + allclose_op.cc + argsort_op.cc + array_to_lod_tensor_op.cc + assert_op.cc + assign_op.cc + assign_value_op.cc + attention_lstm_op.cc + average_accumulates_op.cc + batch_fc_op.cc + bce_loss_op.cc + beam_search_op.cc + beam_search_decode_op.cc + bernoulli_op.cc + bilateral_slice_op.cc) +register_unity_group( + cc + mkldnn/batch_norm_mkldnn_op.cc + bilinear_tensor_product_op.cc + bmm_op.cc + bpr_loss_op.cc + cast_op.cc + mkldnn/cast_mkldnn_op.cc + cholesky_op.cc + chunk_eval_op.cc + clip_by_norm_op.cc + clip_op.cc + coalesce_tensor_op.cc + mkldnn/activation_mkldnn_op.cc + mkldnn/interpolate_mkldnn_op.cc + mkldnn/pool_mkldnn_op.cc + mkldnn/softmax_mkldnn_op.cc) +register_unity_group( + cc + center_loss_op.cc + mkldnn/concat_mkldnn_op.cc + mkldnn/conv_mkldnn_op.cc + mkldnn/conv_transpose_mkldnn_op.cc + correlation_op.cc + cos_sim_op.cc + crf_decoding_op.cc + crop_op.cc + ascend_trigger_op.cc + conj_op.cc + imag_op.cc + kldiv_loss_op.cc + memcpy_op.cc) +register_unity_group( + cc + cross_entropy_op.cc + cross_op.cc + ctc_align_op.cc + cudnn_lstm_op.cc + cumsum_op.cc + cvm_op.cc + data_norm_op.cc + deformable_conv_op.cc + deformable_conv_v1_op.cc + deformable_psroi_pooling_op.cc + delete_var_op.cc + dequantize_abs_max_op.cc + dequantize_op.cc + mkldnn/dequantize_mkldnn_op.cc) +register_unity_group( + cc + dequeue_op.cc + detection_map_op.cc + dgc_clip_by_norm_op.cc + diag_embed_op.cc + diag_op.cc + diag_v2_op.cc + dot_op.cc + edit_distance_op.cc + empty_op.cc + enqueue_op.cc + erf_op.cc + py_func_op.cc + real_op.cc + sync_batch_norm_op.cc + top_k_op.cc + conv_op.cc + conv_transpose_op.cc + gru_unit_op.cc) +register_unity_group( + cc + expand_v2_op.cc + fake_dequantize_op.cc + fc_op.cc + mkldnn/fc_mkldnn_op.cc + fill_any_like_op.cc + fill_constant_batch_size_like_op.cc + fill_constant_op.cc + fill_op.cc + fill_zeros_like_op.cc + filter_by_instag_op.cc) +register_unity_group( + cc + flatten_op.cc + flip_op.cc + fsp_op.cc + gather_nd_op.cc + gather_op.cc + gather_tree_op.cc + gaussian_random_batch_size_like_op.cc + gaussian_random_op.cc + mkldnn/gaussian_random_mkldnn_op.cc + group_norm_op.cc + gru_op.cc) +register_unity_group( + cc + hash_op.cc + hierarchical_sigmoid_op.cc + hinge_loss_op.cc + histogram_op.cc + huber_loss_op.cc + im2sequence_op.cc + increment_op.cc + index_sample_op.cc + index_select_op.cc + interpolate_op.cc + isfinite_v2_op.cc) +register_unity_group( + cc + inplace_abn_op.cc + interpolate_v2_op.cc + inverse_op.cc + is_empty_op.cc + isfinite_op.cc + kron_op.cc + l1_norm_op.cc + label_smooth_op.cc + layer_norm_op.cc + mkldnn/layer_norm_mkldnn_op.cc + mkldnn/layer_norm_mkldnn_op.cc + linspace_op.cc + load_combine_op.cc + load_op.cc) +register_unity_group( + cc + lod_array_length_op.cc + lod_rank_table_op.cc + lod_reset_op.cc + lod_tensor_to_array_op.cc + log_softmax_op.cc + lookup_table_dequant_op.cc + lrn_op.cc + mkldnn/lrn_mkldnn_op.cc + lstm_unit_op.cc + lstmp_op.cc) +register_unity_group( + cc + log_loss_op.cc + lookup_table_v2_op.cc + margin_rank_loss_op.cc + masked_select_op.cc + match_matrix_tensor_op.cc + matmul_op.cc + mkldnn/matmul_mkldnn_op.cc + max_sequence_len_op.cc + maxout_op.cc + merge_lod_tensor_op.cc + merge_selected_rows_op.cc + meshgrid_op.cc) +register_unity_group( + cc + concat_op.cc + conv_shift_op.cc + dequantize_log_op.cc + dropout_op.cc + expand_op.cc + fake_quantize_op.cc + gelu_op.cc + get_tensor_from_selected_rows_op.cc + lookup_table_op.cc + matmul_v2_op.cc) +register_unity_group( + cc + mean_iou_op.cc + mean_op.cc + minus_op.cc + mish_op.cc + mul_op.cc + multinomial_op.cc + multiplex_op.cc + mv_op.cc + nce_op.cc + nll_loss_op.cc + norm_op.cc + one_hot_op.cc + one_hot_v2_op.cc + pad2d_op.cc + pad3d_op.cc + pad_constant_like_op.cc + pad_op.cc) +register_unity_group( + cc + modified_huber_loss_op.cc + partial_sum_op.cc + pixel_shuffle_op.cc + pool_op.cc + pool_with_index_op.cc + positive_negative_pair_op.cc + prelu_op.cc + print_op.cc + prroi_pool_op.cc + psroi_pool_op.cc + pull_box_extended_sparse_op.cc + pull_box_sparse_op.cc + pull_sparse_op.cc + pull_sparse_v2_op.cc) +register_unity_group( + cc + push_dense_op.cc + quantize_op.cc + mkldnn/quantize_mkldnn_op.cc + queue_generator_op.cc + randint_op.cc + random_crop_op.cc + randperm_op.cc + range_op.cc + rank_attention_op.cc + rank_loss_op.cc + recurrent_op.cc + reorder_lod_tensor_by_rank_op.cc + requantize_op.cc + mkldnn/requantize_mkldnn_op.cc + reshape_op.cc + reverse_op.cc) +register_unity_group( + cc + rnn_memory_helper_op.cc + roi_align_op.cc + roll_op.cc + run_program_op.cc + sample_logits_op.cc + sampling_id_op.cc + save_combine_op.cc + save_op.cc + scale_op.cc + mkldnn/scale_mkldnn_op.cc + scatter_nd_add_op.cc + scatter_op.cc + seed_op.cc + select_input_op.cc + select_output_op.cc) +register_unity_group( + cc + roi_pool_op.cc + selu_op.cc + shape_op.cc + shard_index_op.cc + shrink_rnn_memory_op.cc + shuffle_batch_op.cc + shuffle_channel_op.cc + sigmoid_cross_entropy_with_logits_op.cc + sign_op.cc + similarity_focus_op.cc + size_op.cc + slice_op.cc + softmax_op.cc) +register_unity_group( + cc + space_to_depth_op.cc + spectral_norm_op.cc + split_lod_tensor_op.cc + split_op.cc + split_selected_rows_op.cc + spp_op.cc + squared_l2_norm_op.cc + squeeze_op.cc + stack_op.cc + strided_slice_op.cc + sum_op.cc + mkldnn/sum_mkldnn_op.cc + tdm_child_op.cc + tdm_sampler_op.cc + teacher_student_sigmoid_loss_op.cc + temporal_shift_op.cc) +register_unity_group( + cc + row_conv_op.cc + tensor_array_to_tensor_op.cc + tile_op.cc + top_k_v2_op.cc + trace_op.cc + transpose_op.cc + mkldnn/transpose_mkldnn_op.cc + tree_conv_op.cc + tril_triu_op.cc + truncated_gaussian_random_op.cc + unbind_op.cc + unfold_op.cc) +register_unity_group( + cc + smooth_l1_loss_op.cc + uniform_random_batch_size_like_op.cc + uniform_random_op.cc + unique_op.cc + unique_with_counts_op.cc + unpool_op.cc + unsqueeze_op.cc + unstack_op.cc + var_conv_2d_op.cc + where_index_op.cc + where_op.cc) +register_unity_group( + cc + affine_grid_cudnn_op.cu.cc + beam_search_op.cu.cc + cudnn_lstm_op.cu.cc + empty_op.cu.cc + fc_op.cu.cc + fill_constant_batch_size_like_op.cu.cc + fill_constant_op.cu.cc + fill_op.cu.cc + fill_zeros_like_op.cu.cc + flatten_op.cu.cc + grid_sampler_cudnn_op.cu.cc + gru_op.cu.cc + inverse_op.cu.cc + is_empty_op.cu.cc + maxout_op.cu.cc + mul_op.cu.cc + concat_op.cu.cc + mul_op.cu.cc + pool_op.cu.cc + pool_cudnn_op.cu.cc + pool_with_index_op.cu.cc + run_program_op.cu.cc + softmax_op.cu.cc + softmax_cudnn_op.cu.cc + spp_op.cu.cc + squeeze_op.cu.cc + unbind_op.cu.cc + unpool_op.cu.cc + unsqueeze_op.cu.cc) +register_unity_group(cc arg_max_op.cc arg_min_op.cc squared_l2_distance_op.cc) +register_unity_group( + cc + linear_chain_crf_op.cc + lstm_op.cc + partial_concat_op.cc + pyramid_hash_op.cc + recurrent_op.cc + run_program_op.cc + softmax_with_cross_entropy_op.cc + warpctc_op.cc) +register_unity_group( + cc + conv_op.cu.cc + lstm_op.cu.cc + rnn_op.cu.cc + split_op.cu.cc + activation_cudnn_op.cu.cc + assign_value_op.cu.cc + merge_selected_rows_op.cu.cc + run_program_op.cu.cc + warpctc_op.cu.cc) +register_unity_group( + cu + addmm_op.cu + affine_channel_op.cu + allclose_op.cu + assign_value_op.cu + bce_loss_op.cu + bernoulli_op.cu + bilateral_slice_op.cu + batch_norm_op.cu) +register_unity_group( + cu + bilinear_tensor_product_op.cu + bmm_op.cu + cast_op.cu + cholesky_op.cu + clip_by_norm_op.cu + clip_op.cu + conv_cudnn_op.cu + affine_grid_op.cu) +register_unity_group( + cu + center_loss_op.cu + conv_op.cu + conv_transpose_cudnn_op.cu + conv_transpose_op.cu + cos_sim_op.cu + crop_op.cu + average_accumulates_op.cu + conj_op.cu + correlation_op.cu) +register_unity_group( + cu + cross_entropy_op.cu + cross_op.cu + ctc_align_op.cu + cumsum_op.cu + cvm_op.cu + data_norm_op.cu + deformable_conv_op.cu + deformable_conv_v1_op.cu + dequantize_abs_max_op.cu) +register_unity_group( + cu + dgc_clip_by_norm_op.cu + diag_embed_op.cu + diag_op.cu + diag_v2_op.cu + edit_distance_op.cu + erf_op.cu + meshgrid_op.cu + imag_op.cu) +register_unity_group(cu expand_v2_op.cu fake_dequantize_op.cu + fill_any_like_op.cu) +register_unity_group( + cu + flip_op.cu + fsp_op.cu + gather_nd_op.cu + gather_op.cu + gather_tree_op.cu + gaussian_random_op.cu + grid_sampler_op.cu + group_norm_op.cu) +register_unity_group( + cu + hinge_loss_op.cu + histogram_op.cu + huber_loss_op.cu + im2sequence_op.cu + increment_op.cu + index_sample_op.cu + index_select_op.cu + interpolate_op.cu + isfinite_v2_op.cu) +register_unity_group( + cu + inplace_abn_op.cu + interpolate_v2_op.cu + isfinite_op.cu + l1_norm_op.cu + label_smooth_op.cu + linspace_op.cu + load_combine_op.cu + load_op.cu) +register_unity_group( + cu + lod_reset_op.cu + log_softmax_op.cu + lrn_op.cu + lstm_unit_op.cu + dot_op.cu + psroi_pool_op.cu + rank_loss_op.cu + real_op.cu) +register_unity_group( + cu + log_loss_op.cu + lookup_table_v2_op.cu + margin_rank_loss_op.cu + masked_select_op.cu + merge_selected_rows_op.cu + lstmp_op.cu + shuffle_channel_op.cu + softmax_cudnn_op.cu + squared_l2_distance_op.cu) +register_unity_group( + cu + conv_shift_op.cu + dequantize_log_op.cu + dropout_op.cu + fake_quantize_op.cu + gelu_op.cu + lookup_table_op.cu + sigmoid_cross_entropy_with_logits_op.cu + softmax_with_cross_entropy_op.cu) +register_unity_group( + cu + mean_iou_op.cu + mean_op.cu + minus_op.cu + mish_op.cu + multinomial_op.cu + multiplex_op.cu + mv_op.cu + nll_loss_op.cu + norm_op.cu + one_hot_op.cu + pad2d_op.cu + pad3d_op.cu + pad_constant_like_op.cu + pad_op.cu) +register_unity_group( + cu + partial_sum_op.cu + pixel_shuffle_op.cu + prelu_op.cu + prroi_pool_op.cu + pull_box_extended_sparse_op.cu + pull_box_sparse_op.cu) +register_unity_group( + cu + randint_op.cu + random_crop_op.cu + randperm_op.cu + range_op.cu + reverse_op.cu + partial_concat_op.cu + kldiv_loss_op.cu + instance_norm_op.cu) +register_unity_group( + cu + roi_align_op.cu + roll_op.cu + sample_logits_op.cu + sampling_id_op.cu + save_combine_op.cu + save_op.cu + scale_op.cu + scatter_nd_add_op.cu + scatter_op.cu + seed_op.cu) +register_unity_group( + cu + roi_pool_op.cu + selu_op.cu + shape_op.cu + shard_index_op.cu + sign_op.cu + size_op.cu + slice_op.cu) +register_unity_group( + cu + space_to_depth_op.cu + spectral_norm_op.cu + split_op.cu + split_selected_rows_op.cu + squared_l2_norm_op.cu + sum_op.cu + temporal_shift_op.cu + arg_max_op.cu) +register_unity_group( + cu + row_conv_op.cu + tree_conv_op.cu + tril_triu_op.cu + truncated_gaussian_random_op.cu + unfold_op.cu + arg_min_op.cu + crop_tensor_op.cu) +register_unity_group( + cu + smooth_l1_loss_op.cu + uniform_random_op.cu + unstack_op.cu + where_index_op.cu + where_op.cu + layer_norm_op.cu) +register_unity_group(cu expand_as_op.cu stack_op.cu) # The following groups are to make better use of `/MP` which MSVC's parallel # compilation instruction when compiling in Unity Build. register_unity_group(cu activation_op.cu) diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 36e9d894541b0..b18c4e4de4475 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/unpool_op.h" + #include #include #include diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h index 35aeb4e0d610e..062008f95ea3c 100644 --- a/paddle/fluid/operators/unpool_op.h +++ b/paddle/fluid/operators/unpool_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/unpooling.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index 445e8cd468bf3..82edcd5a9fcf7 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -101,9 +101,10 @@ class UnsqueezeOp : public framework::OperatorWithKernel { for (int axis : unsqz_dims) { int cur = axis < 0 ? axis + cur_output_size + 1 : axis; // Vaildity Check: the axis bound - PADDLE_ENFORCE_GE(cur, 0, platform::errors::InvalidArgument( - "The insert dimension value should " - "not be less than 0")); + PADDLE_ENFORCE_GE( + cur, 0, + platform::errors::InvalidArgument("The insert dimension value should " + "not be less than 0")); PADDLE_ENFORCE_LE(cur, cur_output_size, platform::errors::InvalidArgument( "The insert dimension value shoud not be larger " diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h index f6112fb59c122..86038aced3846 100644 --- a/paddle/fluid/operators/unsqueeze_op.h +++ b/paddle/fluid/operators/unsqueeze_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device_context.h" @@ -72,9 +73,10 @@ class UnsqueezeKernel : public framework::OpKernel { for (int axis : unsqz_dims) { int cur = axis < 0 ? axis + cur_output_size + 1 : axis; // Vaildity Check: the axis bound - PADDLE_ENFORCE_GE(cur, 0, platform::errors::InvalidArgument( - "The insert dimension value should " - "not be less than 0")); + PADDLE_ENFORCE_GE( + cur, 0, + platform::errors::InvalidArgument("The insert dimension value should " + "not be less than 0")); PADDLE_ENFORCE_LE(cur, cur_output_size, platform::errors::InvalidArgument( "The insert dimension value shoule not be larger " diff --git a/paddle/fluid/operators/unstack_op.cc b/paddle/fluid/operators/unstack_op.cc index 8c8684bf4b035..df2325f5dc523 100644 --- a/paddle/fluid/operators/unstack_op.cc +++ b/paddle/fluid/operators/unstack_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/for_range.h" diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index d84f7b165fd99..009e883ccb642 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include #include diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc index 3dffa0be2e28a..977cd99984ca0 100644 --- a/paddle/fluid/operators/var_conv_2d_op.cc +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/var_conv_2d_op.h" + #include #include + #include "paddle/fluid/platform/dynload/mklml.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 24d39c25cf335..247ff43b8a047 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,229 +1,448 @@ -proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) +proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto + simple_threadpool) if(WITH_GPU) proto_library(external_error_proto SRCS external_error.proto) endif(WITH_GPU) -if (WITH_PYTHON) +if(WITH_PYTHON) py_proto_compile(profiler_py_proto SRCS profiler.proto) - add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) + add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E + touch __init__.py) add_dependencies(profiler_py_proto profiler_py_proto_init) - if (NOT WIN32) - add_custom_command(TARGET profiler_py_proto POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler - COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler - COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + if(NOT WIN32) + add_custom_command( + TARGET profiler_py_proto + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory + ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler + COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler + COMMENT + "Copy generated python proto into directory paddle/fluid/proto/profiler." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) else(NOT WIN32) - string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/") - add_custom_command(TARGET profiler_py_proto POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler - COMMAND copy /Y *.py ${proto_dstpath} - COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + string(REPLACE "/" "\\" proto_dstpath + "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/") + add_custom_command( + TARGET profiler_py_proto + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory + ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler + COMMAND copy /Y *.py ${proto_dstpath} + COMMENT + "Copy generated python proto into directory paddle/fluid/proto/profiler." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif(NOT WIN32) endif() -cc_library(flags SRCS flags.cc DEPS gflags boost) -cc_library(denormal SRCS denormal.cc DEPS) +cc_library( + flags + SRCS flags.cc + DEPS gflags boost) +cc_library( + denormal + SRCS denormal.cc + DEPS) -cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) +cc_test( + errors_test + SRCS errors_test.cc + DEPS errors enforce) set(enforce_deps flags errors boost flags phi_enforce) if(WITH_GPU) set(enforce_deps ${enforce_deps} external_error_proto) endif() -cc_library(enforce INTERFACE SRCS enforce.cc DEPS ${enforce_deps}) +cc_library( + enforce INTERFACE + SRCS enforce.cc + DEPS ${enforce_deps}) cc_library(monitor SRCS monitor.cc) -cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce) +cc_test( + enforce_test + SRCS enforce_test.cc + DEPS stringpiece enforce) set(CPU_INFO_DEPS gflags glog enforce) -IF(WITH_XBYAK) - list(APPEND CPU_INFO_DEPS xbyak) -ENDIF() -cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) -cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) -cc_library(os_info SRCS os_info.cc DEPS enforce) -cc_test(os_info_test SRCS os_info_test.cc DEPS os_info) - -IF(WITH_GPU) - nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) -ELSE() - cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade) -ENDIF() - -cc_library(place SRCS place.cc DEPS enforce boost phi_place) -cc_test(place_test SRCS place_test.cc DEPS place glog gflags) - -IF(WITH_MKLDNN) - set(MKLDNN_CTX_DEPS mkldnn) -ELSE() - set(MKLDNN_CTX_DEPS) -ENDIF() +if(WITH_XBYAK) + list(APPEND CPU_INFO_DEPS xbyak) +endif() +cc_library( + cpu_info + SRCS cpu_info.cc + DEPS ${CPU_INFO_DEPS}) +cc_test( + cpu_info_test + SRCS cpu_info_test.cc + DEPS cpu_info) +cc_library( + os_info + SRCS os_info.cc + DEPS enforce) +cc_test( + os_info_test + SRCS os_info_test.cc + DEPS os_info) + +if(WITH_GPU) + nv_library( + cuda_graph_with_memory_pool + SRCS cuda_graph_with_memory_pool.cc + DEPS device_context allocator_facade cuda_graph) +else() + cc_library( + cuda_graph_with_memory_pool + SRCS cuda_graph_with_memory_pool.cc + DEPS device_context allocator_facade) +endif() + +cc_library( + place + SRCS place.cc + DEPS enforce boost phi_place) +cc_test( + place_test + SRCS place_test.cc + DEPS place glog gflags) + +if(WITH_MKLDNN) + set(MKLDNN_CTX_DEPS mkldnn) +else() + set(MKLDNN_CTX_DEPS) +endif() add_subdirectory(device) add_subdirectory(dynload) add_subdirectory(stream) -cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce) -cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper) +cc_library( + cpu_helper + SRCS cpu_helper.cc + DEPS cblas enforce) +cc_test( + cpu_helper_test + SRCS cpu_helper_test.cc + DEPS cpu_helper) set(dgc_deps "") -IF(WITH_DGC) - set(dgc_deps dgc) -ENDIF() - -IF(WITH_GPU OR WITH_ROCM) - set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream) -ENDIF() - -IF(WITH_IPU) - set(IPU_CTX_DEPS ipu_info) -ELSE() - set(IPU_CTX_DEPS) -ENDIF(WITH_IPU) - -IF(WITH_ASCEND_CL) - set(NPU_CTX_DEPS npu_stream npu_info) -ENDIF() - -IF(WITH_MLU) - set(MLU_CTX_DEPS mlu_device_context) -ENDIF() - -IF(WITH_ASCEND_CL OR WITH_MLU) -cc_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) -ENDIF() - -IF(WITH_GPU) - nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) -ENDIF() -IF(WITH_ROCM) - hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) -ENDIF() - -IF(WITH_GPU OR WITH_ROCM) +if(WITH_DGC) + set(dgc_deps dgc) +endif() + +if(WITH_GPU OR WITH_ROCM) + set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream) +endif() + +if(WITH_IPU) + set(IPU_CTX_DEPS ipu_info) +else() + set(IPU_CTX_DEPS) +endif(WITH_IPU) + +if(WITH_ASCEND_CL) + set(NPU_CTX_DEPS npu_stream npu_info) +endif() + +if(WITH_MLU) + set(MLU_CTX_DEPS mlu_device_context) +endif() + +if(WITH_ASCEND_CL OR WITH_MLU) + cc_library( + stream_callback_manager + SRCS stream_callback_manager.cc + DEPS simple_threadpool enforce) +endif() + +if(WITH_GPU) + nv_library( + stream_callback_manager + SRCS stream_callback_manager.cc + DEPS simple_threadpool enforce) +endif() +if(WITH_ROCM) + hip_library( + stream_callback_manager + SRCS stream_callback_manager.cc + DEPS simple_threadpool enforce) +endif() + +if(WITH_GPU OR WITH_ROCM) set(STREAM_CALLBACK_DEPS stream_callback_manager) -ELSEIF(WITH_ASCEND_CL) +elseif(WITH_ASCEND_CL) set(STREAM_CALLBACK_DEPS stream_callback_manager) -ELSE() +else() set(STREAM_CALLBACK_DEPS) -ENDIF() +endif() if(WITH_GLOO) - cc_library(gloo_context SRCS gloo_context.cc DEPS framework_proto gloo_wrapper enforce) + cc_library( + gloo_context + SRCS gloo_context.cc + DEPS framework_proto gloo_wrapper enforce) endif() -cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) +cc_library( + cudnn_workspace_helper + SRCS cudnn_workspace_helper.cc + DEPS boost) # separate init from device_context to avoid cycle dependencies -cc_library(init SRCS init.cc DEPS device_context custom_kernel context_pool) +cc_library( + init + SRCS init.cc + DEPS device_context custom_kernel context_pool) # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies -cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} - place phi_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} - ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context generator) +cc_library( + device_context + SRCS device_context.cc + DEPS simple_threadpool + malloc + xxhash + ${STREAM_CALLBACK_DEPS} + place + phi_place + eigen3 + stringpiece + cpu_helper + cpu_info + framework_proto + ${IPU_CTX_DEPS} + ${GPU_CTX_DEPS} + ${NPU_CTX_DEPS} + ${MKLDNN_CTX_DEPS} + ${dgc_deps} + dlpack + cudnn_workspace_helper + ${XPU_CTX_DEPS} + ${MLU_CTX_DEPS} + eigen3 + cpu_context + generator) if(WITH_XPU) target_link_libraries(device_context xpu_context xpu_resource_pool) endif() -cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) +cc_library( + collective_helper + SRCS collective_helper.cc gen_comm_id_helper.cc + DEPS framework_proto device_context enforce) if(WITH_ASCEND_CL) - target_link_libraries(collective_helper npu_collective_helper) + target_link_libraries(collective_helper npu_collective_helper) endif() if(WITH_CNCL) - target_link_libraries(collective_helper mlu_collective_helper) + target_link_libraries(collective_helper mlu_collective_helper) endif() if(WITH_GPU OR WITH_ROCM) - target_link_libraries(device_context gpu_info gpu_context phi_gpu_info) - target_link_libraries(device_context gpu_resource_pool) + target_link_libraries(device_context gpu_info gpu_context phi_gpu_info) + target_link_libraries(device_context gpu_resource_pool) endif() -if (WITH_CUSTOM_DEVICE) - target_link_libraries(device_context custom_context) +if(WITH_CUSTOM_DEVICE) + target_link_libraries(device_context custom_context) endif() if(WITH_ASCEND_CL) - target_link_libraries(device_context npu_resource_pool) + target_link_libraries(device_context npu_resource_pool) endif() if(WITH_MLU) - target_link_libraries(device_context mlu_resource_pool) + target_link_libraries(device_context mlu_resource_pool) endif() if(WITH_CUSTOM_DEVICE) - target_link_libraries(device_context custom_context) + target_link_libraries(device_context custom_context) endif() -cc_test(init_test SRCS init_test.cc DEPS device_context) +cc_test( + init_test + SRCS init_test.cc + DEPS device_context) # Manage all device event library set(DEVICE_EVENT_LIBS) -cc_library(device_event_base SRCS device_event_base.cc DEPS place enforce device_context op_registry) -set(DEVICE_EVENT_LIBS device_event_base CACHE INTERNAL "device event libs") - +cc_library( + device_event_base + SRCS device_event_base.cc + DEPS place enforce device_context op_registry) +set(DEVICE_EVENT_LIBS + device_event_base + CACHE INTERNAL "device event libs") if(WITH_GPU) - nv_library(device_event_gpu SRCS device_event_gpu.cc DEPS device_event_base) - set(DEVICE_EVENT_LIBS device_event_gpu CACHE INTERNAL "device event libs") - nv_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu) - - nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) - nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) + nv_library( + device_event_gpu + SRCS device_event_gpu.cc + DEPS device_event_base) + set(DEVICE_EVENT_LIBS + device_event_gpu + CACHE INTERNAL "device event libs") + nv_test( + device_event_test + SRCS device_event_test.cc + DEPS device_event_gpu) + + nv_test( + device_context_test + SRCS device_context_test.cu + DEPS device_context gpu_info) + nv_test( + transform_test + SRCS transform_test.cu + DEPS memory place device_context) endif() if(WITH_ROCM) - hip_library(device_event_gpu SRCS device_event_gpu.cc DEPS device_event_base) - set(DEVICE_EVENT_LIBS device_event_gpu CACHE INTERNAL "device event libs") - hip_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu) - - hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) - hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context) + hip_library( + device_event_gpu + SRCS device_event_gpu.cc + DEPS device_event_base) + set(DEVICE_EVENT_LIBS + device_event_gpu + CACHE INTERNAL "device event libs") + hip_test( + device_event_test + SRCS device_event_test.cc + DEPS device_event_gpu) + + hip_test( + device_context_test + SRCS device_context_test.cu + DEPS device_context gpu_info) + hip_test( + transform_test + SRCS transform_test.cu + DEPS memory place device_context) endif() cc_library(timer SRCS timer.cc) -cc_test(timer_test SRCS timer_test.cc DEPS timer) - -cc_library(lodtensor_printer SRCS lodtensor_printer.cc DEPS ddim place tensor scope lod_tensor variable_helper framework_proto) -cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer) +cc_test( + timer_test + SRCS timer_test.cc + DEPS timer) + +cc_library( + lodtensor_printer + SRCS lodtensor_printer.cc + DEPS ddim + place + tensor + scope + lod_tensor + variable_helper + framework_proto) +cc_test( + lodtensor_printer_test + SRCS lodtensor_printer_test.cc + DEPS lodtensor_printer) add_subdirectory(profiler) -cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) +cc_library( + device_tracer + SRCS device_tracer.cc + DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) - nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler stats) - nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) + nv_library( + profiler + SRCS profiler.cc profiler.cu + DEPS os_info + device_tracer + gpu_info + enforce + dynload_cuda + new_profiler + stats) + nv_library( + device_memory_aligment + SRCS device_memory_aligment.cc + DEPS cpu_info gpu_info place) elseif(WITH_ROCM) - hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler stats) - hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) + hip_library( + profiler + SRCS profiler.cc profiler.cu + DEPS os_info device_tracer gpu_info enforce new_profiler stats) + hip_library( + device_memory_aligment + SRCS device_memory_aligment.cc + DEPS cpu_info gpu_info place) else() - cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler stats) - cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) + cc_library( + profiler + SRCS profiler.cc + DEPS os_info device_tracer enforce new_profiler stats) + cc_library( + device_memory_aligment + SRCS device_memory_aligment.cc + DEPS cpu_info place) endif() -cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) -cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) -cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) -cc_test(complex_test SRCS complex_test.cc DEPS lod_tensor) +cc_test( + profiler_test + SRCS profiler_test.cc + DEPS profiler) +cc_test( + float16_test + SRCS float16_test.cc + DEPS lod_tensor) +cc_test( + bfloat16_test + SRCS bfloat16_test.cc + DEPS lod_tensor) +cc_test( + complex_test + SRCS complex_test.cc + DEPS lod_tensor) -IF(WITH_GPU) - nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) - nv_test(bfloat16_gpu_test SRCS bfloat16_test.cu DEPS lod_tensor) - nv_test(complex_gpu_test SRCS complex_test.cu DEPS lod_tensor) - nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) - nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) -ENDIF() +if(WITH_GPU) + nv_test( + float16_gpu_test + SRCS float16_test.cu + DEPS lod_tensor) + nv_test( + bfloat16_gpu_test + SRCS bfloat16_test.cu + DEPS lod_tensor) + nv_test( + complex_gpu_test + SRCS complex_test.cu + DEPS lod_tensor) + nv_test( + test_limit_gpu_memory + SRCS test_limit_gpu_memory.cu + DEPS gpu_info flags) + nv_library( + cuda_device_guard + SRCS cuda_device_guard.cc + DEPS gpu_info) +endif() -IF(WITH_ROCM) - hip_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) - hip_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) - hip_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) -ENDIF() +if(WITH_ROCM) + hip_test( + float16_gpu_test + SRCS float16_test.cu + DEPS lod_tensor) + hip_test( + test_limit_gpu_memory + SRCS test_limit_gpu_memory.cu + DEPS gpu_info flags) + hip_library( + cuda_device_guard + SRCS cuda_device_guard.cc + DEPS gpu_info) +endif() if(NOT APPLE AND NOT WIN32) - cc_library(device_code SRCS device_code.cc DEPS device_context) + cc_library( + device_code + SRCS device_code.cc + DEPS device_context) if(WITH_GPU OR WITH_ROCM) - cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor) + cc_test( + device_code_test + SRCS device_code_test.cc + DEPS device_code lod_tensor) endif() endif() diff --git a/paddle/fluid/platform/aligned_vector.h b/paddle/fluid/platform/aligned_vector.h index 6d48917ba1f6d..b42ae15405e7f 100644 --- a/paddle/fluid/platform/aligned_vector.h +++ b/paddle/fluid/platform/aligned_vector.h @@ -43,11 +43,11 @@ HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { } /* -* Only the address of input data is the multiplier of 1,2,4, vectorized load -* with corresponding multiplier-value is possible. Moreover, the maximum length -* of vectorized load is 128 bits once. Hence, valid length of vectorized load -* shall be determined under both former constraints. -*/ + * Only the address of input data is the multiplier of 1,2,4, vectorized load + * with corresponding multiplier-value is possible. Moreover, the maximum length + * of vectorized load is 128 bits once. Hence, valid length of vectorized load + * shall be determined under both former constraints. + */ template int GetVectorizedSize(const T* pointer) { constexpr int max_load_bits = 128; @@ -58,11 +58,11 @@ int GetVectorizedSize(const T* pointer) { constexpr int vec2 = std::alignment_of>::value; // NOLINT if (address % vec8 == 0) { /* - * Currently, decide to deal with no more than 4 data once while adopting - * vectorization load/store, if performance test shows that dealing with - * 8 data once in vectorization load/store does get optimized, return code - * below can be changed into " return std::min(8, valid_vec_size); " . - */ + * Currently, decide to deal with no more than 4 data once while adopting + * vectorization load/store, if performance test shows that dealing with + * 8 data once in vectorization load/store does get optimized, return code + * below can be changed into " return std::min(8, valid_vec_size); " . + */ return std::min(4, valid_vec_size); } else if (address % vec4 == 0) { return std::min(4, valid_vec_size); diff --git a/paddle/fluid/platform/bfloat16_test.cc b/paddle/fluid/platform/bfloat16_test.cc index 794c1ff684c8d..f824716ab9224 100644 --- a/paddle/fluid/platform/bfloat16_test.cc +++ b/paddle/fluid/platform/bfloat16_test.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/bfloat16.h" + #include "paddle/phi/kernels/funcs/eigen/extensions.h" #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu index 391b91487fa8a..c5f38cf94eedb 100644 --- a/paddle/fluid/platform/bfloat16_test.cu +++ b/paddle/fluid/platform/bfloat16_test.cu @@ -17,7 +17,9 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include #include + #include + #include "paddle/fluid/framework/lod_tensor.h" #if defined(PADDLE_CUDA_BF16) diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index d05de900e5e77..8f0e4204772f8 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/platform/collective_helper.h" + #include #include "paddle/fluid/memory/allocation/allocator_facade.h" diff --git a/paddle/fluid/platform/complex_test.cc b/paddle/fluid/platform/complex_test.cc index c7ded7587172e..3547631064d39 100644 --- a/paddle/fluid/platform/complex_test.cc +++ b/paddle/fluid/platform/complex_test.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/platform/complex.h" + #include + #include "paddle/phi/kernels/funcs/eigen/extensions.h" #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h diff --git a/paddle/fluid/platform/complex_test.cu b/paddle/fluid/platform/complex_test.cu index 08ec75878b827..b814bcde6841f 100644 --- a/paddle/fluid/platform/complex_test.cu +++ b/paddle/fluid/platform/complex_test.cu @@ -18,6 +18,7 @@ #include #include #include + #include #include diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index e486044486571..c32af3b37a409 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -31,6 +31,7 @@ limitations under the License. */ #endif // _WIN32 #include + #include "paddle/fluid/platform/flags.h" DECLARE_double(fraction_of_cpu_memory_to_use); diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index 4804d3f6ed301..4ef2a9709a59d 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -13,26 +13,37 @@ // limitations under the License. #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device_context.h" +DECLARE_bool(use_stream_safe_cuda_allocator); + namespace paddle { namespace platform { #ifdef PADDLE_WITH_CUDA void BeginCUDAGraphCapture(platform::CUDAPlace place, - cudaStreamCaptureMode mode) { + cudaStreamCaptureMode mode, int64_t pool_id) { auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place); dev_ctx->cudnn_workspace_handle().ResetWorkspace(); auto stream = dev_ctx->stream(); CUDAGraph::BeginCapture(place, stream, mode); - auto id = CUDAGraph::CapturingID(); + + auto old_value = FLAGS_use_stream_safe_cuda_allocator; + if (old_value) { + FLAGS_use_stream_safe_cuda_allocator = false; + } + pool_id = CUDAGraph::SetMemoryPoolID(pool_id); memory::allocation::AllocatorFacade::Instance().PrepareMemoryPoolForCUDAGraph( - id); - AddResetCallbackIfCapturingCUDAGraph([id] { + pool_id); + if (old_value) { + FLAGS_use_stream_safe_cuda_allocator = true; + } + AddResetCallbackIfCapturingCUDAGraph([pool_id] { memory::allocation::AllocatorFacade::Instance().RemoveMemoryPoolOfCUDAGraph( - id); + pool_id); }); } diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.h b/paddle/fluid/platform/cuda_graph_with_memory_pool.h index 7a9e1a3a1419c..b8831126be052 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.h +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.h @@ -23,10 +23,51 @@ namespace paddle { namespace platform { +#ifdef PADDLE_WITH_CUDA +#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL( \ + __cond, __kernel_func, __grid, __block, __sm_size, __stream, __seed_inc, \ + __seed_expr, __offset_expr, ...) \ + do { \ + if (::paddle::platform::CUDAGraph::IsThisThreadCapturing() && (__cond)) { \ + using __Helper = \ + ::paddle::platform::IsSameKernelHelper; \ + auto *dev_ctx = \ + ::paddle::platform::DeviceContextPool::Instance().GetByPlace( \ + ::paddle::platform::CUDAGraph::CapturingPlace()); \ + auto __set_seed_func = \ + [=](::paddle::platform::CUDAKernelParams *__params, \ + bool __check_only) -> bool { \ + if (__check_only) { \ + return __params->func() == &__kernel_func && \ + __Helper::Compare(*__params, __VA_ARGS__); \ + } \ + auto &KERNEL_PARAMS = *__params; \ + uint64_t __seed, __offset; \ + ::paddle::operators::GetSeedDataAndIncrement( \ + *dev_ctx, nullptr, false, 0, __seed_inc, &__seed, &__offset); \ + __seed_expr = static_cast(__seed); \ + __offset_expr = static_cast(__offset); \ + return true; \ + }; \ + ::paddle::platform::CUDAGraph::RecordRandomKernelInfo(__set_seed_func); \ + } \ + __kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__); \ + } while (0) +#else +#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL( \ + __cond, __kernel_func, __grid, __block, __sm_size, __stream, __seed_inc, \ + __seed_expr, __offset_expr, ...) \ + do { \ + __kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__); \ + } while (0) +#endif + // NOTE: These APIs are not thread-safe. #ifdef PADDLE_WITH_CUDA void BeginCUDAGraphCapture(platform::CUDAPlace place, - cudaStreamCaptureMode mode); + cudaStreamCaptureMode mode, + int64_t pool_id = CUDAGraph::kInvalidPoolID); std::unique_ptr EndCUDAGraphCapture(); #endif diff --git a/paddle/fluid/platform/denormal.cc b/paddle/fluid/platform/denormal.cc index 4af156d1577dd..4cfb082544322 100644 --- a/paddle/fluid/platform/denormal.cc +++ b/paddle/fluid/platform/denormal.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/platform/denormal.h" + #include #include diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index cbf3fdd263b48..62745883023cb 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -1,27 +1,26 @@ - set(DEV_LIBS custom_device) # GPU -IF(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM) add_subdirectory(gpu) -ENDIF() +endif() # XPU -IF(WITH_XPU) +if(WITH_XPU) add_subdirectory(xpu) -ENDIF() +endif() # NPU -IF(WITH_ASCEND OR WITH_ASCEND_CL) +if(WITH_ASCEND OR WITH_ASCEND_CL) add_subdirectory(npu) -ENDIF() +endif() # IPU -IF(WITH_IPU) +if(WITH_IPU) add_subdirectory(ipu) -ENDIF() +endif() # MLU -IF(WITH_MLU) +if(WITH_MLU) add_subdirectory(mlu) -ENDIF() +endif() diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index f7c13ec7ed5ed..66120f55f7cdc 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -1,15 +1,30 @@ -IF(WITH_GPU) - add_subdirectory(cuda) - nv_library(gpu_info SRCS gpu_info.cc DEPS phi_gpu_info gflags glog enforce monitor dynload_cuda) +if(WITH_GPU) + add_subdirectory(cuda) + nv_library( + gpu_info + SRCS gpu_info.cc + DEPS phi_gpu_info gflags glog enforce monitor dynload_cuda) - nv_test(cuda_helper_test SRCS cuda_helper_test.cu) - nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) -ELSEIF(WITH_ROCM) - add_subdirectory(rocm) - hip_library(gpu_info SRCS gpu_info.cc DEPS phi_gpu_info gflags glog enforce monitor dynload_cuda) + nv_test(cuda_helper_test SRCS cuda_helper_test.cu) + nv_test( + cudnn_desc_test + SRCS cudnn_desc_test.cc + DEPS dynload_cuda) +elseif(WITH_ROCM) + add_subdirectory(rocm) + hip_library( + gpu_info + SRCS gpu_info.cc + DEPS phi_gpu_info gflags glog enforce monitor dynload_cuda) - hip_test(cuda_helper_test SRCS cuda_helper_test.cu) - hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) -ENDIF() + hip_test(cuda_helper_test SRCS cuda_helper_test.cu) + hip_test( + cudnn_desc_test + SRCS cudnn_desc_test.cc + DEPS dynload_cuda) +endif() -cc_library(gpu_resource_pool SRCS gpu_resource_pool.cc DEPS gpu_info) +cc_library( + gpu_resource_pool + SRCS gpu_resource_pool.cc + DEPS gpu_info) diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt index 85050038d5a83..da9121550e07a 100644 --- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -1,4 +1,13 @@ -nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) -nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) +nv_library( + cuda_graph + SRCS cuda_graph.cc + DEPS enforce allocator_facade) +nv_library( + cuda_profiler + SRCS cuda_profiler.cc + DEPS enforce) -nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda phi) +nv_test( + cudnn_helper_test + SRCS cudnn_helper_test.cc + DEPS dynload_cuda phi) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc index 8ee3b118c32f2..c5a515ce43611 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc @@ -14,12 +14,79 @@ #include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" +#include +#include +#include + namespace paddle { namespace platform { std::unique_ptr CUDAGraph::capturing_graph_{nullptr}; paddle::optional CUDAGraph::capturing_thread_id_{paddle::none}; +static std::vector ToposortCUDAGraph(cudaGraph_t graph) { + size_t num_nodes; + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphGetNodes(graph, nullptr, &num_nodes)); + std::vector nodes(num_nodes); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaGraphGetNodes(graph, nodes.data(), &num_nodes)); + + size_t num_edges; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaGraphGetEdges(graph, nullptr, nullptr, &num_edges)); + std::vector from(num_edges), to(num_edges); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaGraphGetEdges(graph, from.data(), to.data(), &num_edges)); + + std::unordered_map> + in_edges, out_edges; + for (auto node : nodes) { + in_edges[node]; + out_edges[node]; + } + + for (size_t i = 0; i < num_edges; ++i) { + in_edges[to[i]].insert(from[i]); + out_edges[from[i]].insert(to[i]); + } + + std::queue q; + for (const auto &pair : in_edges) { + if (pair.second.empty()) { + q.push(pair.first); + } + } + + nodes.clear(); + while (!q.empty()) { + auto cur = q.front(); + q.pop(); + nodes.push_back(cur); + + for (auto out_node : out_edges.at(cur)) { + auto &in_nodes = in_edges.at(out_node); + in_nodes.erase(cur); + if (in_nodes.empty()) { + q.push(out_node); + } + } + } + PADDLE_ENFORCE_EQ( + nodes.size(), num_nodes, + phi::errors::InvalidArgument("Toposort error, this may be a bug.")); + return nodes; +} + +CUDAGraphID CUDAGraph::UniqueID() { + static std::atomic id; + return id.fetch_add(1); +} + +int64_t CUDAGraph::UniqueMemoryPoolID() { + static std::atomic id(CUDAGraph::kDefaultPoolID + 1); + return id.fetch_add(1); +} + void CUDAGraph::Reset() { if (is_reset_) return; #if CUDA_VERSION >= 10010 @@ -46,9 +113,16 @@ void CUDAGraph::Replay() { PADDLE_ENFORCE_EQ(is_reset_, false, errors::PermissionDenied( "Cannot replay the CUDA Graph after reset is called.")); - for (auto exec_graph : exec_graphs_) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphLaunch(exec_graph, stream_)); + size_t n = exec_graphs_.size(); + for (size_t i = 0; i < n; ++i) { + if (!is_first_run_) { + for (auto &hook : pre_hooks_[i]) { + hook(exec_graphs_[i]); + } + } + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphLaunch(exec_graphs_[i], stream_)); } + is_first_run_ = false; #endif } @@ -72,7 +146,8 @@ void CUDAGraph::BeginSegmentCapture() { platform::errors::PermissionDenied( "CUDA Graph should not be invalidated.")); VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_ - << ", segment id " << capturing_graph_->graphs_.size(); + << ", segment id " << capturing_graph_->graphs_.size() + << ", memory pool id " << capturing_graph_->pool_id_; #endif } @@ -112,15 +187,57 @@ void CUDAGraph::EndSegmentCapture() { if (num_nodes == 0) { PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphDestroy(graph)); VLOG(10) << "Skip empty CUDA Graph with ID " << capturing_graph_->id_ - << ", segment id " << capturing_graph_->graphs_.size(); + << ", segment id " << capturing_graph_->graphs_.size() + << ", memory pool id " << capturing_graph_->pool_id_; return; } + auto sorted_nodes = ToposortCUDAGraph(graph); + capturing_graph_->pre_hooks_.emplace_back(); + std::unordered_set visited; + VLOG(10) << "SetSeedFunc number : " + << capturing_graph_->set_seed_funcs_.size(); + for (const auto &set_seed_func : capturing_graph_->set_seed_funcs_) { + bool found = false; + for (auto node : sorted_nodes) { + if (visited.count(node) > 0) continue; + cudaGraphNodeType type; + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphNodeGetType(node, &type)); + if (type == cudaGraphNodeTypeKernel) { + cudaKernelNodeParams params; + auto err = cudaGraphKernelNodeGetParams(node, ¶ms); + if (err == cudaErrorInvalidDeviceFunction) { + continue; + } else { + PADDLE_ENFORCE_GPU_SUCCESS(err); + } + CUDAKernelParams kernel_params(¶ms); + if (set_seed_func(&kernel_params, true)) { + capturing_graph_->pre_hooks_.back().push_back( + [set_seed_func, node, params](cudaGraphExec_t exec_graph) { + CUDAKernelParams kernel_params(¶ms); + set_seed_func(&kernel_params, false); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphExecKernelNodeSetParams( + exec_graph, node, ¶ms)); + }); + visited.insert(node); + found = true; + break; + } + } + } + PADDLE_ENFORCE_EQ(found, true, + phi::errors::InvalidArgument( + "Cannot find the corresponding random CUDA kernel.")); + } + capturing_graph_->set_seed_funcs_.clear(); + cudaGraphExec_t exec_graph; PADDLE_ENFORCE_GPU_SUCCESS( cudaGraphInstantiate(&exec_graph, graph, nullptr, nullptr, 0)); VLOG(10) << "End to capture CUDA Graph with ID " << capturing_graph_->id_ - << ", segment id " << capturing_graph_->graphs_.size(); + << ", segment id " << capturing_graph_->graphs_.size() + << ", memory pool id " << capturing_graph_->pool_id_; capturing_graph_->graphs_.emplace_back(graph); capturing_graph_->exec_graphs_.emplace_back(exec_graph); #endif diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h index ca1e7abb375cb..b3704fc628adc 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h @@ -20,10 +20,10 @@ #include #include #include + #include "cuda.h" // NOLINT #include "cuda_runtime.h" // NOLINT #include "paddle/fluid/platform/device/gpu/gpu_types.h" - #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" @@ -32,6 +32,69 @@ namespace paddle { namespace platform { +template +static bool IsBitwiseEqual(const T &x, const T &y) { + return std::memcmp(&x, &y, sizeof(T)) == 0; +} + +class CUDAKernelParams { + public: + explicit CUDAKernelParams(const cudaKernelNodeParams *params) + : params_(params) {} + + const void *func() const { return params_->func; } + + template + T &As(size_t idx) const { + return *reinterpret_cast(params_->kernelParams[idx]); + } + + private: + const cudaKernelNodeParams *params_; +}; + +template +struct IsSameKernelHelper; + +template +struct IsSameKernelHelper { + private: + using FuncArgsTuple = decltype(std::make_tuple(std::declval()...)); + + template + struct Impl { + static bool Compare(const CUDAKernelParams ¶ms, const TupleT &args) { + using CompareT = typename std::tuple_element::type; + if (!IsBitwiseEqual(params.As(IDX), + std::get(args))) { + return false; + } + + constexpr auto NewIsEnd = (IDX + 1 == std::tuple_size::value); + return Impl::Compare(params, args); + } + }; + + template + struct Impl { + static bool Compare(const CUDAKernelParams ¶ms, const TupleT &args) { + return true; + } + }; + + public: + template + static bool Compare(const CUDAKernelParams ¶ms, Args... args) { + constexpr auto kNumArgs = sizeof...(FuncArgs); + static_assert(kNumArgs == sizeof...(Args), "Argument number not match"); + + auto args_tuple = std::make_tuple(args...); + using TupleT = typename std::decay::type; + return Impl::Compare(params, args_tuple); + } +}; + #if CUDA_VERSION >= 10010 static void ThrowErrorIfNotSupportCUDAGraph() {} #else @@ -61,10 +124,35 @@ class CUDAGraph { } public: + static constexpr int64_t kDefaultPoolID = 0; + static constexpr int64_t kInvalidPoolID = -1; + ~CUDAGraph() { Reset(); } CUDAGraphID ID() const { return id_; } + static int64_t SetMemoryPoolID(int64_t pool_id) { + auto &pool_id_ = capturing_graph_->pool_id_; + PADDLE_ENFORCE_EQ( + pool_id_, kInvalidPoolID, + phi::errors::InvalidArgument("Cannot reset memory pool id twice, the " + "former memory pool id is %d.", + pool_id_)); + if (pool_id <= kInvalidPoolID) { + pool_id_ = UniqueMemoryPoolID(); + } else { + PADDLE_ENFORCE_GE( + pool_id, kDefaultPoolID, + phi::errors::InvalidArgument("Invalid memory pool id %d.", pool_id)); + pool_id_ = pool_id; + } + return pool_id_; + } + + int64_t PoolID() const { return pool_id_; } + + static int64_t CapturingPoolID() { return capturing_graph_->pool_id_; } + void Replay(); void Reset(); @@ -120,12 +208,17 @@ class CUDAGraph { } } - private: - static CUDAGraphID UniqueID() { - static std::atomic id; - return id.fetch_add(1); + using SetSeedFunc = std::function; + static void RecordRandomKernelInfo(SetSeedFunc set_seed_func) { + std::lock_guard guard(capturing_graph_->func_mtx_); + capturing_graph_->set_seed_funcs_.emplace_back(std::move(set_seed_func)); } + static int64_t UniqueMemoryPoolID(); + + private: + static CUDAGraphID UniqueID(); + private: #if CUDA_VERSION >= 10010 std::vector graphs_; @@ -135,10 +228,17 @@ class CUDAGraph { cudaStream_t stream_{nullptr}; platform::CUDAPlace place_; CUDAGraphID id_; + int64_t pool_id_{kInvalidPoolID}; std::vector> callbacks_; bool is_reset_{false}; std::mutex mtx_; + std::vector set_seed_funcs_; + std::vector>> pre_hooks_; + std::mutex func_mtx_; + + bool is_first_run_{true}; + static paddle::optional capturing_thread_id_; static std::unique_ptr capturing_graph_; }; diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h index a32db3a9921e3..7185d2356aae5 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h @@ -68,7 +68,7 @@ namespace platform { * } * } * -*/ + */ #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc index 851d0d18c604c..86c72769eb56e 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc @@ -15,13 +15,13 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES #define GOOGLE_GLOG_DLL_DECL -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - #include +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + TEST(CudnnHelper, ScopedTensorDescriptor) { - using paddle::platform::ScopedTensorDescriptor; using paddle::platform::DataLayout; + using paddle::platform::ScopedTensorDescriptor; ScopedTensorDescriptor tensor_desc; std::vector shape = {2, 4, 6, 6}; @@ -65,8 +65,8 @@ TEST(CudnnHelper, ScopedTensorDescriptor) { } TEST(CudnnHelper, ScopedFilterDescriptor) { - using paddle::platform::ScopedFilterDescriptor; using paddle::platform::DataLayout; + using paddle::platform::ScopedFilterDescriptor; ScopedFilterDescriptor filter_desc; std::vector shape = {2, 3, 3}; @@ -129,8 +129,8 @@ TEST(CudnnHelper, ScopedConvolutionDescriptor) { } TEST(CudnnHelper, ScopedPoolingDescriptor) { - using paddle::platform::ScopedPoolingDescriptor; using paddle::platform::PoolingMode; + using paddle::platform::ScopedPoolingDescriptor; ScopedPoolingDescriptor pool_desc; std::vector src_kernel = {2, 2, 5}; diff --git a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu index ab8bb2cad8c51..28c0e0ef9acf8 100644 --- a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu +++ b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu @@ -13,6 +13,7 @@ // limitations under the License. #include + #include #include #ifdef _WIN32 @@ -22,13 +23,12 @@ #define PADDLE_CUDA_FP16 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/device/gpu/gpu_helper.h" - -using paddle::platform::PADDLE_CUDA_NUM_THREADS; using paddle::platform::float16; +using paddle::platform::PADDLE_CUDA_NUM_THREADS; template __global__ void AddKernel(const T* data_a, T* data_b, size_t num) { diff --git a/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc b/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc index 8ea30027e8ade..2e58e71cc2c06 100644 --- a/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc +++ b/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - #include +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 8c04e935134c7..6b302d2449da5 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "gflags/gflags.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" @@ -100,8 +101,9 @@ static size_t GpuAllocSize(bool realloc) { size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb : FLAGS_initial_gpu_memory_in_mb; size_t alloc_bytes = - (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * - FLAGS_fraction_of_gpu_memory_to_use); + (flag_mb > 0ul + ? flag_mb << 20 + : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use); PADDLE_ENFORCE_GE( available_to_alloc, alloc_bytes, platform::errors::ResourceExhausted("Not enough available GPU memory.")); diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h index 94b47cca948e6..3a97797c98260 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -14,6 +14,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include + #include #include #include diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index 80d60ca95bf6f..5cacdfcb12f03 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -25,9 +25,11 @@ #endif #include + #include #include #include + #include "paddle/fluid/platform/device_context.h" #ifdef __HIPCC__ @@ -93,9 +95,9 @@ struct GpuLaunchConfig { }; /* According to NVIDIA, if number of threads per block is 64/128/256/512, - * cuda performs better. And number of blocks should be greater (at least - * 2x~4x) than number of SMs. Hence, SM count is took into account within - * this function to determine the right number of threads per block. */ + * cuda performs better. And number of blocks should be greater (at least + * 2x~4x) than number of SMs. Hence, SM count is took into account within + * this function to determine the right number of threads per block. */ inline GpuLaunchConfig GetGpuLaunchConfig1D( const platform::CUDADeviceContext& context, int64_t numel, int vec_size = 1) { @@ -143,14 +145,16 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D( inline GpuLaunchConfig GetGpuLaunchConfig2D( const platform::CUDADeviceContext& context, int x_dim, int y_dim) { - PADDLE_ENFORCE_GT(x_dim, 0, platform::errors::InvalidArgument( - "x dim number should greater than 0," - " but received value is: %d", - x_dim)); - PADDLE_ENFORCE_GT(y_dim, 0, platform::errors::InvalidArgument( - "y dim number should greater than 0," - " but received value is: %d", - y_dim)); + PADDLE_ENFORCE_GT( + x_dim, 0, + platform::errors::InvalidArgument("x dim number should greater than 0," + " but received value is: %d", + x_dim)); + PADDLE_ENFORCE_GT( + y_dim, 0, + platform::errors::InvalidArgument("y dim number should greater than 0," + " but received value is: %d", + y_dim)); const int kThreadsPerBlock = 256; int block_cols = (std::min)(x_dim, kThreadsPerBlock); diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index 803674779e756..a0e9d459721fd 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #endif #include + #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index 2c55eb972b765..56fdb0da34057 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -14,6 +14,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" + #include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index d0b48eca5021b..2cadd55d2dc77 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -19,11 +19,13 @@ #ifdef PADDLE_WITH_HIP #include + #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" #else #include + #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cublasLt.h" #include "paddle/fluid/platform/dynload/cudnn.h" diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 61ea0fd3cd293..b9e612b98def9 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -16,6 +16,7 @@ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include + #include #include #include // NOLINT @@ -31,6 +32,8 @@ #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" #endif +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" @@ -52,6 +55,10 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { return ncclFloat16; } else if (type == framework::proto::VarType::INT8) { return ncclInt8; +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + } else if (type == framework::proto::VarType::BF16) { + return ncclBfloat16; +#endif } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); @@ -69,6 +76,10 @@ inline ncclDataType_t ToNCCLDataType(experimental::DataType type) { return ncclInt64; } else if (type == experimental::DataType::FLOAT16) { return ncclFloat16; +#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 + } else if (type == experimental::DataType::BFLOAT16) { + return ncclBfloat16; +#endif } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); @@ -254,7 +265,7 @@ class NCCLCommunicator { *allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So *create a new nccl comm for sync_batch_norm_op. And these codes should be *polished with a unified nccl management. - */ + */ NCCLContextMap *GetSyncBatchNormCtx( framework::Scope *scope, const std::vector &places) { auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); diff --git a/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt index 988807258c123..070312adbc2e6 100644 --- a/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt @@ -1 +1,4 @@ -hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda) +hip_test( + miopen_helper_test + SRCS miopen_helper_test.cc + DEPS dynload_cuda) diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc b/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc index 13cf52dc2c6a3..e99fc7f37a8f8 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc @@ -15,13 +15,13 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES #define GOOGLE_GLOG_DLL_DECL -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - #include +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + TEST(MIOpenHelper, ScopedTensorDescriptor) { - using paddle::platform::ScopedTensorDescriptor; using paddle::platform::DataLayout; + using paddle::platform::ScopedTensorDescriptor; ScopedTensorDescriptor tensor_desc; std::vector shape = {2, 4, 6, 6}; diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h index a0f3fb0f73ba5..c0f6f173a798a 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h @@ -65,7 +65,7 @@ namespace platform { * } * } * -*/ + */ #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \ diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index 7712ede8fd210..29f2a2955e0c2 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -1,35 +1,42 @@ if(WITH_IPU) set(paddle_ipu_handler ${CMAKE_CURRENT_BINARY_DIR}/paddle_ipu_handler.h.tmp) set(paddle_ipu_handler_final ${CMAKE_CURRENT_BINARY_DIR}/paddle_ipu_handler.h) - file(WRITE ${paddle_ipu_handler} "// Auto generated from CMake. DO NOT EDIT!\n\n") + file(WRITE ${paddle_ipu_handler} + "// Auto generated from CMake. DO NOT EDIT!\n\n") file(APPEND ${paddle_ipu_handler} "\#pragma once\n") - file(APPEND ${paddle_ipu_handler} "\#include \"paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h\"\n\n") - file(GLOB POPART_CANONICALIZATION_SRC ${CMAKE_CURRENT_SOURCE_DIR}/popart_canonicalization/*.cc) + file( + APPEND ${paddle_ipu_handler} + "\#include \"paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h\"\n\n" + ) + file(GLOB POPART_CANONICALIZATION_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/popart_canonicalization/*.cc) copy_if_different(${paddle_ipu_handler} ${paddle_ipu_handler_final}) foreach(file_path ${POPART_CANONICALIZATION_SRC}) file(READ ${file_path} file_content) - string(REGEX MATCHALL "(REGISTER_HANDLER)(\\()([A-Za-z0-9_]+)(,)" op_handlers ${file_content}) + string(REGEX MATCHALL "(REGISTER_HANDLER)(\\()([A-Za-z0-9_]+)(,)" + op_handlers ${file_content}) string(REPLACE "REGISTER_HANDLER(" "" op_handlers "${op_handlers}") string(REPLACE "," "" op_handlers "${op_handlers}") foreach(op_handler ${op_handlers}) file(APPEND ${paddle_ipu_handler} "USE_HANDLER(${op_handler});\n") endforeach() endforeach() - - set(IPU_BACKEND_SRC - "ipu_strategy.cc" - "ipu_executor.cc" - "ipu_compiler.cc" - "ipu_backend.cc" - "ipu_utils.cc" - ) - set(IPU_INFO_SRC - "ipu_info.cc" - "ipu_device.cc" - ) - cc_library(popart_canonicalization SRCS ${POPART_CANONICALIZATION_SRC} DEPS graph) - cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper popdist popart_canonicalization) - cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart-only enforce) + set(IPU_BACKEND_SRC "ipu_strategy.cc" "ipu_executor.cc" "ipu_compiler.cc" + "ipu_backend.cc" "ipu_utils.cc") + set(IPU_INFO_SRC "ipu_info.cc" "ipu_device.cc") + + cc_library( + popart_canonicalization + SRCS ${POPART_CANONICALIZATION_SRC} + DEPS graph) + cc_library( + ipu_backend + SRCS ${IPU_BACKEND_SRC} + DEPS popart-only graph graph_helper popdist popart_canonicalization) + cc_library( + ipu_info + SRCS ${IPU_INFO_SRC} + DEPS popart-only enforce) endif() diff --git a/paddle/fluid/platform/device/ipu/ipu_device.cc b/paddle/fluid/platform/device/ipu/ipu_device.cc index 2d0381cb8b3ea..f6de526c90090 100644 --- a/paddle/fluid/platform/device/ipu/ipu_device.cc +++ b/paddle/fluid/platform/device/ipu/ipu_device.cc @@ -45,9 +45,10 @@ int GetNumDevices() { } int num_devices = popart::DeviceManager::createDeviceManager().enumerateDevices().size(); - PADDLE_ENFORCE_GT(num_devices, 0, platform::errors::Unavailable( - "Do not found any IPU devices, please " - "make sure Poplar sdk is enabled")); + PADDLE_ENFORCE_GT( + num_devices, 0, + platform::errors::Unavailable("Do not found any IPU devices, please " + "make sure Poplar sdk is enabled")); return num_devices; } diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index d490334ee33f5..30c9bc2094a8a 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -243,7 +243,8 @@ void Executor::AcquireDevice() { VLOG(10) << "Create IPU model device..."; std::map deviceOpts{ { - "numIPUs", std::to_string(ipu_strategy_->num_ipus), + "numIPUs", + std::to_string(ipu_strategy_->num_ipus), }, {"ipuVersion", "ipu2"}, }; @@ -254,7 +255,8 @@ void Executor::AcquireDevice() { VLOG(10) << "Create offline device..."; std::map deviceOpts{ { - "numIPUs", std::to_string(ipu_strategy_->num_ipus), + "numIPUs", + std::to_string(ipu_strategy_->num_ipus), }, {"ipuVersion", "ipu2"}, }; diff --git a/paddle/fluid/platform/device/ipu/ipu_info.h b/paddle/fluid/platform/device/ipu/ipu_info.h index fe7076e0b50b6..06ef070ed65ea 100644 --- a/paddle/fluid/platform/device/ipu/ipu_info.h +++ b/paddle/fluid/platform/device/ipu/ipu_info.h @@ -13,6 +13,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_IPU #include #include + #include "glog/logging.h" namespace paddle { diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index da08c76fb90d1..0e17a485afb01 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/platform/device/ipu/ipu_utils.h" #include "paddle/fluid/platform/enforce.h" @@ -143,10 +144,11 @@ class IpuStrategy { std::map> &options, // NOLINT const std::string &type_str) { auto it = options.find(key); - PADDLE_ENFORCE_NE(it, options.end(), platform::errors::InvalidArgument( - "Cannot find option: %s, type: %s " - "when setting IpuStrategy options", - key, type_str)); + PADDLE_ENFORCE_NE( + it, options.end(), + platform::errors::InvalidArgument("Cannot find option: %s, type: %s " + "when setting IpuStrategy options", + key, type_str)); it->second(value); } diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc index 254e566567424..1d5fe8c329f11 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc @@ -57,14 +57,14 @@ Node *gelu_handler(Graph *graph, Node *node) { {{"value", std::vector{1.4142135623730951}}, {"dims", std::vector{1}}, {"dtype", GetOutputVarDType(node)}}); - auto zero_point_five = - CreateConst(graph, node, {}, {}, {{"value", std::vector{0.5}}, - {"dims", std::vector{1}}, - {"dtype", GetOutputVarDType(node)}}); - auto one = - CreateConst(graph, node, {}, {}, {{"value", std::vector{1}}, - {"dims", std::vector{1}}, - {"dtype", GetOutputVarDType(node)}}); + auto zero_point_five = CreateConst(graph, node, {}, {}, + {{"value", std::vector{0.5}}, + {"dims", std::vector{1}}, + {"dtype", GetOutputVarDType(node)}}); + auto one = CreateConst(graph, node, {}, {}, + {{"value", std::vector{1}}, + {"dims", std::vector{1}}, + {"dtype", GetOutputVarDType(node)}}); auto div = CreateBaseOp(graph, node, "popart_div", {GetInputVarNode("X", node), sqrt2->outputs[0]}, {}, {}); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc index af72f84c9d771..9b91abc4a67af 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -44,9 +44,10 @@ Node *pow_handler(Graph *graph, Node *node) { MakeConstAttrMapFromValue(value_, {1}, GetOutputVarDType(node)); auto new_node_const = CreateConst(graph, node, {}, {}, attrs); - return CreateBaseOp(graph, node, "popart_pow", {GetInputVarNode("X", node), - new_node_const->outputs[0]}, - node->outputs); + return CreateBaseOp( + graph, node, "popart_pow", + {GetInputVarNode("X", node), new_node_const->outputs[0]}, + node->outputs); } } @@ -380,10 +381,10 @@ Node *cumsum_handler(Graph *graph, Node *node) { auto reverse = BOOST_GET_CONST(bool, op->GetAttr("reverse")); int64_t popart_reverse = 1 ? reverse : 0; auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); - auto axis_node = - CreateConst(graph, node, {}, {}, {{"value", std::vector{axis}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::INT64}}); + auto axis_node = CreateConst(graph, node, {}, {}, + {{"value", std::vector{axis}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); return CreateBaseOp( graph, node, "popart_cumsum", {GetInputVarNode("X", node), axis_node->outputs[0]}, diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc index 2e9913f58efbb..bce6bac88e204 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc @@ -35,20 +35,20 @@ Node *conv2d_handler(Graph *graph, Node *node) { auto stride_ = BOOST_GET_CONST(std::vector, op->GetAttr("strides")); auto stride = std::vector{stride_.begin(), stride_.end()}; if (!op->Input("Bias").empty()) { - return CreateConv( - graph, node, - { - GetInputVarNode("Input", node), GetInputVarNode("Filter", node), - GetInputVarNode("Bias", node), - }, - node->outputs, dilations, group_, {}, pads, stride); + return CreateConv(graph, node, + { + GetInputVarNode("Input", node), + GetInputVarNode("Filter", node), + GetInputVarNode("Bias", node), + }, + node->outputs, dilations, group_, {}, pads, stride); } else { - return CreateConv( - graph, node, - { - GetInputVarNode("Input", node), GetInputVarNode("Filter", node), - }, - node->outputs, dilations, group_, {}, pads, stride); + return CreateConv(graph, node, + { + GetInputVarNode("Input", node), + GetInputVarNode("Filter", node), + }, + node->outputs, dilations, group_, {}, pads, stride); } } @@ -148,15 +148,16 @@ Node *pool2d_handler(Graph *graph, Node *node) { auto dilations = std::vector{}; int64_t storage_order = 0; return CreateBaseOp(graph, node, "popart_maxpool", node->inputs, - node->outputs, { - {"num_outputs", num_outputs}, - {"kernel_shape", kernel_shape}, - {"ceil_mode", ceil_mode}, - {"dilations", dilations}, - {"pads", pads}, - {"storage_order", storage_order}, - {"strides", strides}, - }); + node->outputs, + { + {"num_outputs", num_outputs}, + {"kernel_shape", kernel_shape}, + {"ceil_mode", ceil_mode}, + {"dilations", dilations}, + {"pads", pads}, + {"storage_order", storage_order}, + {"strides", strides}, + }); } else if (pooling_type == "avg") { int64_t count_include_pad = 0; return CreateBaseOp(graph, node, "popart_averagepool", node->inputs, diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc index 0525bb66f1618..b51d923bfcf5c 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc @@ -173,8 +173,9 @@ Node *CreateConv(Graph *graph, Node *node, const std::vector &inputs, Node *CreateSoftmaxOpset11(Graph *graph, Node *node, const std::vector &inputs, const std::vector &outputs, int64_t axis) { - PADDLE_ENFORCE_EQ(inputs.size(), 1, platform::errors::InvalidArgument( - "Softmax op only support one input")); + PADDLE_ENFORCE_EQ( + inputs.size(), 1, + platform::errors::InvalidArgument("Softmax op only support one input")); auto x_shape = inputs[0]->Var()->GetShape(); int x_rank = x_shape.size(); if (axis < 0) { diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc index aec89a1cf0d82..77ce2f3166914 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc @@ -69,10 +69,10 @@ Node *topk_handler(Graph *graph, Node *node) { var_k = GetInputVarNode("K", node); } else { auto k = BOOST_GET_CONST(int, op->GetAttr("k")); - auto *op_k = - CreateConst(graph, node, {}, {}, {{"value", std::vector{k}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::INT64}}); + auto *op_k = CreateConst(graph, node, {}, {}, + {{"value", std::vector{k}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); var_k = op_k->outputs[0]; } diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc index 00926ee7a0b25..bf32744d5a542 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc @@ -61,7 +61,9 @@ Node *fill_constant_handler(Graph *graph, Node *node) { } return CreateConst(graph, node, node->inputs, node->outputs, AttributeMap{ - {"value", value}, {"dims", dims}, {"dtype", dtype}, + {"value", value}, + {"dims", dims}, + {"dtype", dtype}, }); } @@ -76,13 +78,14 @@ Node *gaussian_random_handler(Graph *graph, Node *node) { auto seed_ = BOOST_GET_CONST(int, op->GetAttr("seed")); auto seed = static_cast(seed_); return CreateBaseOp(graph, node, "popart_randomnormal", node->inputs, - node->outputs, { - {"shape", shape}, - {"dtype", dtype}, - {"mean", mean}, - {"scale", scale}, - {"seed", seed}, - }); + node->outputs, + { + {"shape", shape}, + {"dtype", dtype}, + {"mean", mean}, + {"scale", scale}, + {"seed", seed}, + }); } Node *uniform_random_handler(Graph *graph, Node *node) { @@ -96,13 +99,14 @@ Node *uniform_random_handler(Graph *graph, Node *node) { auto seed_ = BOOST_GET_CONST(int, op->GetAttr("seed")); auto seed = static_cast(seed_); return CreateBaseOp(graph, node, "popart_randomuniform", node->inputs, - node->outputs, { - {"shape", shape}, - {"dtype", dtype}, - {"high", high}, - {"low", low}, - {"seed", seed}, - }); + node->outputs, + { + {"shape", shape}, + {"dtype", dtype}, + {"high", high}, + {"low", low}, + {"seed", seed}, + }); } Node *transpose_handler(Graph *graph, Node *node) { @@ -204,32 +208,33 @@ Node *lookup_table_op_handler(Graph *graph, Node *node, if (padding_idx_ >= 0 && padding_idx_ < table_size_) { std::vector const_value_(emb_size_, 0); std::vector const_shape_{1, emb_size_}; - auto concat_const = - CreateConst(graph, node, {}, {}, {{"value", const_value_}, - {"dims", const_shape_}, - {"dtype", GetOutputVarDType(node)}}); - auto axes = - CreateConst(graph, node, {}, {}, {{"value", std::vector{0}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::INT64}}); - auto step = - CreateConst(graph, node, {}, {}, {{"value", std::vector{1}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::INT64}}); - - auto left_start = - CreateConst(graph, node, {}, {}, {{"value", std::vector{0}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::INT64}}); + auto concat_const = CreateConst(graph, node, {}, {}, + {{"value", const_value_}, + {"dims", const_shape_}, + {"dtype", GetOutputVarDType(node)}}); + auto axes = CreateConst(graph, node, {}, {}, + {{"value", std::vector{0}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); + auto step = CreateConst(graph, node, {}, {}, + {{"value", std::vector{1}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); + + auto left_start = CreateConst(graph, node, {}, {}, + {{"value", std::vector{0}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); auto left_end = CreateConst(graph, node, {}, {}, {{"value", std::vector{padding_idx_}}, {"dims", std::vector{1}}, {"dtype", ONNXDataType::INT64}}); - auto right_start = CreateConst( - graph, node, {}, {}, {{"value", std::vector{padding_idx_ + 1}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::INT64}}); + auto right_start = + CreateConst(graph, node, {}, {}, + {{"value", std::vector{padding_idx_ + 1}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); auto right_end = CreateConst(graph, node, {}, {}, {{"value", std::vector{table_size_}}, {"dims", std::vector{1}}, @@ -471,7 +476,9 @@ Node *assign_value_handler(Graph *graph, Node *node) { } return CreateConst(graph, node, node->inputs, node->outputs, AttributeMap{ - {"value", values}, {"dims", dims}, {"dtype", dtype}, + {"value", values}, + {"dims", dims}, + {"dtype", dtype}, }); } @@ -529,10 +536,10 @@ Node *one_hot_handler(Graph *graph, Node *node) { {{"value", std::vector{depth}}, {"dims", std::vector{1}}, {"dtype", ONNXDataType::INT64}}); - auto value_tensor = - CreateConst(graph, node, {}, {}, {{"value", std::vector{0, 1}}, - {"dims", std::vector{2}}, - {"dtype", ONNXDataType::FLOAT}}); + auto value_tensor = CreateConst(graph, node, {}, {}, + {{"value", std::vector{0, 1}}, + {"dims", std::vector{2}}, + {"dtype", ONNXDataType::FLOAT}}); return CreateBaseOp(graph, node, "popart_onehot", {GetInputVarNode("X", node), depth_tensor->outputs[0], value_tensor->outputs[0]}, @@ -550,21 +557,21 @@ Node *one_hot_v2_handler(Graph *graph, Node *node) { PADDLE_THROW(platform::errors::Unimplemented( "Do not support allow_out_of_range=True")); } else { - auto depth_tensor = - CreateConst(graph, node, {}, {}, {{"value", std::vector{depth}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::INT32}}); + auto depth_tensor = CreateConst(graph, node, {}, {}, + {{"value", std::vector{depth}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT32}}); Node *value_tensor = nullptr; if (GetOutputVarNode("Out", node)->Var()->GetDataType() == VarType::FP16) { - value_tensor = - CreateConst(graph, node, {}, {}, {{"value", std::vector{0, 1}}, - {"dims", std::vector{2}}, - {"dtype", ONNXDataType::FLOAT16}}); + value_tensor = CreateConst(graph, node, {}, {}, + {{"value", std::vector{0, 1}}, + {"dims", std::vector{2}}, + {"dtype", ONNXDataType::FLOAT16}}); } else { - value_tensor = - CreateConst(graph, node, {}, {}, {{"value", std::vector{0, 1}}, - {"dims", std::vector{2}}, - {"dtype", ONNXDataType::FLOAT}}); + value_tensor = CreateConst(graph, node, {}, {}, + {{"value", std::vector{0, 1}}, + {"dims", std::vector{2}}, + {"dtype", ONNXDataType::FLOAT}}); } return CreateBaseOp(graph, node, "popart_onehot", diff --git a/paddle/fluid/platform/device/mlu/CMakeLists.txt b/paddle/fluid/platform/device/mlu/CMakeLists.txt index 1f3a7670849c2..08b33c9b58f06 100644 --- a/paddle/fluid/platform/device/mlu/CMakeLists.txt +++ b/paddle/fluid/platform/device/mlu/CMakeLists.txt @@ -1,12 +1,32 @@ - if(NOT WITH_MLU) - return() + return() endif() -cc_test(mlu_enforce_test SRCS enforce_test.cc DEPS stringpiece) -cc_library(mlu_info SRCS mlu_info.cc DEPS enforce glog monitor neuware_lib) -cc_library(mlu_stream SRCS mlu_stream.cc DEPS boost mlu_info stream_callback_manager eigen3 ${MKLDNN_CTX_DEPS}) -cc_library(mlu_device_context SRCS device_context.cc DEPS mlu_stream) -cc_test(mlu_device_context_test SRCS device_context_test.cc DEPS mlu_device_context) -cc_library(mlu_collective_helper SRCS mlu_collective_helper.cc DEPS mlu_stream mlu_info) -cc_library(mlu_resource_pool SRCS mlu_resource_pool.cc DEPS mlu_info) +cc_test( + mlu_enforce_test + SRCS enforce_test.cc + DEPS stringpiece) +cc_library( + mlu_info + SRCS mlu_info.cc + DEPS enforce glog monitor neuware_lib) +cc_library( + mlu_stream + SRCS mlu_stream.cc + DEPS boost mlu_info stream_callback_manager eigen3 ${MKLDNN_CTX_DEPS}) +cc_library( + mlu_device_context + SRCS device_context.cc + DEPS mlu_stream) +cc_test( + mlu_device_context_test + SRCS device_context_test.cc + DEPS mlu_device_context) +cc_library( + mlu_collective_helper + SRCS mlu_collective_helper.cc + DEPS mlu_stream mlu_info) +cc_library( + mlu_resource_pool + SRCS mlu_resource_pool.cc + DEPS mlu_info) diff --git a/paddle/fluid/platform/device/mlu/cncl_helper.h b/paddle/fluid/platform/device/mlu/cncl_helper.h index 2f9bed0142641..634e420d5ce53 100644 --- a/paddle/fluid/platform/device/mlu/cncl_helper.h +++ b/paddle/fluid/platform/device/mlu/cncl_helper.h @@ -16,8 +16,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_CNCL #include - #include + #include #include #include // NOLINT diff --git a/paddle/fluid/platform/device/mlu/device_context.h b/paddle/fluid/platform/device/mlu/device_context.h index 120916b4f5c56..d607b1e12f5a7 100644 --- a/paddle/fluid/platform/device/mlu/device_context.h +++ b/paddle/fluid/platform/device/mlu/device_context.h @@ -12,6 +12,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_MLU #include + #include "paddle/fluid/platform/device/mlu/enforce.h" #include "paddle/fluid/platform/device/mlu/mlu_stream.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/platform/device/mlu/device_context_test.cc b/paddle/fluid/platform/device/mlu/device_context_test.cc index 5caaa9dec1e4b..41f79c7092ea4 100644 --- a/paddle/fluid/platform/device/mlu/device_context_test.cc +++ b/paddle/fluid/platform/device/mlu/device_context_test.cc @@ -20,9 +20,9 @@ limitations under the License. */ TEST(Device, Init) { using paddle::platform::DeviceContext; + using paddle::platform::MLUContext; using paddle::platform::MLUDeviceContext; using paddle::platform::MLUPlace; - using paddle::platform::MLUContext; int count = paddle::platform::GetMLUDeviceCount(); for (int i = 0; i < count; i++) { @@ -34,9 +34,9 @@ TEST(Device, Init) { } TEST(Device, MLUDeviceContext) { + using paddle::mluCnnlHandle; using paddle::platform::MLUDeviceContext; using paddle::platform::MLUPlace; - using paddle::mluCnnlHandle; int count = paddle::platform::GetMLUDeviceCount(); for (int i = 0; i < count; i++) { @@ -48,9 +48,9 @@ TEST(Device, MLUDeviceContext) { } TEST(Device, MLUStream) { + using paddle::mluStream; using paddle::platform::MLUDeviceContext; using paddle::platform::MLUPlace; - using paddle::mluStream; int count = paddle::platform::GetMLUDeviceCount(); for (int i = 0; i < count; i++) { @@ -62,11 +62,11 @@ TEST(Device, MLUStream) { } TEST(Device, DeviceContextPool) { + using paddle::platform::CPUPlace; using paddle::platform::DeviceContextPool; using paddle::platform::MLUDeviceContext; - using paddle::platform::Place; - using paddle::platform::CPUPlace; using paddle::platform::MLUPlace; + using paddle::platform::Place; DeviceContextPool& pool = DeviceContextPool::Instance(); auto cpu_dev_ctx1 = pool.Get(CPUPlace()); diff --git a/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc b/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc index 7708267c1bc72..4051caac1c800 100644 --- a/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc +++ b/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc @@ -14,6 +14,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CNCL) #include + #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/mlu/enforce.h" diff --git a/paddle/fluid/platform/device/mlu/mlu_info.cc b/paddle/fluid/platform/device/mlu/mlu_info.cc index 7cad99bf5d22d..e3672707210fb 100644 --- a/paddle/fluid/platform/device/mlu/mlu_info.cc +++ b/paddle/fluid/platform/device/mlu/mlu_info.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/mlu/mlu_info.h" + #include #include + #include "gflags/gflags.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/mlu/enforce.h" @@ -187,8 +189,9 @@ static size_t MLUAllocSize(bool realloc) { size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb : FLAGS_initial_gpu_memory_in_mb; size_t alloc_bytes = - (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * - FLAGS_fraction_of_gpu_memory_to_use); + (flag_mb > 0ul + ? flag_mb << 20 + : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use); PADDLE_ENFORCE_GE( available_to_alloc, alloc_bytes, platform::errors::ResourceExhausted("Not enough available MLU memory.")); diff --git a/paddle/fluid/platform/device/mlu/mlu_stream.cc b/paddle/fluid/platform/device/mlu/mlu_stream.cc index 7a27a49250a1e..f570cc77e5a97 100644 --- a/paddle/fluid/platform/device/mlu/mlu_stream.cc +++ b/paddle/fluid/platform/device/mlu/mlu_stream.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/mlu/mlu_stream.h" + #include "paddle/fluid/platform/device/mlu/device_context.h" namespace paddle { diff --git a/paddle/fluid/platform/device/npu/CMakeLists.txt b/paddle/fluid/platform/device/npu/CMakeLists.txt index 52db36d131ec2..9015a76e9cd5a 100644 --- a/paddle/fluid/platform/device/npu/CMakeLists.txt +++ b/paddle/fluid/platform/device/npu/CMakeLists.txt @@ -3,13 +3,31 @@ add_subdirectory(dynload) if(WITH_ASCEND) - cc_library(ascend_npu_info SRCS ascend_npu_info.cc DEPS gflags glog enforce atlas_acl) + cc_library( + ascend_npu_info + SRCS ascend_npu_info.cc + DEPS gflags glog enforce atlas_acl) endif() if(WITH_ASCEND_CL) - cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor ascendcl acl_op_compiler) - cc_library(npu_resource_pool SRCS npu_resource_pool.cc DEPS npu_info) - cc_library(npu_stream SRCS npu_stream.cc DEPS enforce boost stream_callback_manager) - cc_library(npu_collective_helper SRCS npu_collective_helper.cc DEPS npu_stream npu_info data_type) - cc_library(npu_op_runner SRCS npu_op_runner.cc DEPS operator npu_info) + cc_library( + npu_info + SRCS npu_info.cc + DEPS gflags glog enforce monitor ascendcl acl_op_compiler) + cc_library( + npu_resource_pool + SRCS npu_resource_pool.cc + DEPS npu_info) + cc_library( + npu_stream + SRCS npu_stream.cc + DEPS enforce boost stream_callback_manager) + cc_library( + npu_collective_helper + SRCS npu_collective_helper.cc + DEPS npu_stream npu_info data_type) + cc_library( + npu_op_runner + SRCS npu_op_runner.cc + DEPS operator npu_info) endif() diff --git a/paddle/fluid/platform/device/npu/ascend_npu_info.cc b/paddle/fluid/platform/device/npu/ascend_npu_info.cc index c100b2d0a1740..a9204ac3fca50 100644 --- a/paddle/fluid/platform/device/npu/ascend_npu_info.cc +++ b/paddle/fluid/platform/device/npu/ascend_npu_info.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/npu/ascend_npu_info.h" + #include + #include "acl/acl_rt.h" namespace paddle { diff --git a/paddle/fluid/platform/device/npu/dynload/CMakeLists.txt b/paddle/fluid/platform/device/npu/dynload/CMakeLists.txt index 7232d51a602b3..9f36942524bf3 100644 --- a/paddle/fluid/platform/device/npu/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/device/npu/dynload/CMakeLists.txt @@ -1,3 +1,6 @@ if(WITH_ASCEND_CL) - cc_library(npu_hccl SRCS hccl.cc DEPS dynamic_loader warpctc) + cc_library( + npu_hccl + SRCS hccl.cc + DEPS dynamic_loader warpctc) endif() diff --git a/paddle/fluid/platform/device/npu/dynload/hccl.h b/paddle/fluid/platform/device/npu/dynload/hccl.h index 3d7587bfa266b..ae140dd295067 100644 --- a/paddle/fluid/platform/device/npu/dynload/hccl.h +++ b/paddle/fluid/platform/device/npu/dynload/hccl.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" diff --git a/paddle/fluid/platform/device/npu/enforce_npu.h b/paddle/fluid/platform/device/npu/enforce_npu.h index 3887ee4866af8..243926868631d 100644 --- a/paddle/fluid/platform/device/npu/enforce_npu.h +++ b/paddle/fluid/platform/device/npu/enforce_npu.h @@ -17,10 +17,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_ASCEND_CL #include -#include "paddle/fluid/platform/enforce.h" - #include "acl/acl.h" #include "hccl/hccl_types.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/npu/hccl_helper.h b/paddle/fluid/platform/device/npu/hccl_helper.h index 134ec04030d75..107fe5989ddba 100644 --- a/paddle/fluid/platform/device/npu/hccl_helper.h +++ b/paddle/fluid/platform/device/npu/hccl_helper.h @@ -17,6 +17,7 @@ #ifdef PADDLE_WITH_ASCEND_CL #include + #include #include #include // NOLINT @@ -24,11 +25,10 @@ #include #include -#include "paddle/fluid/platform/device/npu/dynload/hccl.h" -#include "paddle/fluid/platform/device/npu/enforce_npu.h" - #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/dynload/hccl.h" +#include "paddle/fluid/platform/device/npu/enforce_npu.h" #include "paddle/fluid/platform/float16.h" #define HCCL_ID_VARNAME "HCCLID" diff --git a/paddle/fluid/platform/device/npu/npu_collective_helper.cc b/paddle/fluid/platform/device/npu/npu_collective_helper.cc index cdec3519a23f3..77528fe19fcb4 100644 --- a/paddle/fluid/platform/device/npu/npu_collective_helper.cc +++ b/paddle/fluid/platform/device/npu/npu_collective_helper.cc @@ -14,6 +14,7 @@ #if defined(PADDLE_WITH_ASCEND_CL) #include + #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/enforce_npu.h" diff --git a/paddle/fluid/platform/device/npu/npu_info.cc b/paddle/fluid/platform/device/npu/npu_info.cc index b5516944b750e..2688c88f55773 100644 --- a/paddle/fluid/platform/device/npu/npu_info.cc +++ b/paddle/fluid/platform/device/npu/npu_info.cc @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/npu/npu_info.h" + #include #include #include #include "gflags/gflags.h" - #include "paddle/fluid/platform/lock_guard_ptr.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/monitor.h" @@ -153,8 +153,9 @@ static size_t NPUAllocSize(bool realloc) { size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb : FLAGS_initial_gpu_memory_in_mb; size_t alloc_bytes = - (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * - FLAGS_fraction_of_gpu_memory_to_use); + (flag_mb > 0ul + ? flag_mb << 20 + : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use); PADDLE_ENFORCE_GE( available_to_alloc, alloc_bytes, platform::errors::ResourceExhausted("Not enough available NPU memory.")); diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.cc b/paddle/fluid/platform/device/npu/npu_op_runner.cc index 72169ae303b4c..d38443acca3a3 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.cc +++ b/paddle/fluid/platform/device/npu/npu_op_runner.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include "acl/acl.h" #include "acl/acl_op_compiler.h" - #include "paddle/fluid/framework/framework.pb.h" DECLARE_string(npu_precision_mode); diff --git a/paddle/fluid/platform/device/npu/npu_resource_pool.cc b/paddle/fluid/platform/device/npu/npu_resource_pool.cc index d837e90c3c42c..e7c302289dbfe 100644 --- a/paddle/fluid/platform/device/npu/npu_resource_pool.cc +++ b/paddle/fluid/platform/device/npu/npu_resource_pool.cc @@ -14,6 +14,7 @@ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_resource_pool.h" + #include "paddle/fluid/platform/device/npu/npu_info.h" namespace paddle { diff --git a/paddle/fluid/platform/device/npu/npu_stream.cc b/paddle/fluid/platform/device/npu/npu_stream.cc index 0b15a0d937e82..55a73146815c9 100644 --- a/paddle/fluid/platform/device/npu/npu_stream.cc +++ b/paddle/fluid/platform/device/npu/npu_stream.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/npu/npu_stream.h" + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index 3399fff087f8d..19656bf1cce64 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -2,11 +2,32 @@ if(NOT WITH_XPU) return() endif() -set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl) +set(XPU_CTX_DEPS + xpulib + ssl + crypto + rt + z + resolv + dl) - -cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place phi_xpu_info) -cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context op_kernel_type) -cc_library(xpu_resource_pool SRCS xpu_resource_pool.cc DEPS xpu_info) +cc_library( + xpu_info + SRCS xpu_info.cc + DEPS gflags + glog + enforce + xpulib + device_context + place + phi_xpu_info) +cc_library( + xpu_op_list + SRCS xpu_op_list.cc + DEPS gflags glog enforce xpulib device_context op_kernel_type) +cc_library( + xpu_resource_pool + SRCS xpu_resource_pool.cc + DEPS xpu_info) add_subdirectory(tests) diff --git a/paddle/fluid/platform/device/xpu/bkcl_helper.h b/paddle/fluid/platform/device/xpu/bkcl_helper.h index 24fd8b5faa4e9..a7a3e4f060529 100644 --- a/paddle/fluid/platform/device/xpu/bkcl_helper.h +++ b/paddle/fluid/platform/device/xpu/bkcl_helper.h @@ -17,6 +17,7 @@ #pragma once #include + #include #include #include // NOLINT @@ -217,7 +218,7 @@ class BKCLCommunicator { *bkcl_all_reduce *parallelly. So create a new bkcl comm for sync_batch_norm_op. And these *codes should be polished with a unified bkcl management. - */ + */ BKCLContextMap *GetSyncBatchNormCtx( framework::Scope *scope, const std::vector &places) { auto *bkcl_id_var = scope->FindVar(BKCL_ID_VARNAME); diff --git a/paddle/fluid/platform/device/xpu/enforce_xpu.h b/paddle/fluid/platform/device/xpu/enforce_xpu.h index c55d91c301550..77d14aa712e70 100644 --- a/paddle/fluid/platform/device/xpu/enforce_xpu.h +++ b/paddle/fluid/platform/device/xpu/enforce_xpu.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/fluid/platform/device/xpu/xpu_header.h" - #include "paddle/phi/backends/xpu/enforce_xpu.h" namespace paddle { diff --git a/paddle/fluid/platform/device/xpu/tests/CMakeLists.txt b/paddle/fluid/platform/device/xpu/tests/CMakeLists.txt index 6d98fefcf8317..e51896df6159a 100644 --- a/paddle/fluid/platform/device/xpu/tests/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/tests/CMakeLists.txt @@ -1 +1,4 @@ -cc_test(enforce_xpu_test SRCS enforce_xpu_test.cc DEPS stringpiece) +cc_test( + enforce_xpu_test + SRCS enforce_xpu_test.cc + DEPS stringpiece) diff --git a/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc b/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc index 8cba98f3fb352..0b528c3999e07 100644 --- a/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc +++ b/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" + #include "gtest/gtest.h" template diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 99f8e5ace9c00..b94d0353e5dd5 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -38,6 +38,11 @@ XPUOpMap& get_kl2_ops() { {"argsort", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"assign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace())})}, {"assign_value", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"batch_norm_grad", @@ -209,6 +214,8 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"generate_proposals_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"greater_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index cdd7ee7f806e9..dbc8ed4a51aaf 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -13,14 +13,13 @@ limitations under the License. */ #include #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" - #include "paddle/phi/backends/xpu/xpu_info.h" namespace paddle { diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 38b4defadc6c3..2dd0f3275309e 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -12,6 +12,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include + #include "paddle/fluid/platform/place.h" #include "paddle/phi/backends/xpu/xpu_info.h" #include "xpu/runtime.h" diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index 778c18146d64d..452f388f03dcf 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -113,6 +113,12 @@ XPUOpMap& get_kp_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_amax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_amin", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"c_sync_calc_stream", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"c_sync_comm_stream", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"c_allreduce_sum", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu_kp_kernels; diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index 0738514336201..8ace4d1a32c50 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -9,6 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" + #include #include #include @@ -17,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device/xpu/xpu2_op_list.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h" -#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc index a4226dabf9d52..4ee32ad5a03cd 100644 --- a/paddle/fluid/platform/device_code.cc +++ b/paddle/fluid/platform/device_code.cc @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/platform/device_code.h" + #include + #include #include #include -#include "paddle/fluid/platform/device_code.h" #include "paddle/fluid/platform/enforce.h" DECLARE_string(cuda_dir); diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc index 7da8c56138543..cb2649686ec02 100644 --- a/paddle/fluid/platform/device_code_test.cc +++ b/paddle/fluid/platform/device_code_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device_code.h" + #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/init.h" diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 09a29c3429cba..0bd606257f541 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device_context.h" + #include #include #include + #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index a63d41405f1b2..d0dae706ba572 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -21,13 +21,12 @@ limitations under the License. */ #include #include +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/core/device_context.h" - -#include "paddle/fluid/memory/malloc.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 08a04a9565af7..2db29dc11ada0 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -11,18 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/device_context.h" - #include #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/platform/device_context.h" TEST(Device, Init) { - using paddle::platform::DeviceContext; using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; + using paddle::platform::DeviceContext; int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { @@ -94,11 +93,11 @@ TEST(Device, CUDADeviceContext) { } TEST(Device, DeviceContextPool) { - using paddle::platform::DeviceContextPool; - using paddle::platform::CUDADeviceContext; - using paddle::platform::Place; using paddle::platform::CPUPlace; + using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; + using paddle::platform::DeviceContextPool; + using paddle::platform::Place; DeviceContextPool& pool = DeviceContextPool::Instance(); auto cpu_dev_ctx1 = pool.Get(CPUPlace()); diff --git a/paddle/fluid/platform/device_context_xpu_test.cc b/paddle/fluid/platform/device_context_xpu_test.cc index 3de2e3957a990..50cb0f98d334f 100644 --- a/paddle/fluid/platform/device_context_xpu_test.cc +++ b/paddle/fluid/platform/device_context_xpu_test.cc @@ -11,12 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/device_context.h" - #include #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/platform/device_context.h" TEST(Device, Init) { using paddle::platform::DeviceContext; @@ -33,10 +32,10 @@ TEST(Device, Init) { } TEST(Device, DeviceContextPool) { + using paddle::platform::CPUPlace; using paddle::platform::DeviceContextPool; - using paddle::platform::XPUDeviceContext; using paddle::platform::Place; - using paddle::platform::CPUPlace; + using paddle::platform::XPUDeviceContext; using paddle::platform::XPUPlace; DeviceContextPool& pool = DeviceContextPool::Instance(); diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index 463329d32c936..82d93dee3989f 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -23,8 +23,8 @@ * for USE_PASS from pass_library. */ -using ::paddle::platform::kCUDA; using ::paddle::platform::kCPU; +using ::paddle::platform::kCUDA; USE_EVENT(kCPU) USE_EVENT_WAIT(kCPU, kCPU) diff --git a/paddle/fluid/platform/device_event_base.cc b/paddle/fluid/platform/device_event_base.cc index 67fad3857f2c1..374de7d923f30 100644 --- a/paddle/fluid/platform/device_event_base.cc +++ b/paddle/fluid/platform/device_event_base.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/platform/device_event_base.h" + #include "paddle/fluid/platform/device_event_cpu.h" #include "paddle/fluid/platform/event.h" diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h index 8fe5ef9fcb107..4e751aa6d133a 100644 --- a/paddle/fluid/platform/device_event_base.h +++ b/paddle/fluid/platform/device_event_base.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/platform/device_event_cpu.h b/paddle/fluid/platform/device_event_cpu.h index 6e2bf4c7ad135..1620dffdabd51 100644 --- a/paddle/fluid/platform/device_event_cpu.h +++ b/paddle/fluid/platform/device_event_cpu.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/platform/device_event_base.h" namespace paddle { diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index f42ccc5a1db54..f176d1a0d5dbd 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -101,8 +101,8 @@ void EventResetCUDA(const DeviceEvent* event) { } // namespace platform } // namespace paddle -using ::paddle::platform::kCUDA; using ::paddle::platform::kCPU; +using ::paddle::platform::kCUDA; REGISTER_EVENT_CREATE_FUNCTION(kCUDA, paddle::platform::DeviceEventCreateCUDA) REGISTER_EVENT_RECORD_FUNCTION(kCUDA, paddle::platform::DeviceEventRecordCUDA) REGISTER_EVENT_QUERY_FUNCTION(kCUDA, paddle::platform::DeviceEventQueryCUDA) diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc index d9f744b26256b..92fe7c02bd0bd 100644 --- a/paddle/fluid/platform/device_event_test.cc +++ b/paddle/fluid/platform/device_event_test.cc @@ -13,15 +13,16 @@ // limitations under the License. #include "paddle/fluid/platform/device_event.h" + #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/platform/place.h" -using ::paddle::platform::kCUDA; using ::paddle::platform::kCPU; +using ::paddle::platform::kCUDA; -using paddle::platform::DeviceEvent; using paddle::platform::DeviceContextPool; +using paddle::platform::DeviceEvent; #ifdef PADDLE_WITH_CUDA #include diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 73847ce24aa72..fa345ed31cbb2 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/platform/device_tracer.h" + #include #include #include @@ -20,7 +22,6 @@ limitations under the License. */ #include // NOLINT #include "glog/logging.h" -#include "paddle/fluid/platform/device_tracer.h" DECLARE_bool(enable_host_event_recorder_hook); @@ -255,7 +256,9 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, } break; } - default: { break; } + default: { + break; + } } } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { // Seems not an error in this case. diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 1f95e12127104..bba0ad35e0216 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,55 +1,89 @@ -cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce phi_dynamic_loader) +cc_library( + dynamic_loader + SRCS dynamic_loader.cc + DEPS glog gflags enforce phi_dynamic_loader) -list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc) +list( + APPEND + CUDA_SRCS + cublas.cc + cublasLt.cc + cudnn.cc + curand.cc + cusolver.cc + cusparse.cc + nvtx.cc + cufft.cc) -if (NOT WITH_NV_JETSON) - list(APPEND CUDA_SRCS nvjpeg.cc) +if(NOT WITH_NV_JETSON) + list(APPEND CUDA_SRCS nvjpeg.cc) endif() -if (WITH_ROCM) +if(WITH_ROCM) list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc) endif() # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on MacOS, and only do a early test on Linux and Windows. -if (NOT APPLE) - list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc) - if (WITH_NCCL) +if(NOT APPLE) + list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc) + if(WITH_NCCL) list(APPEND CUDA_SRCS nccl.cc) endif() - if (WITH_ROCM) + if(WITH_ROCM) list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc) - if (WITH_RCCL) + if(WITH_RCCL) list(APPEND HIP_SRCS rccl.cc) endif() endif() endif() -if (TENSORRT_FOUND) +if(TENSORRT_FOUND) list(APPEND CUDA_SRCS tensorrt.cc) endif() configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) -if (CUPTI_FOUND) - list(APPEND CUDA_SRCS cupti.cc) +if(CUPTI_FOUND) + list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) if(WITH_ROCM) - hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader phi_dynload_cuda) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi_dynload_warpctc) -elseif (WITH_ASCEND_CL) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl phi_dynload_warpctc) + hip_library( + dynload_cuda + SRCS ${HIP_SRCS} + DEPS dynamic_loader phi_dynload_cuda) + cc_library( + dynload_warpctc + SRCS warpctc.cc + DEPS dynamic_loader warpctc phi_dynload_warpctc) +elseif(WITH_ASCEND_CL) + cc_library( + dynload_warpctc + SRCS warpctc.cc + DEPS dynamic_loader warpctc npu_hccl phi_dynload_warpctc) else() - nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader phi_dynload_cuda) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi_dynload_warpctc) + nv_library( + dynload_cuda + SRCS ${CUDA_SRCS} + DEPS dynamic_loader phi_dynload_cuda) + cc_library( + dynload_warpctc + SRCS warpctc.cc + DEPS dynamic_loader warpctc phi_dynload_warpctc) endif() -if (WITH_MKLML) - cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml phi_dynload_mklml) +if(WITH_MKLML) + cc_library( + dynload_mklml + SRCS mklml.cc + DEPS dynamic_loader mklml phi_dynload_mklml) endif() # TODO(TJ): add iomp, mkldnn? -if (MKL_FOUND AND WITH_ONEMKL) +if(MKL_FOUND AND WITH_ONEMKL) message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}") - cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader phi_dynload_mklrt) + cc_library( + dynload_mklrt + SRCS mklrt.cc + DEPS dynamic_loader phi_dynload_mklrt) target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE}) endif() diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index d7d43cecc25dd..496b253dff5b3 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include // NOLINT #include diff --git a/paddle/fluid/platform/dynload/cublasLt.h b/paddle/fluid/platform/dynload/cublasLt.h index 5157cfdad2e59..3a1d28072c591 100644 --- a/paddle/fluid/platform/dynload/cublasLt.h +++ b/paddle/fluid/platform/dynload/cublasLt.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include diff --git a/paddle/fluid/platform/dynload/cuda_driver.cc b/paddle/fluid/platform/dynload/cuda_driver.cc index a0f9647f08934..c6851594b803b 100644 --- a/paddle/fluid/platform/dynload/cuda_driver.cc +++ b/paddle/fluid/platform/dynload/cuda_driver.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/cuda_driver.h" + #include "paddle/phi/backends/dynload/cuda_driver.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/cuda_driver.h b/paddle/fluid/platform/dynload/cuda_driver.h index f5550e9f9fe39..b696ffc1a3be8 100644 --- a/paddle/fluid/platform/dynload/cuda_driver.h +++ b/paddle/fluid/platform/dynload/cuda_driver.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/cuda_driver.h" diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index 553792d3bbf25..05cacb74c8673 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/cudnn.h" + #include "paddle/phi/backends/dynload/cudnn.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index bf4bb08a696ed..9af1e8065c49d 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -16,6 +16,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/cudnn.h" diff --git a/paddle/fluid/platform/dynload/cufft.cc b/paddle/fluid/platform/dynload/cufft.cc index 1996ab16167f1..6a06c4bdb6ac4 100644 --- a/paddle/fluid/platform/dynload/cufft.cc +++ b/paddle/fluid/platform/dynload/cufft.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/cufft.h" + #include "paddle/phi/backends/dynload/cufft.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/cufft.h b/paddle/fluid/platform/dynload/cufft.h index 6c3a0992d758d..d79603a5a01fc 100644 --- a/paddle/fluid/platform/dynload/cufft.h +++ b/paddle/fluid/platform/dynload/cufft.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/cufft.h" diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h index 854e5a7b9f04a..8e08785f20925 100644 --- a/paddle/fluid/platform/dynload/cupti.h +++ b/paddle/fluid/platform/dynload/cupti.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/cupti.h" diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h index 1fdd9240284dc..f4065a196d3c4 100644 --- a/paddle/fluid/platform/dynload/curand.h +++ b/paddle/fluid/platform/dynload/curand.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/curand.h" diff --git a/paddle/fluid/platform/dynload/cusolver.h b/paddle/fluid/platform/dynload/cusolver.h index 212c350ebb288..854de23150cad 100644 --- a/paddle/fluid/platform/dynload/cusolver.h +++ b/paddle/fluid/platform/dynload/cusolver.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/cusolver.h" diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h index b4b9352167829..925852bb4158b 100644 --- a/paddle/fluid/platform/dynload/cusparse.h +++ b/paddle/fluid/platform/dynload/cusparse.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/cusparse.h" diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 5ce63b244efde..2f24e1b87daba 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include + #include "gflags/gflags.h" #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/fluid/platform/dynload/hiprtc.cc b/paddle/fluid/platform/dynload/hiprtc.cc index 6c4a4bfd0dedc..d9bb3fd2c4214 100644 --- a/paddle/fluid/platform/dynload/hiprtc.cc +++ b/paddle/fluid/platform/dynload/hiprtc.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/hiprtc.h" + #include "paddle/phi/backends/dynload/hiprtc.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/hiprtc.h b/paddle/fluid/platform/dynload/hiprtc.h index 851dadbac63d2..f27d5d808f77b 100644 --- a/paddle/fluid/platform/dynload/hiprtc.h +++ b/paddle/fluid/platform/dynload/hiprtc.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include + #include // NOLINT + #include "paddle/phi/backends/dynload/hiprtc.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/miopen.cc b/paddle/fluid/platform/dynload/miopen.cc index 9660188b68d4f..15012531b4c9f 100644 --- a/paddle/fluid/platform/dynload/miopen.cc +++ b/paddle/fluid/platform/dynload/miopen.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/miopen.h" + #include "paddle/phi/backends/dynload/cudnn.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index b99cd5ebb6e15..20b92b170511c 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -14,10 +14,11 @@ limitations under the License. */ #pragma once #include - #include #include + #include // NOLINT + #include "paddle/phi/backends/dynload/miopen.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index 1c7d0c17a0fc8..78cae9a082153 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/mklml.h" diff --git a/paddle/fluid/platform/dynload/mklrt.h b/paddle/fluid/platform/dynload/mklrt.h index 334b98a1c3d5a..e1a2bedfa8e2c 100644 --- a/paddle/fluid/platform/dynload/mklrt.h +++ b/paddle/fluid/platform/dynload/mklrt.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index a38d1d4272e39..c2052719dd56c 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/nccl.h" diff --git a/paddle/fluid/platform/dynload/nvjpeg.h b/paddle/fluid/platform/dynload/nvjpeg.h index 8aaf672fe67b9..026a3b6488606 100644 --- a/paddle/fluid/platform/dynload/nvjpeg.h +++ b/paddle/fluid/platform/dynload/nvjpeg.h @@ -12,6 +12,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include + #include // NOLINT #include "paddle/phi/backends/dynload/nvjpeg.h" diff --git a/paddle/fluid/platform/dynload/nvrtc.cc b/paddle/fluid/platform/dynload/nvrtc.cc index a032299827742..242aa912ad838 100644 --- a/paddle/fluid/platform/dynload/nvrtc.cc +++ b/paddle/fluid/platform/dynload/nvrtc.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/nvrtc.h" + #include "paddle/phi/backends/dynload/nvrtc.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/nvrtc.h b/paddle/fluid/platform/dynload/nvrtc.h index 5ca8860c5acbe..e03235e116f25 100644 --- a/paddle/fluid/platform/dynload/nvrtc.h +++ b/paddle/fluid/platform/dynload/nvrtc.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/nvrtc.h" diff --git a/paddle/fluid/platform/dynload/nvtx.h b/paddle/fluid/platform/dynload/nvtx.h index 3f974eca1d00b..c3dc9e31df354 100644 --- a/paddle/fluid/platform/dynload/nvtx.h +++ b/paddle/fluid/platform/dynload/nvtx.h @@ -15,6 +15,7 @@ limitations under the License. */ #ifndef _WIN32 #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/nvtx.h" diff --git a/paddle/fluid/platform/dynload/rccl.h b/paddle/fluid/platform/dynload/rccl.h index 7bb4992c89cb9..2f874bb59f593 100644 --- a/paddle/fluid/platform/dynload/rccl.h +++ b/paddle/fluid/platform/dynload/rccl.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include // NOLINT + #include "paddle/phi/backends/dynload/rccl.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/rocblas.h b/paddle/fluid/platform/dynload/rocblas.h index 04f4fdd9506da..5cec6fb48798b 100644 --- a/paddle/fluid/platform/dynload/rocblas.h +++ b/paddle/fluid/platform/dynload/rocblas.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include diff --git a/paddle/fluid/platform/dynload/rocm_driver.cc b/paddle/fluid/platform/dynload/rocm_driver.cc index 088129f3f8d02..4fa20c5c4bbb8 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.cc +++ b/paddle/fluid/platform/dynload/rocm_driver.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/rocm_driver.h" + #include "paddle/phi/backends/dynload/rocm_driver.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/rocm_driver.h b/paddle/fluid/platform/dynload/rocm_driver.h index 5a902239fefd4..5c8e18611c40a 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.h +++ b/paddle/fluid/platform/dynload/rocm_driver.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/rocm_driver.h" diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc index 8153877b7bbb8..8d700faac0c14 100644 --- a/paddle/fluid/platform/dynload/tensorrt.cc +++ b/paddle/fluid/platform/dynload/tensorrt.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/tensorrt.h" + #include namespace paddle { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 772a7750fe90d..1106eef455957 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -36,6 +36,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/platform/external_error.pb.h" #endif // PADDLE_WITH_CUDA @@ -77,6 +78,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/cusolver.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include + #include "paddle/phi/backends/dynload/nccl.h" #endif // __APPLE__ #endif // PADDLE_WITH_CUDA @@ -88,6 +90,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/rocblas.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #include // NOLINT + #include "paddle/phi/backends/dynload/rccl.h" #endif // __APPLE__ #endif // PADDLE_WITH_HIP diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index b9e4239299169..771c4853f6f24 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -63,19 +63,22 @@ TEST(ENFORCE, FAILED) { TEST(ENFORCE, NO_ARG_OK) { int a = 2; int b = 2; - PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_EQ tests failed.")); + PADDLE_ENFORCE_EQ( + a, b, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_EQ tests failed.")); // test enforce with extra message. - PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable( - "Some %s wrong in PADDLE_ENFORCE_EQ.", "info")); + PADDLE_ENFORCE_EQ(a, b, + paddle::platform::errors::Unavailable( + "Some %s wrong in PADDLE_ENFORCE_EQ.", "info")); } TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) { int a = 2; bool caught_exception = false; try { - PADDLE_ENFORCE_EQ(a, 1 + 3, paddle::platform::errors::InvalidArgument( - "The result is not equal correct result.")); + PADDLE_ENFORCE_EQ(a, 1 + 3, + paddle::platform::errors::InvalidArgument( + "The result is not equal correct result.")); } catch (paddle::platform::EnforceNotMet& error) { caught_exception = true; std::string ex_msg = error.what(); @@ -89,8 +92,9 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) { int a = 2; bool caught_exception = false; try { - PADDLE_ENFORCE_EQ(a, 1 + 3, paddle::platform::errors::InvalidArgument( - "The result is not equal correct result.")); + PADDLE_ENFORCE_EQ(a, 1 + 3, + paddle::platform::errors::InvalidArgument( + "The result is not equal correct result.")); } catch (paddle::platform::EnforceNotMet& error) { caught_exception = true; std::string ex_msg = error.what(); @@ -102,10 +106,12 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) { } TEST(ENFORCE_NE, OK) { - PADDLE_ENFORCE_NE(1, 2, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_NE tests failed.")); - PADDLE_ENFORCE_NE(1.0, 2UL, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_NE tests failed.")); + PADDLE_ENFORCE_NE( + 1, 2, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_NE tests failed.")); + PADDLE_ENFORCE_NE( + 1.0, 2UL, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_NE tests failed.")); } TEST(ENFORCE_NE, FAIL) { bool caught_exception = false; @@ -125,14 +131,16 @@ TEST(ENFORCE_NE, FAIL) { } TEST(ENFORCE_GT, OK) { - PADDLE_ENFORCE_GT(2, 1, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_GT tests failed.")); + PADDLE_ENFORCE_GT( + 2, 1, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_GT tests failed.")); } TEST(ENFORCE_GT, FAIL) { bool caught_exception = false; try { - PADDLE_ENFORCE_GT(1, 2, paddle::platform::errors::InvalidArgument( - "Expected 1 > 2, but received 1:1 <= 2:2.")); + PADDLE_ENFORCE_GT(1, 2, + paddle::platform::errors::InvalidArgument( + "Expected 1 > 2, but received 1:1 <= 2:2.")); } catch (paddle::platform::EnforceNotMet& error) { caught_exception = true; std::string ex_msg = error.what(); @@ -143,18 +151,22 @@ TEST(ENFORCE_GT, FAIL) { } TEST(ENFORCE_GE, OK) { - PADDLE_ENFORCE_GE(2, 2, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_GE tests failed.")); - PADDLE_ENFORCE_GE(3, 2, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_GE tests failed.")); - PADDLE_ENFORCE_GE(3.21, 2.0, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_GE tests failed.")); + PADDLE_ENFORCE_GE( + 2, 2, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_GE tests failed.")); + PADDLE_ENFORCE_GE( + 3, 2, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_GE tests failed.")); + PADDLE_ENFORCE_GE( + 3.21, 2.0, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_GE tests failed.")); } TEST(ENFORCE_GE, FAIL) { bool caught_exception = false; try { - PADDLE_ENFORCE_GE(1, 2, paddle::platform::errors::InvalidArgument( - "Expected 1 >= 2, but received 1:1 < 2:2.")); + PADDLE_ENFORCE_GE(1, 2, + paddle::platform::errors::InvalidArgument( + "Expected 1 >= 2, but received 1:1 < 2:2.")); } catch (paddle::platform::EnforceNotMet& error) { caught_exception = true; std::string ex_msg = error.what(); @@ -165,22 +177,28 @@ TEST(ENFORCE_GE, FAIL) { } TEST(ENFORCE_LE, OK) { - PADDLE_ENFORCE_LE(1, 1, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LE tests failed.")); - PADDLE_ENFORCE_LE(1UL, 1UL, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LE tests failed.")); - PADDLE_ENFORCE_LE(2, 3, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LE tests failed.")); - PADDLE_ENFORCE_LE(2UL, 3UL, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LE tests failed.")); - PADDLE_ENFORCE_LE(2.0, 3.2, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LE tests failed.")); + PADDLE_ENFORCE_LE( + 1, 1, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed.")); + PADDLE_ENFORCE_LE( + 1UL, 1UL, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed.")); + PADDLE_ENFORCE_LE( + 2, 3, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed.")); + PADDLE_ENFORCE_LE( + 2UL, 3UL, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed.")); + PADDLE_ENFORCE_LE( + 2.0, 3.2, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed.")); } TEST(ENFORCE_LE, FAIL) { bool caught_exception = false; try { - PADDLE_ENFORCE_GT(1, 2, paddle::platform::errors::InvalidArgument( - "Expected 1 > 2, but received 1:1 <= 2:2.")); + PADDLE_ENFORCE_GT(1, 2, + paddle::platform::errors::InvalidArgument( + "Expected 1 > 2, but received 1:1 <= 2:2.")); } catch (paddle::platform::EnforceNotMet& error) { caught_exception = true; std::string ex_msg = error.what(); @@ -191,12 +209,15 @@ TEST(ENFORCE_LE, FAIL) { } TEST(ENFORCE_LT, OK) { - PADDLE_ENFORCE_LT(3, 10, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LT tests failed.")); - PADDLE_ENFORCE_LT(2UL, 3UL, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LT tests failed.")); - PADDLE_ENFORCE_LT(2, 3, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LT tests failed.")); + PADDLE_ENFORCE_LT( + 3, 10, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LT tests failed.")); + PADDLE_ENFORCE_LT( + 2UL, 3UL, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LT tests failed.")); + PADDLE_ENFORCE_LT( + 2, 3, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LT tests failed.")); } TEST(ENFORCE_LT, FAIL) { bool caught_exception = false; @@ -263,16 +284,18 @@ std::ostream& operator<<(std::ostream& os, const Dims& d) { TEST(ENFORCE_USER_DEFINED_CLASS, EQ) { Dims a{{1, 2, 3, 4}}, b{{1, 2, 3, 4}}; - PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_EQ tests failed.")); + PADDLE_ENFORCE_EQ( + a, b, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_EQ tests failed.")); } TEST(ENFORCE_USER_DEFINED_CLASS, NE) { Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}}; bool caught_exception = false; try { - PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_EQ tests failed.")); + PADDLE_ENFORCE_EQ(a, b, + paddle::platform::errors::Unavailable( + "PADDLE_ENFORCE_EQ tests failed.")); } catch (paddle::platform::EnforceNotMet&) { caught_exception = true; } @@ -481,10 +504,12 @@ TEST(enforce, cannot_to_string_type) { "int can be converted to string"); CannotToStringType obj1(3), obj2(4), obj3(3); - PADDLE_ENFORCE_NE(obj1, obj2, paddle::platform::errors::InvalidArgument( - "Object 1 is not equal to Object 2")); - PADDLE_ENFORCE_EQ(obj1, obj3, paddle::platform::errors::InvalidArgument( - "Object 1 is equal to Object 3")); + PADDLE_ENFORCE_NE(obj1, obj2, + paddle::platform::errors::InvalidArgument( + "Object 1 is not equal to Object 2")); + PADDLE_ENFORCE_EQ(obj1, obj3, + paddle::platform::errors::InvalidArgument( + "Object 1 is equal to Object 3")); std::string msg = "Compare obj1 with obj2"; try { diff --git a/paddle/fluid/platform/errors.h b/paddle/fluid/platform/errors.h index 57f5b3a7c9374..758af3e2d9137 100644 --- a/paddle/fluid/platform/errors.h +++ b/paddle/fluid/platform/errors.h @@ -18,5 +18,5 @@ namespace paddle { namespace platform { namespace errors = ::phi::errors; using error = ::phi::ErrorCode; -} -} +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/errors_test.cc b/paddle/fluid/platform/errors_test.cc index 712b67a654c40..8b11c1d2d2492 100644 --- a/paddle/fluid/platform/errors_test.cc +++ b/paddle/fluid/platform/errors_test.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/platform/errors.h" + #include #include "gtest/gtest.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/errors.h" using namespace paddle::platform::errors; // NOLINT diff --git a/paddle/fluid/platform/fast_divmod.h b/paddle/fluid/platform/fast_divmod.h index bef551078b332..f2a150c301216 100644 --- a/paddle/fluid/platform/fast_divmod.h +++ b/paddle/fluid/platform/fast_divmod.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/phi/kernels/funcs/aligned_vector.h" #define INT_BITS 32 diff --git a/paddle/fluid/platform/flags.h b/paddle/fluid/platform/flags.h index b9d78c2e9dc39..0a38d61293978 100644 --- a/paddle/fluid/platform/flags.h +++ b/paddle/fluid/platform/flags.h @@ -18,6 +18,7 @@ #include #include #include + #include "gflags/gflags.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/variant.h" diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index 2c00854e082eb..dc7fdc6b443d9 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -14,6 +14,7 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include #include + #include #include diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc index bbec743d26f3b..45ca4a6f27765 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.cc +++ b/paddle/fluid/platform/gen_comm_id_helper.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include #include #include + #include #include #include // NOLINT diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index 5301dd307590b..bc5bd274bf8a7 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/init.h" + #include "gtest/gtest.h" #include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_MLU diff --git a/paddle/fluid/platform/lock_guard_ptr.h b/paddle/fluid/platform/lock_guard_ptr.h index bff24e74a7070..66d6e446d3f16 100644 --- a/paddle/fluid/platform/lock_guard_ptr.h +++ b/paddle/fluid/platform/lock_guard_ptr.h @@ -14,6 +14,7 @@ #pragma once #include + #include #include // NOLINT namespace paddle { diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 5476d244f6035..382f96e83bfce 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -1061,16 +1061,18 @@ static void SetDstMemoryQuantized( const size_t dst_dims = dst_tz.size(); MKLDNNMemoryFormat dst_fmt; - PADDLE_ENFORCE_LE(dst_dims, 5, platform::errors::InvalidArgument( - "Dst memory for quantization can not have " - "dims > 5. But received dst_dims is %d.", - dst_dims)); + PADDLE_ENFORCE_LE(dst_dims, 5, + platform::errors::InvalidArgument( + "Dst memory for quantization can not have " + "dims > 5. But received dst_dims is %d.", + dst_dims)); dst_fmt = platform::MKLDNNFormatForSize(dst_dims, output_format); - auto tmp_dst_md = platform::MKLDNNMemDesc( - {dst_tz}, paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType()), - dst_fmt); + auto tmp_dst_md = + platform::MKLDNNMemDesc({dst_tz}, + paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType()), + dst_fmt); dst_md.reset(new dnnl::memory::desc(tmp_dst_md)); dst_memory.reset( new dnnl::memory(*dst_md, engine, to_void_cast(output_data))); diff --git a/paddle/fluid/platform/monitor.h b/paddle/fluid/platform/monitor.h index dc9abaf36d825..e7612f6dcb6cd 100644 --- a/paddle/fluid/platform/monitor.h +++ b/paddle/fluid/platform/monitor.h @@ -15,6 +15,7 @@ #pragma once #include + #include #include #include // NOLINT diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc index 36dd7891d5518..694f701b5ad9b 100644 --- a/paddle/fluid/platform/os_info.cc +++ b/paddle/fluid/platform/os_info.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/os_info.h" + #include #include #include diff --git a/paddle/fluid/platform/os_info_test.cc b/paddle/fluid/platform/os_info_test.cc index b3311f1d19e63..149da6ba27aea 100644 --- a/paddle/fluid/platform/os_info_test.cc +++ b/paddle/fluid/platform/os_info_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/platform/os_info.h" + #include + #include "gtest/gtest.h" TEST(ThreadInfo, TestThreadIdUtils) { - using paddle::platform::GetCurrentThreadStdId; - using paddle::platform::GetCurrentThreadId; using paddle::platform::GetAllThreadIds; + using paddle::platform::GetCurrentThreadId; + using paddle::platform::GetCurrentThreadStdId; EXPECT_EQ(std::hash()(std::this_thread::get_id()), GetCurrentThreadId().std_tid); auto ids = GetAllThreadIds(); @@ -26,10 +28,10 @@ TEST(ThreadInfo, TestThreadIdUtils) { } TEST(ThreadInfo, TestThreadNameUtils) { - using paddle::platform::GetCurrentThreadStdId; + using paddle::platform::GetAllThreadNames; using paddle::platform::GetCurrentThreadName; + using paddle::platform::GetCurrentThreadStdId; using paddle::platform::SetCurrentThreadName; - using paddle::platform::GetAllThreadNames; SetCurrentThreadName("MainThread"); EXPECT_FALSE(SetCurrentThreadName("MainThread")); auto names = GetAllThreadNames(); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 75abf36e676d0..c573650f1791f 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/platform/profiler.h" + #include // NOLINT #include #include @@ -20,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/common_event.h" #include "paddle/fluid/platform/profiler/host_event_recorder.h" #include "paddle/fluid/platform/profiler/host_tracer.h" diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index 084bc44dbc78b..ea3111b73613a 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1,14 +1,52 @@ -cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) -cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) +cc_library( + host_tracer + SRCS host_tracer.cc + DEPS enforce) +cc_library( + cuda_tracer + SRCS cuda_tracer.cc cupti_data_process.cc + DEPS workqueue_utils enforce glog) add_subdirectory(mlu) -cc_library(event_node SRCS event_node.cc DEPS enforce) -cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) +cc_library( + event_node + SRCS event_node.cc + DEPS enforce) +cc_library( + profiler_utils + SRCS utils.cc + DEPS enforce glog) add_subdirectory(dump) -cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils) -cc_library(event_bind SRCS event_python.cc DEPS profiler_logger) -cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) -cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind mlu_tracer) -cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger) -cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) -cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind) -cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler) +cc_library( + profiler_logger + SRCS chrometracing_logger.cc dump/serialization_logger.cc + dump/deserialization_reader.cc + DEPS nodetreeproto event_node profiler_utils) +cc_library( + event_bind + SRCS event_python.cc + DEPS profiler_logger) +cc_library( + cpu_utilization + SRCS cpu_utilization.cc + DEPS cpu_info os_info enforce glog) +cc_library( + new_profiler + SRCS profiler.cc + DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind + mlu_tracer) +cc_test( + test_event_node + SRCS test_event_node.cc + DEPS event_node profiler_logger) +cc_test( + test_extra_info + SRCS test_extra_info.cc + DEPS profiler_utils) +cc_test( + test_serialization_logger + SRCS dump/test_serialization_logger.cc + DEPS event_bind) +cc_test( + new_profiler_test + SRCS profiler_test.cc + DEPS new_profiler) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 4ee95a530fb43..f728a820bd73c 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/platform/profiler/chrometracing_logger.h" + #include #include #include #include "glog/logging.h" - #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler/chrometracing_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" #include "paddle/fluid/platform/profiler/utils.h" @@ -304,9 +304,10 @@ void ChromeTracingLogger::HandleTypeKernel( blocks_per_sm = static_cast(kernel_info.grid_x * kernel_info.grid_y * kernel_info.grid_z) / device_property.multiProcessorCount; - warps_per_sm = blocks_per_sm * (kernel_info.block_x * kernel_info.block_y * - kernel_info.block_z) / - threads_per_warp; + warps_per_sm = + blocks_per_sm * + (kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) / + threads_per_warp; occupancy = CalculateEstOccupancy( device_node.DeviceId(), kernel_info.registers_per_thread, kernel_info.static_shared_memory, kernel_info.dynamic_shared_memory, diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 8977ab748c63a..12d98d1ef0c63 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/platform/profiler/output_logger.h" namespace paddle { diff --git a/paddle/fluid/platform/profiler/common_event.h b/paddle/fluid/platform/profiler/common_event.h index cfdc3be110a5b..8fe3b15052306 100644 --- a/paddle/fluid/platform/profiler/common_event.h +++ b/paddle/fluid/platform/profiler/common_event.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later #include "paddle/fluid/platform/profiler/trace_event.h" diff --git a/paddle/fluid/platform/profiler/cpu_utilization.cc b/paddle/fluid/platform/profiler/cpu_utilization.cc index d507153d3f5b4..4319841c8a93b 100644 --- a/paddle/fluid/platform/profiler/cpu_utilization.cc +++ b/paddle/fluid/platform/profiler/cpu_utilization.cc @@ -54,12 +54,13 @@ void CpuUtilization::RecordBeginTimeInfo() { if (stat_file != nullptr) { char temp_str[200]; uint64_t temp_lu; - int retval = fscanf( - stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 - "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, - temp_str, &system_tms_start_.tms_utime, &nice_time_start_, - &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, &irq_start_, - &softirq_start_, &steal_start_, &temp_lu, &temp_lu); + int retval = + fscanf(stat_file, + "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_start_.tms_utime, &nice_time_start_, + &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, + &irq_start_, &softirq_start_, &steal_start_, &temp_lu, &temp_lu); if (retval != 11) { LOG(WARNING) << "Failed to read cpu utilization information at record beginning." @@ -87,12 +88,13 @@ void CpuUtilization::RecordEndTimeInfo() { if (stat_file != nullptr) { char temp_str[200]; uint64_t temp_lu; - int retval = fscanf( - stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 - "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, - temp_str, &system_tms_end_.tms_utime, &nice_time_end_, - &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, - &softirq_end_, &steal_end_, &temp_lu, &temp_lu); + int retval = + fscanf(stat_file, + "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_end_.tms_utime, &nice_time_end_, + &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, + &softirq_end_, &steal_end_, &temp_lu, &temp_lu); if (retval != 11) { LOG(WARNING) diff --git a/paddle/fluid/platform/profiler/cpu_utilization.h b/paddle/fluid/platform/profiler/cpu_utilization.h index 7b05a6302cdb0..aa25ae5a43c10 100644 --- a/paddle/fluid/platform/profiler/cpu_utilization.h +++ b/paddle/fluid/platform/profiler/cpu_utilization.h @@ -15,8 +15,10 @@ #pragma once #include + #include #include + #include "glog/logging.h" #ifdef _MSC_VER #include diff --git a/paddle/fluid/platform/profiler/cuda_tracer.cc b/paddle/fluid/platform/profiler/cuda_tracer.cc index 2d3e354dc271a..9e32f7bbf19ee 100644 --- a/paddle/fluid/platform/profiler/cuda_tracer.cc +++ b/paddle/fluid/platform/profiler/cuda_tracer.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/platform/profiler/cuda_tracer.h" + #include #include + #include "glog/logging.h" #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" #include "paddle/fluid/platform/os_info.h" diff --git a/paddle/fluid/platform/profiler/cuda_tracer.h b/paddle/fluid/platform/profiler/cuda_tracer.h index 20a60521266a2..36c5ab4eb5546 100644 --- a/paddle/fluid/platform/profiler/cuda_tracer.h +++ b/paddle/fluid/platform/profiler/cuda_tracer.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/profiler/tracer_base.h" diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index da12dccb74924..7cb8b597dcdd0 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/platform/profiler/cupti_data_process.h" + #include + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" diff --git a/paddle/fluid/platform/profiler/cupti_data_process.h b/paddle/fluid/platform/profiler/cupti_data_process.h index 01b2e72ade4e2..7b80046473456 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.h +++ b/paddle/fluid/platform/profiler/cupti_data_process.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h" diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index de3411579d3e9..82363fcff6349 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -9,7 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" + #include + #include "paddle/fluid/platform/profiler/extra_info.h" namespace paddle { diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index 73021f4362af5..b8afe2af0e776 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -9,9 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "glog/logging.h" - #include "paddle/fluid/platform/profiler/dump/serialization_logger.h" + +#include "glog/logging.h" #include "paddle/fluid/platform/profiler/event_node.h" #include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/utils.h" diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index d294bfee58c2b..5253ecc505dbb 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -13,26 +13,25 @@ // limitations under the License. #include "gtest/gtest.h" - #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" #include "paddle/fluid/platform/profiler/dump/serialization_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" #include "paddle/fluid/platform/profiler/event_python.h" -using paddle::platform::SerializationLogger; -using paddle::platform::DeserializationReader; -using paddle::platform::NodeTrees; -using paddle::platform::HostTraceEventNode; using paddle::platform::CudaRuntimeTraceEventNode; +using paddle::platform::DeserializationReader; +using paddle::platform::DeviceTraceEvent; using paddle::platform::DeviceTraceEventNode; using paddle::platform::HostTraceEvent; -using paddle::platform::RuntimeTraceEvent; -using paddle::platform::DeviceTraceEvent; -using paddle::platform::TracerEventType; +using paddle::platform::HostTraceEventNode; using paddle::platform::KernelEventInfo; using paddle::platform::MemcpyEventInfo; using paddle::platform::MemsetEventInfo; +using paddle::platform::NodeTrees; using paddle::platform::ProfilerResult; +using paddle::platform::RuntimeTraceEvent; +using paddle::platform::SerializationLogger; +using paddle::platform::TracerEventType; TEST(SerializationLoggerTest, dump_case0) { std::list host_events; diff --git a/paddle/fluid/platform/profiler/event_node.cc b/paddle/fluid/platform/profiler/event_node.cc index b909fb5f25aa7..e1af63ad8909c 100644 --- a/paddle/fluid/platform/profiler/event_node.cc +++ b/paddle/fluid/platform/profiler/event_node.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler/event_node.h" #include + #include #include #include diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index 5c42c8e8bf61e..abde62c6b1444 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler/event_python.h" + #include "paddle/fluid/platform/profiler/chrometracing_logger.h" #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" #include "paddle/fluid/platform/profiler/dump/serialization_logger.h" diff --git a/paddle/fluid/platform/profiler/event_tracing.h b/paddle/fluid/platform/profiler/event_tracing.h index fcaba9a43ca93..fd81c15f92ad7 100644 --- a/paddle/fluid/platform/profiler/event_tracing.h +++ b/paddle/fluid/platform/profiler/event_tracing.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/profiler/trace_event.h" @@ -70,10 +71,11 @@ class RecordEvent { * @param level: Used to filter events, works like glog VLOG(level). * RecordEvent will works if HostTraceLevel >= level. */ - explicit RecordEvent(const char* name, const TracerEventType type = - TracerEventType::UserDefined, - uint32_t level = kDefaultTraceLevel, - const EventRole role = EventRole::kOrdinary); + explicit RecordEvent( + const char* name, + const TracerEventType type = TracerEventType::UserDefined, + uint32_t level = kDefaultTraceLevel, + const EventRole role = EventRole::kOrdinary); RecordEvent(const std::string& name, const std::string& attr, const TracerEventType type = TracerEventType::UserDefined, diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h index afd4135246556..1359c3b85a096 100644 --- a/paddle/fluid/platform/profiler/host_event_recorder.h +++ b/paddle/fluid/platform/profiler/host_event_recorder.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/os_info.h" @@ -58,7 +59,7 @@ class EventContainer { public: // Record an event template - void Record(Args &&... args) { + void Record(Args &&...args) { DoRecord(ContainsStdString(), std::forward(args)...); } @@ -112,7 +113,7 @@ class EventContainer { // Record an event with string arguments template - void DoRecord(std::true_type, Args &&... args) { + void DoRecord(std::true_type, Args &&...args) { auto *storage = GetEventStorage(); std::function allocator = [this](size_t size) { return GetStrBufFromArena(size); @@ -122,7 +123,7 @@ class EventContainer { // Record an event without any string argument template - void DoRecord(std::false_type, Args &&... args) { + void DoRecord(std::false_type, Args &&...args) { auto *storage = GetEventStorage(); new (storage) EventType(std::forward(args)...); } @@ -199,7 +200,7 @@ class ThreadEventRecorder { public: // Forward call to EventContainer::Record template - void RecordEvent(Args &&... args) { + void RecordEvent(Args &&...args) { base_evt_cntr_.Record(std::forward(args)...); } @@ -237,7 +238,7 @@ class HostEventRecorder { // Do your best to avoid using 'std::string' as the argument type. // It will cause deep-copy to harm performance. template - void RecordEvent(Args &&... args) { + void RecordEvent(Args &&...args) { GetThreadLocalRecorder()->RecordEvent(std::forward(args)...); } diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index b7eb53331b793..8a36a3a8bab44 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/platform/profiler/host_tracer.h" + #include "glog/logging.h" #include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/common_event.h" diff --git a/paddle/fluid/platform/profiler/mlu/CMakeLists.txt b/paddle/fluid/platform/profiler/mlu/CMakeLists.txt index 01b3757ea6912..d510edb0457db 100644 --- a/paddle/fluid/platform/profiler/mlu/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/mlu/CMakeLists.txt @@ -2,4 +2,7 @@ if(WITH_MLU) set(MLU_INFO mlu_info) endif() -cc_library(mlu_tracer SRCS mlu_tracer.cc cnpapi_data_process.cc DEPS workqueue_utils enforce glog ${MLU_INFO}) +cc_library( + mlu_tracer + SRCS mlu_tracer.cc cnpapi_data_process.cc + DEPS workqueue_utils enforce glog ${MLU_INFO}) diff --git a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc index 36abf77279d06..7afdb5eb2a352 100644 --- a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc +++ b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h" + #include + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" diff --git a/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc b/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc index 2d719a8bbfdcb..bbaafa3faa60a 100644 --- a/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc +++ b/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h" + #include #include + #include "glog/logging.h" #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" #include "paddle/fluid/platform/os_info.h" diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index a417eda1509e5..8bcf856c01ab6 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/platform/profiler/profiler.h" + #include "glog/logging.h" #ifdef PADDLE_WITH_CUDA #include diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index ea346a4fb748d..65a3bcc02d857 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/profiler/cpu_utilization.h" #include "paddle/fluid/platform/profiler/event_node.h" diff --git a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc index f2c867ffff217..1f1fbcb71ecd5 100644 --- a/paddle/fluid/platform/profiler/profiler_test.cc +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -14,6 +14,7 @@ #include #include + #include "glog/logging.h" #include "gtest/gtest.h" #ifdef PADDLE_WITH_CUDA @@ -27,11 +28,11 @@ #include "paddle/fluid/platform/profiler/profiler.h" TEST(ProfilerTest, TestHostTracer) { - using paddle::platform::ProfilerOptions; using paddle::platform::Profiler; + using paddle::platform::ProfilerOptions; + using paddle::platform::ProfilerResult; using paddle::platform::RecordInstantEvent; using paddle::platform::TracerEventType; - using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 2; options.trace_switch = 3; @@ -58,8 +59,8 @@ TEST(ProfilerTest, TestHostTracer) { } TEST(ProfilerTest, TestCudaTracer) { - using paddle::platform::ProfilerOptions; using paddle::platform::Profiler; + using paddle::platform::ProfilerOptions; using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 0; diff --git a/paddle/fluid/platform/profiler/test_event_node.cc b/paddle/fluid/platform/profiler/test_event_node.cc index b8d1306ad076c..23ad917b57d0e 100644 --- a/paddle/fluid/platform/profiler/test_event_node.cc +++ b/paddle/fluid/platform/profiler/test_event_node.cc @@ -13,22 +13,21 @@ // limitations under the License. #include "gtest/gtest.h" - #include "paddle/fluid/platform/profiler/chrometracing_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" using paddle::platform::ChromeTracingLogger; -using paddle::platform::NodeTrees; -using paddle::platform::HostTraceEventNode; using paddle::platform::CudaRuntimeTraceEventNode; +using paddle::platform::DeviceTraceEvent; using paddle::platform::DeviceTraceEventNode; using paddle::platform::HostTraceEvent; -using paddle::platform::RuntimeTraceEvent; -using paddle::platform::DeviceTraceEvent; -using paddle::platform::TracerEventType; +using paddle::platform::HostTraceEventNode; using paddle::platform::KernelEventInfo; using paddle::platform::MemcpyEventInfo; using paddle::platform::MemsetEventInfo; +using paddle::platform::NodeTrees; +using paddle::platform::RuntimeTraceEvent; +using paddle::platform::TracerEventType; TEST(NodeTreesTest, LogMe_case0) { std::list host_events; std::list runtime_events; @@ -194,8 +193,10 @@ TEST(NodeTreesTest, HandleTrees_case0) { } std::function host_event_node_handle( [&](HostTraceEventNode* a) { logger.LogHostTraceEventNode(*a); }); - std::function runtime_event_node_handle([&]( - CudaRuntimeTraceEventNode* a) { logger.LogRuntimeTraceEventNode(*a); }); + std::function runtime_event_node_handle( + [&](CudaRuntimeTraceEventNode* a) { + logger.LogRuntimeTraceEventNode(*a); + }); std::function device_event_node_handle( [&](DeviceTraceEventNode* a) { logger.LogDeviceTraceEventNode(*a); }); tree.HandleTrees(host_event_node_handle, runtime_event_node_handle, diff --git a/paddle/fluid/platform/profiler/trace_event_collector.h b/paddle/fluid/platform/profiler/trace_event_collector.h index 5f2bc9dc90db9..d1593bc1bfcd7 100644 --- a/paddle/fluid/platform/profiler/trace_event_collector.h +++ b/paddle/fluid/platform/profiler/trace_event_collector.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/platform/profiler/trace_event.h" namespace paddle { diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index 06d1636c4617c..433fd0b825a11 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" @@ -26,8 +27,9 @@ template std::string string_format(const std::string& format, Args... args) { int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) + 1; // Extra space for '\0' - PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal( - "Error during profiler data formatting.")); + PADDLE_ENFORCE_GE( + size_s, 0, + platform::errors::Fatal("Error during profiler data formatting.")); auto size = static_cast(size_s); auto buf = std::make_unique(size); std::snprintf(buf.get(), size, format.c_str(), args...); diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index f64e05504aa3f..ae856044f8fc5 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -390,8 +390,8 @@ void SetEvent(bool merge_thread, const Event &analyze_event, index++; } if (split_pos == -1 && !main_thread_event_name.count(rit->name())) { - event_name = "thread" + std::to_string(rit->thread_id()) + "::" + - rit->name(); + event_name = "thread" + std::to_string(rit->thread_id()) + + "::" + rit->name(); } else { if (!main_thread_event_name.count(rit->name())) { event_name = diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index e9f84a49246f7..18d4b4dc83478 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -36,24 +36,24 @@ TEST(Event, CpuElapsedTime) { TEST(RecordEvent, RecordEvent) { using paddle::platform::Event; + using paddle::platform::EventRole; + using paddle::platform::EventSortingKey; using paddle::platform::EventType; - using paddle::platform::RecordEvent; - using paddle::platform::PushEvent; using paddle::platform::PopEvent; using paddle::platform::ProfilerState; - using paddle::platform::EventSortingKey; - using paddle::platform::EventRole; + using paddle::platform::PushEvent; + using paddle::platform::RecordEvent; ProfilerState state = ProfilerState::kCPU; EnableProfiler(state); /* Usage 1: - * PushEvent(evt_name); - * ... - * code to be analyzed - * ... - * PopEvent(evt_name); - */ + * PushEvent(evt_name); + * ... + * code to be analyzed + * ... + * PopEvent(evt_name); + */ LOG(INFO) << "Usage 1: PushEvent & PopEvent"; for (int loop = 0; loop < 3; ++loop) { for (int i = 1; i < 5; ++i) { diff --git a/paddle/fluid/platform/resource_pool.h b/paddle/fluid/platform/resource_pool.h index f01d006d5b273..737001a50abbf 100644 --- a/paddle/fluid/platform/resource_pool.h +++ b/paddle/fluid/platform/resource_pool.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" diff --git a/paddle/fluid/platform/stream/CMakeLists.txt b/paddle/fluid/platform/stream/CMakeLists.txt index 6a825e9077c0a..25d2874ca04d2 100644 --- a/paddle/fluid/platform/stream/CMakeLists.txt +++ b/paddle/fluid/platform/stream/CMakeLists.txt @@ -1,3 +1,6 @@ -IF(WITH_GPU OR WITH_ROCM) - cc_library(cuda_stream SRCS cuda_stream.cc DEPS enforce boost eigen3 ${MKLDNN_CTX_DEPS}) -ENDIF() +if(WITH_GPU OR WITH_ROCM) + cc_library( + cuda_stream + SRCS cuda_stream.cc + DEPS enforce boost eigen3 ${MKLDNN_CTX_DEPS}) +endif() diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index e3e735d03aba1..d7f60e4019d2e 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/stream/cuda_stream.h" + #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 6fa326d57bc67..bb9a405798b63 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/platform/stream_callback_manager.h" + #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h index 6f714a677033b..32c759d01026c 100644 --- a/paddle/fluid/platform/transform.h +++ b/paddle/fluid/platform/transform.h @@ -25,6 +25,7 @@ limitations under the License. */ #if defined(__NVCC__) || defined(__HIPCC__) #include #include + #include "paddle/fluid/platform/details/cuda_transform_iterator_cast.h" #endif diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu index 2e7b8b402f69a..1caa2e8770772 100644 --- a/paddle/fluid/platform/transform_test.cu +++ b/paddle/fluid/platform/transform_test.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" @@ -38,10 +39,10 @@ class Multiply { using paddle::memory::Alloc; using paddle::memory::Copy; -using paddle::platform::CPUPlace; -using paddle::platform::CUDAPlace; using paddle::platform::CPUDeviceContext; +using paddle::platform::CPUPlace; using paddle::platform::CUDADeviceContext; +using paddle::platform::CUDAPlace; using paddle::platform::Transform; diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 90a86aaf31f26..bf74d1184322c 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,46 +1,82 @@ -set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_wrapper metrics prune - feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool - analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context - gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator - cost_model cuda_graph_with_memory_pool fleet_executor global_utils phi_utils tcp_store new_profiler) - -if (WITH_PSCORE) +set(PYBIND_DEPS + init + pybind + python + proto_desc + memory + executor + fleet_wrapper + box_wrapper + metrics + prune + feed_fetch_method + pass + generate_pass + pass_builder + parallel_executor + profiler + layer + tracer + engine + scope_pool + analysis_predictor + imperative_profiler + imperative_flag + save_load_util + dlpack_tensor + device_context + gloo_wrapper + infer_io_utils + heter_wrapper + generator + op_version_registry + ps_gpu_wrapper + custom_operator + cost_model + cuda_graph_with_memory_pool + fleet_executor + global_utils + phi_utils + tcp_store + new_profiler) + +if(WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) set(PYBIND_DEPS ${PYBIND_DEPS} graph_py_service) - if (WITH_HETERPS) + if(WITH_HETERPS) set(PYBIND_DEPS ${PYBIND_DEPS} graph_gpu_wrapper) endif() endif() -if (WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM) set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard) endif() -if (WITH_GPU) +if(WITH_GPU) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_profiler) endif() -if (WITH_IPU) +if(WITH_IPU) set(PYBIND_DEPS ${PYBIND_DEPS} ipu_info) endif() -if (WITH_NCCL OR WITH_RCCL) +if(WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) endif() -if (WITH_XPU_BKCL) +if(WITH_XPU_BKCL) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) set(PYBIND_DEPS ${PYBIND_DEPS} bkcl_context) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() -if (WITH_ASCEND_CL) +if(WITH_ASCEND_CL) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) set(PYBIND_DEPS ${PYBIND_DEPS} hccl_context) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() -if (WITH_CNCL) +if(WITH_CNCL) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) set(PYBIND_DEPS ${PYBIND_DEPS} cncl_context) endif() @@ -48,10 +84,10 @@ endif() if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator) - if (WITH_GPU) + if(WITH_GPU) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_ipc_allocator) endif() - if (WITH_NCCL OR WITH_RCCL) + if(WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() @@ -63,45 +99,45 @@ if(WITH_PYTHON) endif() set(PYBIND_SRCS - pybind.cc - exception.cc - protobuf.cc - const_value.cc - global_value_getter_setter.cc - reader_py.cc - fleet_wrapper_py.cc - heter_wrapper_py.cc - ps_gpu_wrapper_py.cc - gloo_wrapper_py.cc - box_helper_py.cc - metrics_py.cc - data_set_py.cc - imperative.cc - ir.cc - bind_cost_model.cc - bind_fleet_executor.cc - inference_api.cc - compatible.cc - io.cc - generator_py.cc - communication.cc - cuda_streams_py.cc) + pybind.cc + exception.cc + protobuf.cc + const_value.cc + global_value_getter_setter.cc + reader_py.cc + fleet_wrapper_py.cc + heter_wrapper_py.cc + ps_gpu_wrapper_py.cc + gloo_wrapper_py.cc + box_helper_py.cc + metrics_py.cc + data_set_py.cc + imperative.cc + ir.cc + bind_cost_model.cc + bind_fleet_executor.cc + inference_api.cc + compatible.cc + io.cc + generator_py.cc + communication.cc + cuda_streams_py.cc) if(NOT ON_INFER) - set (PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) - if (WITH_NCCL) - set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) - if (WITH_PSCORE) - set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter) + set(PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) + if(WITH_NCCL) + set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) + if(WITH_PSCORE) + set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter) endif() endif() - if (WITH_GLOO) - set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo) + if(WITH_GLOO) + set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo) endif() if(WITH_ASCEND_CL) - set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_hccl) - if (WITH_PSCORE) - set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter) + set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_hccl) + if(WITH_PSCORE) + set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter) endif() endif() set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) @@ -119,45 +155,69 @@ if(WITH_GLOO) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) endif(WITH_GLOO) -if (WITH_CRYPTO) +if(WITH_CRYPTO) set(PYBIND_DEPS ${PYBIND_DEPS} paddle_crypto) set(PYBIND_SRCS ${PYBIND_SRCS} crypto.cc) -endif (WITH_CRYPTO) - -if (WITH_PSLIB) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) - set(DISTRIBUTE_COMPILE_FLAGS - "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") +endif(WITH_CRYPTO) + +if(WITH_PSLIB) + set(DISTRIBUTE_COMPILE_FLAGS + "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result" + ) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") endif() - set_source_files_properties(heter_wrapper_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + heter_wrapper_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) endif(WITH_PSLIB) -if (WITH_PSCORE) - if (WITH_ARM_BRPC) - set(DISTRIBUTE_COMPILE_FLAGS "-faligned-new -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") +if(WITH_PSCORE) + if(WITH_ARM_BRPC) + set(DISTRIBUTE_COMPILE_FLAGS + "-faligned-new -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result" + ) else() - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") + set(DISTRIBUTE_COMPILE_FLAGS + "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result" + ) endif() - set_source_files_properties(fleet_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + fleet_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) list(APPEND PYBIND_DEPS fleet communicator index_wrapper index_sampler) list(APPEND PYBIND_SRCS fleet_py.cc) endif() -if (WITH_NCCL OR WITH_RCCL) +if(WITH_NCCL OR WITH_RCCL) list(APPEND PYBIND_SRCS nccl_wrapper_py.cc) endif() if(WITH_PYTHON) # generate op pybind functions automatically for dygraph. - if (WITH_ASCEND_CL) - set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag ascend_wrapper) + if(WITH_ASCEND_CL) + set(OP_FUNCTION_GENERETOR_DEPS + pybind + proto_desc + executor + layer + tracer + engine + imperative_profiler + imperative_flag + ascend_wrapper) else() - set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag) + set(OP_FUNCTION_GENERETOR_DEPS + pybind + proto_desc + executor + layer + tracer + engine + imperative_profiler + imperative_flag) endif() list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB}) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS}) - if (WITH_NCCL OR WITH_RCCL) + if(WITH_NCCL OR WITH_RCCL) list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context) endif() @@ -176,13 +236,15 @@ if(WITH_PYTHON) add_executable(op_function_generator op_function_generator.cc) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) add_executable(eager_op_function_generator eager_op_function_generator.cc) - target_link_libraries(eager_op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) + target_link_libraries(eager_op_function_generator + ${OP_FUNCTION_GENERETOR_DEPS}) if(NOT WIN32) add_executable(kernel_signature_generator kernel_signature_generator.cc) - target_link_libraries(kernel_signature_generator ${OP_FUNCTION_GENERETOR_DEPS}) + target_link_libraries(kernel_signature_generator + ${OP_FUNCTION_GENERETOR_DEPS}) endif() - get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(op_function_generator ${os_dependency_modules}) target_link_libraries(eager_op_function_generator ${os_dependency_modules}) if(WITH_ROCM) @@ -193,11 +255,13 @@ if(WITH_PYTHON) set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h) set(tmp_impl_file ${impl_file}.tmp) - set(eager_impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function_impl.h) + set(eager_impl_file + ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function_impl.h) set(tmp_eager_impl_file ${eager_impl_file}.tmp) set(OP_IMPL_DEPS op_function_generator) - set(EAGER_OP_IMPL_DEPS eager_op_function_generator eager_final_state_python_c_codegen) + set(EAGER_OP_IMPL_DEPS eager_op_function_generator + eager_final_state_python_c_codegen) if(WIN32) if("${CMAKE_GENERATOR}" STREQUAL "Ninja") @@ -206,81 +270,103 @@ if(WITH_PYTHON) set(op_impl_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") endif() - file(WRITE ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat "" - "set build_times=1\n" - ":retry\n" - "ECHO op_function_generator run %build_times% time\n" - "taskkill /f /im op_function_generator.exe 2>NUL\n" - "${op_impl_path}/op_function_generator.exe ${tmp_impl_file}\n" - "if %ERRORLEVEL% NEQ 0 (\n" - " set /a build_times=%build_times%+1\n" - " if %build_times% GEQ 10 (\n" - " exit /b 1\n" - " ) else (\n" - " goto :retry\n" - " )\n" - ")\n" - "exit /b 0") - - file(WRITE ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat "" - "set build_times=1\n" - ":retry\n" - "ECHO eager_op_function_generator run %build_times% time\n" - "taskkill /f /im eager_op_function_generator.exe 2>NUL\n" - "${op_impl_path}/eager_op_function_generator.exe ${tmp_eager_impl_file}\n" - "if %ERRORLEVEL% NEQ 0 (\n" - " set /a build_times=%build_times%+1\n" - " if %build_times% GEQ 10 (\n" - " exit /b 1\n" - " ) else (\n" - " goto :retry\n" - " )\n" - ")\n" - "exit /b 0") + file( + WRITE + ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat + "" + "set build_times=1\n" + ":retry\n" + "ECHO op_function_generator run %build_times% time\n" + "taskkill /f /im op_function_generator.exe 2>NUL\n" + "${op_impl_path}/op_function_generator.exe ${tmp_impl_file}\n" + "if %ERRORLEVEL% NEQ 0 (\n" + " set /a build_times=%build_times%+1\n" + " if %build_times% GEQ 10 (\n" + " exit /b 1\n" + " ) else (\n" + " goto :retry\n" + " )\n" + ")\n" + "exit /b 0") + + file( + WRITE + ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat + "" + "set build_times=1\n" + ":retry\n" + "ECHO eager_op_function_generator run %build_times% time\n" + "taskkill /f /im eager_op_function_generator.exe 2>NUL\n" + "${op_impl_path}/eager_op_function_generator.exe ${tmp_eager_impl_file}\n" + "if %ERRORLEVEL% NEQ 0 (\n" + " set /a build_times=%build_times%+1\n" + " if %build_times% GEQ 10 (\n" + " exit /b 1\n" + " ) else (\n" + " goto :retry\n" + " )\n" + ")\n" + "exit /b 0") if(${CBLAS_PROVIDER} STREQUAL MKLML) - ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/libiomp5md.dll - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${op_impl_path} + add_custom_command( + OUTPUT ${op_impl_path}/libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} + ${op_impl_path} DEPENDS mklml) list(APPEND OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll) list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll) else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) - ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/openblas.dll + add_custom_command( + OUTPUT ${op_impl_path}/openblas.dll COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${op_impl_path} DEPENDS extern_openblas) list(APPEND OP_IMPL_DEPS ${op_impl_path}/openblas.dll) list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/openblas.dll) endif() if(WITH_MKLDNN) - ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/mkldnn.dll + add_custom_command( + OUTPUT ${op_impl_path}/mkldnn.dll COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${op_impl_path} DEPENDS mkldnn) - list(APPEND OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll) - list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll) + list(APPEND OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll) + list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll) endif() if(WITH_ONNXRUNTIME) - ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR} + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_SHARED_LIB} + ${CMAKE_CURRENT_BINARY_DIR} DEPENDS paddle2onnx) list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll) - list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll) + list(APPEND EAGER_OP_IMPL_DEPS + ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll) - ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime.dll - COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR} + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime.dll + COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SHARED_LIB} + ${CMAKE_CURRENT_BINARY_DIR} DEPENDS onnxruntime) list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime.dll) - list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime.dll) + list(APPEND EAGER_OP_IMPL_DEPS + ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime.dll) endif() - add_custom_command(OUTPUT ${impl_file} - COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} + add_custom_command( + OUTPUT ${impl_file} + COMMAND + ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} + ${impl_file} COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" DEPENDS ${OP_IMPL_DEPS}) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - add_custom_command(OUTPUT ${eager_impl_file} - COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} ${eager_impl_file} + add_custom_command( + OUTPUT ${eager_impl_file} + COMMAND + ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} + ${eager_impl_file} COMMENT "copy_if_different ${tmp_eager_impl_file} to ${eager_impl_file}" DEPENDS ${EAGER_OP_IMPL_DEPS}) endif() @@ -290,79 +376,120 @@ if(WITH_PYTHON) # LD_LIBRARY_PATH. This is different with Windows platformm, which search # *.dll in current directory automatically. if(WITH_ONNXRUNTIME) - if (APPLE) - set(PADDLE2ONNX_PYBIND_OUT ${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.dylib) - set(ONNXRUNTIME_PYBIND_OUT ${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.dylib) + if(APPLE) + set(PADDLE2ONNX_PYBIND_OUT + ${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.dylib) + set(ONNXRUNTIME_PYBIND_OUT + ${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.dylib) else() - set(PADDLE2ONNX_PYBIND_OUT ${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.so) - set(ONNXRUNTIME_PYBIND_OUT ${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.so) + set(PADDLE2ONNX_PYBIND_OUT + ${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.so) + set(ONNXRUNTIME_PYBIND_OUT + ${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.so) endif() - ADD_CUSTOM_COMMAND(OUTPUT ${PADDLE2ONNX_PYBIND_OUT} - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_LIB} ${CMAKE_CURRENT_BINARY_DIR} + add_custom_command( + OUTPUT ${PADDLE2ONNX_PYBIND_OUT} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_LIB} + ${CMAKE_CURRENT_BINARY_DIR} DEPENDS paddle2onnx) list(APPEND OP_IMPL_DEPS ${PADDLE2ONNX_PYBIND_OUT}) list(APPEND EAGER_OP_IMPL_DEPS ${PADDLE2ONNX_PYBIND_OUT}) - ADD_CUSTOM_COMMAND(OUTPUT ${ONNXRUNTIME_PYBIND_OUT} - COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_LIB} ${CMAKE_CURRENT_BINARY_DIR} + add_custom_command( + OUTPUT ${ONNXRUNTIME_PYBIND_OUT} + COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_LIB} + ${CMAKE_CURRENT_BINARY_DIR} DEPENDS onnxruntime) list(APPEND OP_IMPL_DEPS ${ONNXRUNTIME_PYBIND_OUT}) list(APPEND EAGER_OP_IMPL_DEPS ${ONNXRUNTIME_PYBIND_OUT}) endif() if(WITH_MKLML) - ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${CMAKE_CURRENT_BINARY_DIR} + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} + ${CMAKE_CURRENT_BINARY_DIR} DEPENDS mklml) list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so) list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so) endif() if(WITH_MKLDNN) - ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0 - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR} + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0 + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} + ${CMAKE_CURRENT_BINARY_DIR} DEPENDS mkldnn) list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0) list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0) endif() - add_custom_command(OUTPUT ${impl_file} - COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." - "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" - "${tmp_impl_file}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} - COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" - DEPENDS ${OP_IMPL_DEPS} - VERBATIM) + add_custom_command( + OUTPUT ${impl_file} + COMMAND + ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." + "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" "${tmp_impl_file}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} + ${impl_file} + COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" + DEPENDS ${OP_IMPL_DEPS} + VERBATIM) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - add_custom_command(OUTPUT ${eager_impl_file} - COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." - "${CMAKE_CURRENT_BINARY_DIR}/eager_op_function_generator" - "${tmp_eager_impl_file}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} ${eager_impl_file} - COMMENT "copy_if_different ${tmp_eager_impl_file} to ${eager_impl_file}" - DEPENDS ${EAGER_OP_IMPL_DEPS} - VERBATIM) - endif() + add_custom_command( + OUTPUT ${eager_impl_file} + COMMAND + ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." + "${CMAKE_CURRENT_BINARY_DIR}/eager_op_function_generator" + "${tmp_eager_impl_file}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} + ${eager_impl_file} + COMMENT "copy_if_different ${tmp_eager_impl_file} to ${eager_impl_file}" + DEPENDS ${EAGER_OP_IMPL_DEPS} + VERBATIM) + endif() endif(WIN32) add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file}) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file}) + add_custom_target(eager_op_function_generator_cmd ALL + DEPENDS ${eager_impl_file}) endif() - list(APPEND PYBIND_DEPS interpretercore standalone_executor staticgraph_executor_statistics) - cc_library(op_function_common SRCS op_function_common.cc DEPS ${PYBIND_DEPS}) + list(APPEND PYBIND_DEPS interpretercore standalone_executor + staticgraph_executor_statistics) + cc_library( + op_function_common + SRCS op_function_common.cc + DEPS ${PYBIND_DEPS}) list(APPEND PYBIND_DEPS op_function_common) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - cc_library(paddle_eager - SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc eager_py_layer.cc - DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node py_layer_node global_utils utils python custom_operator custom_operator_node) + cc_library( + paddle_eager + SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc + eager_utils.cc eager_py_layer.cc + DEPS eager_api + autograd_meta + backward + grad_node_info + phi + op_function_common + final_dygraph_function + final_dygraph_node + dygraph_function + dygraph_node + accumulation_node + py_layer_node + global_utils + utils + python + custom_operator + custom_operator_node) add_dependencies(paddle_eager eager_codegen) add_dependencies(paddle_eager eager_op_function_generator_cmd) list(APPEND PYBIND_DEPS paddle_eager) endif() - cc_library(paddle_pybind SHARED + cc_library( + paddle_pybind SHARED SRCS ${PYBIND_SRCS} DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${GLOB_DEV_LIB}) @@ -374,7 +501,7 @@ if(WITH_PYTHON) target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB}) endif() - get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(paddle_pybind ${os_dependency_modules}) add_dependencies(paddle_pybind op_function_generator_cmd) endif(WITH_PYTHON) diff --git a/paddle/fluid/pybind/ascend_wrapper_py.cc b/paddle/fluid/pybind/ascend_wrapper_py.cc index fdf3a12a81fb2..8c1eb2c1b9003 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.cc +++ b/paddle/fluid/pybind/ascend_wrapper_py.cc @@ -26,11 +26,13 @@ limitations under the License. */ #include #include #include + #include #include #include #include #include + #include "paddle/fluid/framework/fleet/ascend_wrapper.h" #include "paddle/fluid/platform/device/npu/ascend_npu_info.h" #include "paddle/fluid/platform/enforce.h" @@ -78,8 +80,9 @@ ge::Status ge_initialize( py::gil_scoped_release release; auto init_options = convert_map(options); ge::Status res = ge::GEInitialize(init_options); - PADDLE_ENFORCE_EQ(res, ge::SUCCESS, platform::errors::Fatal( - "ge initialize not success:%d", res)); + PADDLE_ENFORCE_EQ( + res, ge::SUCCESS, + platform::errors::Fatal("ge initialize not success:%d", res)); py::gil_scoped_acquire acquire; return res; } @@ -253,7 +256,7 @@ void BindAscendGraph(py::module *m) { return std::unique_ptr( new ge::Session(convert_map(options))); })) - .def("add_graph", (ge::Status (Session::*)(uint32_t, const Graph &)) & + .def("add_graph", (ge::Status(Session::*)(uint32_t, const Graph &)) & Session::AddGraph) .def("add_graph", [](Session &ss, uint32_t index, const Graph &graph, @@ -261,14 +264,15 @@ void BindAscendGraph(py::module *m) { return ss.AddGraph(index, graph, convert_map(options)); }) .def("remove_graph", &Session::RemoveGraph) - .def("run_graph", - [](Session &ss, uint32_t graphId, - const std::vector &inputs) -> py::tuple { - std::vector outputs; - ge::Status res = ss.RunGraph(graphId, inputs, outputs); - return py::make_tuple(outputs, res); - }, - py::call_guard()) + .def( + "run_graph", + [](Session &ss, uint32_t graphId, + const std::vector &inputs) -> py::tuple { + std::vector outputs; + ge::Status res = ss.RunGraph(graphId, inputs, outputs); + return py::make_tuple(outputs, res); + }, + py::call_guard()) .def("build_graph", &Session::BuildGraph) .def("run_graph_async", &Session::RunGraphAsync) #ifdef PADDLE_WITH_ASCEND_STRING @@ -385,7 +389,7 @@ void BindAscendGraph(py::module *m) { }) #ifdef PADDLE_WITH_ASCEND_STRING .def("get_input_desc", - (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetInputDesc) + (TensorDesc(Operator::*)(uint32_t) const) & Operator::GetInputDesc) .def("get_input_desc", [](Operator &op, const std::string &name) { return op.GetInputDescByName(name.c_str()); @@ -420,7 +424,7 @@ void BindAscendGraph(py::module *m) { return op.GetOutputDescByName(name.c_str()); }) .def("get_output_desc", - (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetOutputDesc) + (TensorDesc(Operator::*)(uint32_t) const) & Operator::GetOutputDesc) .def("update_output_desc", static_cast(&Operator::UpdateOutputDesc)) @@ -779,19 +783,18 @@ void BindAscendGraph(py::module *m) { .def("get_tensor_desc", &Tensor::GetTensorDesc) // .def("set_data", (graphStatus(Tensor::*)(std::vector &&)) & // Tensor::SetData) - .def("set_data", (graphStatus (Tensor::*)(const std::vector &)) & + .def("set_data", (graphStatus(Tensor::*)(const std::vector &)) & Tensor::SetData) .def("set_data", - (graphStatus (Tensor::*)(const uint8_t *, size_t)) & Tensor::SetData) + (graphStatus(Tensor::*)(const uint8_t *, size_t)) & Tensor::SetData) #ifdef PADDLE_WITH_ASCEND_STRING - .def("set_data", - (graphStatus (Tensor::*)(const char *)) & Tensor::SetData) + .def("set_data", (graphStatus(Tensor::*)(const char *)) & Tensor::SetData) #else .def("set_data", (graphStatus (Tensor::*)(const std::string &)) & Tensor::SetData) #endif .def("set_data", - (graphStatus (Tensor::*)(const std::vector &)) & + (graphStatus(Tensor::*)(const std::vector &)) & Tensor::SetData) .def("get_data", @@ -813,8 +816,9 @@ void BindAscendGraph(py::module *m) { .def(py::init(), py::arg("shape"), py::arg("format") = FORMAT_ND, py::arg("dt") = DT_FLOAT) .def(py::init()) - .def("update", (void (TensorDesc::*)(const Shape &, Format, DataType)) & - TensorDesc::Update, + .def("update", + (void(TensorDesc::*)(const Shape &, Format, DataType)) & + TensorDesc::Update, py::arg("shape"), py::arg("format") = FORMAT_ND, py::arg("dt") = DT_FLOAT) .def("set_shape", &TensorDesc::SetShape) diff --git a/paddle/fluid/pybind/bind_cost_model.cc b/paddle/fluid/pybind/bind_cost_model.cc index a4a40f1fd02c9..ef2fe0dd3d446 100644 --- a/paddle/fluid/pybind/bind_cost_model.cc +++ b/paddle/fluid/pybind/bind_cost_model.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/pybind/bind_cost_model.h" #include + #include "paddle/fluid/framework/ir/cost_model.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 8491d1e224930..6bd032037443e 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -13,10 +13,13 @@ // limitations under the License. #include "paddle/fluid/pybind/bind_fleet_executor.h" + #include #include + #include #include + #include "paddle/fluid/distributed/fleet_executor/dist_model.h" #include "paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" @@ -62,13 +65,13 @@ struct npy_format_descriptor { namespace paddle { namespace pybind { -using paddle::distributed::FleetExecutor; -using paddle::distributed::TaskNode; -using paddle::distributed::DistModelConfig; using paddle::distributed::DistModel; +using paddle::distributed::DistModelConfig; using paddle::distributed::DistModelDataBuf; -using paddle::distributed::DistModelTensor; using paddle::distributed::DistModelDataType; +using paddle::distributed::DistModelTensor; +using paddle::distributed::FleetExecutor; +using paddle::distributed::TaskNode; using paddle::framework::OpDesc; using paddle::framework::ProgramDesc; @@ -217,33 +220,34 @@ void BindFleetExecutor(py::module* m) { .def("reset", &DistModelDataBufReset) .def("reset", &DistModelDataBufReset) .def("length", &DistModelDataBuf::length) - .def("tolist", [](DistModelDataBuf& self, - const std::string& dtype) -> py::list { - py::list l; - if (dtype == "int32") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(int32_t); - l = py::cast(std::vector(data, data + size)); - } else if (dtype == "int64") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(int64_t); - l = py::cast(std::vector(data, data + size)); - } else if (dtype == "float32") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(float); - l = py::cast(std::vector(data, data + size)); - } else if (dtype == "float16") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(paddle::platform::float16); - l = py::cast( - std::vector(data, data + size)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported data type. Now only supports INT32, INT64, " - "FLOAT16 and FLOAT32.")); - } - return l; - }); + .def("tolist", + [](DistModelDataBuf& self, const std::string& dtype) -> py::list { + py::list l; + if (dtype == "int32") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(int32_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "int64") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(int64_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "float32") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(float); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "float16") { + auto* data = + static_cast(self.data()); + auto size = self.length() / sizeof(paddle::platform::float16); + l = py::cast( + std::vector(data, data + size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type. Now only supports INT32, INT64, " + "FLOAT16 and FLOAT32.")); + } + return l; + }); py::class_(*m, "DistModelTensor") .def(py::init<>()) diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc index aef02d65b4dbd..418804df02879 100644 --- a/paddle/fluid/pybind/communication.cc +++ b/paddle/fluid/pybind/communication.cc @@ -12,16 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/pybind/communication.h" + #include #include #include #include #include + #include #include #include "paddle/fluid/distributed/store/tcp_store.h" -#include "paddle/fluid/pybind/communication.h" namespace py = pybind11; @@ -35,22 +37,24 @@ void BindTCPStore(py::module *m) { py::class_>( *m, "Store") .def(py::init<>()) - .def("set", - [](distributed::Store &self, const std::string &key, - const std::string &value) { - std::vector data(value.begin(), value.end()); - self.set(key, data); - }, - py::arg("key"), py::arg("value"), - py::call_guard()) - .def("get", - [](distributed::Store &self, - const std::string &key) -> py::bytes { - auto data = self.get(key); - return py::bytes(reinterpret_cast(data.data()), - data.size()); - }, - py::arg("key"), py::call_guard()) + .def( + "set", + [](distributed::Store &self, const std::string &key, + const std::string &value) { + std::vector data(value.begin(), value.end()); + self.set(key, data); + }, + py::arg("key"), py::arg("value"), + py::call_guard()) + .def( + "get", + [](distributed::Store &self, + const std::string &key) -> py::bytes { + auto data = self.get(key); + return py::bytes(reinterpret_cast(data.data()), + data.size()); + }, + py::arg("key"), py::call_guard()) .def("add", &distributed::Store::add, py::call_guard()) .def("wait", &distributed::Store::wait, diff --git a/paddle/fluid/pybind/communicator_py.cc b/paddle/fluid/pybind/communicator_py.cc index 723d7f3197230..0cb5aa6ef7023 100644 --- a/paddle/fluid/pybind/communicator_py.cc +++ b/paddle/fluid/pybind/communicator_py.cc @@ -15,16 +15,17 @@ limitations under the License. */ #include "paddle/fluid/pybind/communicator_py.h" #include + #include #include #include #include -#include "paddle/fluid/framework/program_desc.h" -#include "pybind11/pybind11.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/distributed/communicator.h" #include "paddle/fluid/operators/distributed/large_scale_kv.h" #include "paddle/fluid/operators/distributed/ps/service/communicator/communicator_common.h" +#include "pybind11/pybind11.h" namespace py = pybind11; diff --git a/paddle/fluid/pybind/compatible.cc b/paddle/fluid/pybind/compatible.cc index cfe87a86cf0e5..013d0cc0c6068 100644 --- a/paddle/fluid/pybind/compatible.cc +++ b/paddle/fluid/pybind/compatible.cc @@ -13,23 +13,25 @@ // limitations under the License. #include "paddle/fluid/pybind/compatible.h" + #include #include + #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" namespace py = pybind11; -using paddle::framework::compatible::OpAttrVariantT; -using paddle::framework::compatible::OpUpdateInfo; using paddle::framework::compatible::OpAttrInfo; -using paddle::framework::compatible::OpInputOutputInfo; +using paddle::framework::compatible::OpAttrVariantT; using paddle::framework::compatible::OpBugfixInfo; -using paddle::framework::compatible::OpUpdateType; -using paddle::framework::compatible::OpUpdateBase; -using paddle::framework::compatible::OpVersionDesc; using paddle::framework::compatible::OpCheckpoint; +using paddle::framework::compatible::OpInputOutputInfo; +using paddle::framework::compatible::OpUpdateBase; +using paddle::framework::compatible::OpUpdateInfo; +using paddle::framework::compatible::OpUpdateType; using paddle::framework::compatible::OpVersion; +using paddle::framework::compatible::OpVersionDesc; namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 8b48d0b4e44ca..89a3904d0003f 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/const_value.h" + #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/pybind/crypto.cc b/paddle/fluid/pybind/crypto.cc index 8fbf395bf18a6..07a9e4021cee7 100644 --- a/paddle/fluid/pybind/crypto.cc +++ b/paddle/fluid/pybind/crypto.cc @@ -97,11 +97,12 @@ void BindAESCipher(py::module* m) { void BindCipherFactory(py::module* m) { py::class_(*m, "CipherFactory") .def(py::init<>()) - .def_static("create_cipher", - [](const std::string& config_file) { - return CipherFactory::CreateCipher(config_file); - }, - py::arg("config_file") = std::string()); + .def_static( + "create_cipher", + [](const std::string& config_file) { + return CipherFactory::CreateCipher(config_file); + }, + py::arg("config_file") = std::string()); } void BindCipherUtils(py::module* m) { diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 64c145c94f99d..54080d5e09615 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -12,13 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/pybind/cuda_streams_py.h" + #include #include #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/stream/cuda_stream.h" -#include "paddle/fluid/pybind/cuda_streams_py.h" namespace py = pybind11; @@ -28,29 +29,31 @@ void BindCudaStream(py::module *m_ptr) { auto &m = *m_ptr; // Bind Methods - m.def("_get_current_stream", - [](int deviceId) { + m.def( + "_get_current_stream", + [](int deviceId) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return paddle::platform::stream::get_current_stream(deviceId); + return paddle::platform::stream::get_current_stream(deviceId); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot visit cuda current" - "stream.")); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot visit cuda current" + "stream.")); #endif - }, - py::return_value_policy::reference); + }, + py::return_value_policy::reference); - m.def("_set_current_stream", - [](paddle::platform::stream::CUDAStream &stream) { + m.def( + "_set_current_stream", + [](paddle::platform::stream::CUDAStream &stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return paddle::platform::stream::set_current_stream(&stream); + return paddle::platform::stream::set_current_stream(&stream); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot set cuda current " - "stream.")); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot set cuda current " + "stream.")); #endif - }, - py::return_value_policy::reference); + }, + py::return_value_policy::reference); m.def("_device_synchronize", [](int device_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -94,12 +97,13 @@ void BindCudaStream(py::module *m_ptr) { )DOC") #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - .def("wait_event", - [](paddle::platform::stream::CUDAStream &self, - paddle::platform::CudaEvent &event) { - self.WaitEvent(event.GetRawCudaEvent()); - }, - R"DOC( + .def( + "wait_event", + [](paddle::platform::stream::CUDAStream &self, + paddle::platform::CudaEvent &event) { + self.WaitEvent(event.GetRawCudaEvent()); + }, + R"DOC( Makes all future work submitted to stream wait for all work captured in event. Parameters: @@ -115,15 +119,16 @@ void BindCudaStream(py::module *m_ptr) { s.wait_event(event) )DOC") - .def("wait_stream", - [](paddle::platform::stream::CUDAStream &self, - paddle::platform::stream::CUDAStream &stream) { - paddle::platform::CudaEvent event; - event.Record(stream.raw_stream()); - - self.WaitEvent(event.GetRawCudaEvent()); - }, - R"DOC( + .def( + "wait_stream", + [](paddle::platform::stream::CUDAStream &self, + paddle::platform::stream::CUDAStream &stream) { + paddle::platform::CudaEvent event; + event.Record(stream.raw_stream()); + + self.WaitEvent(event.GetRawCudaEvent()); + }, + R"DOC( Synchronizes with the given stream. Parameters: @@ -139,11 +144,12 @@ void BindCudaStream(py::module *m_ptr) { s1.wait_stream(s2) )DOC") - .def("query", - [](paddle::platform::stream::CUDAStream &self) { - return self.Query(); - }, - R"DOC( + .def( + "query", + [](paddle::platform::stream::CUDAStream &self) { + return self.Query(); + }, + R"DOC( Return the status whether if all operations in stream have completed. Returns: A boolean value. @@ -157,11 +163,12 @@ void BindCudaStream(py::module *m_ptr) { is_done = s.query() )DOC") - .def("synchronize", - [](paddle::platform::stream::CUDAStream &self) { - self.Synchronize(); - }, - R"DOC( + .def( + "synchronize", + [](paddle::platform::stream::CUDAStream &self) { + self.Synchronize(); + }, + R"DOC( Waits for stream tasks to complete. Examples: @@ -173,16 +180,17 @@ void BindCudaStream(py::module *m_ptr) { s.synchronize() )DOC") - .def("record_event", - [](paddle::platform::stream::CUDAStream &self, - paddle::platform::CudaEvent *event) { - if (event == nullptr) { - event = new paddle::platform::CudaEvent(); - } - event->Record(self.raw_stream()); - return event; - }, - R"DOC( + .def( + "record_event", + [](paddle::platform::stream::CUDAStream &self, + paddle::platform::CudaEvent *event) { + if (event == nullptr) { + event = new paddle::platform::CudaEvent(); + } + event->Record(self.raw_stream()); + return event; + }, + R"DOC( Record a CUDA event in the stream. Parameters: @@ -201,7 +209,7 @@ void BindCudaStream(py::module *m_ptr) { event = s.record_event() )DOC", - py::arg("event") = nullptr) + py::arg("event") = nullptr) .def_property_readonly( "cuda_stream", [](paddle::platform::stream::CUDAStream &self) { @@ -225,32 +233,33 @@ void BindCudaStream(py::module *m_ptr) { )DOC") #endif - .def("__init__", - [](paddle::platform::stream::CUDAStream &self, - platform::CUDAPlace *device, int priority) { + .def( + "__init__", + [](paddle::platform::stream::CUDAStream &self, + platform::CUDAPlace *device, int priority) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (priority != 1 && priority != 2) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Priority should be 1(high) or 2(normal) ")); - } - auto prio = paddle::platform::stream::Priority(priority); - auto stream_flag = - paddle::platform::stream::StreamFlag::kStreamNonBlocking; - - if (device == nullptr) { - int curr_device_id = platform::GetCurrentDeviceId(); - auto device_tmp = platform::CUDAPlace(curr_device_id); - device = &device_tmp; - } - - new (&self) paddle::platform::stream::CUDAStream(*device, prio, - stream_flag); + if (priority != 1 && priority != 2) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Priority should be 1(high) or 2(normal) ")); + } + auto prio = paddle::platform::stream::Priority(priority); + auto stream_flag = + paddle::platform::stream::StreamFlag::kStreamNonBlocking; + + if (device == nullptr) { + int curr_device_id = platform::GetCurrentDeviceId(); + auto device_tmp = platform::CUDAPlace(curr_device_id); + device = &device_tmp; + } + + new (&self) paddle::platform::stream::CUDAStream(*device, prio, + stream_flag); #else PADDLE_THROW(platform::errors::Unavailable( "Class CUDAStream can only be initialized on the GPU platform.")); #endif - }, - py::arg("device") = nullptr, py::arg("priority") = 2) + }, + py::arg("device") = nullptr, py::arg("priority") = 2) .def( "__init__", [](paddle::platform::stream::CUDAStream &self, int device, @@ -315,15 +324,16 @@ void BindCudaStream(py::module *m_ptr) { )DOC") #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - .def("record", - [](paddle::platform::CudaEvent &self, - paddle::platform::stream::CUDAStream *stream) { - if (stream == nullptr) { - stream = paddle::platform::stream::get_current_stream(-1); - } - self.Record(stream->raw_stream()); - }, - R"DOC( + .def( + "record", + [](paddle::platform::CudaEvent &self, + paddle::platform::stream::CUDAStream *stream) { + if (stream == nullptr) { + stream = paddle::platform::stream::get_current_stream(-1); + } + self.Record(stream->raw_stream()); + }, + R"DOC( Records the event in the given stream. Parameters: @@ -338,10 +348,11 @@ void BindCudaStream(py::module *m_ptr) { event.record() )DOC", - py::arg("stream") = nullptr) - .def("query", - [](paddle::platform::CudaEvent &self) { return self.Query(); }, - R"DOC( + py::arg("stream") = nullptr) + .def( + "query", + [](paddle::platform::CudaEvent &self) { return self.Query(); }, + R"DOC( Queries the event's status. Returns: A boolean which indicates all work currently captured by the event has been completed. @@ -355,8 +366,9 @@ void BindCudaStream(py::module *m_ptr) { is_done = event.query() )DOC") - .def("synchronize", - [](paddle::platform::CudaEvent &self) { self.Synchronize(); }, R"DOC( + .def( + "synchronize", + [](paddle::platform::CudaEvent &self) { self.Synchronize(); }, R"DOC( Waits for an event to complete. Examples: @@ -369,22 +381,23 @@ void BindCudaStream(py::module *m_ptr) { )DOC") #endif - .def("__init__", - [](paddle::platform::CudaEvent &self, bool enable_timing, - bool blocking, bool interprocess) { + .def( + "__init__", + [](paddle::platform::CudaEvent &self, bool enable_timing, + bool blocking, bool interprocess) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - unsigned int flags = platform::GenerateDeviceEventFlag( - enable_timing, blocking, interprocess); - new (&self) paddle::platform::CudaEvent(flags); + unsigned int flags = platform::GenerateDeviceEventFlag( + enable_timing, blocking, interprocess); + new (&self) paddle::platform::CudaEvent(flags); #else - PADDLE_THROW(platform::errors::Unavailable( - "Class CUDAEvent can only be initialized on the GPU " - "platform.")); + PADDLE_THROW(platform::errors::Unavailable( + "Class CUDAEvent can only be initialized on the GPU " + "platform.")); #endif - }, - py::arg("enable_timing") = false, py::arg("blocking") = false, - py::arg("interprocess") = false); + }, + py::arg("enable_timing") = false, py::arg("blocking") = false, + py::arg("interprocess") = false); } } // namespace pybind diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index 5e2274cb65138..700bd458a58eb 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include #include #include + #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/text_format.h" #include "paddle/fluid/framework/async_executor.h" diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 6636fc8aca51d..3d1a81da6f382 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -109,132 +109,141 @@ void BindDistributed(py::module *m) { .def("rank", &distributed::ProcessGroup::GetRank) .def("size", &distributed::ProcessGroup::GetSize) .def("name", &distributed::ProcessGroup::GetBackendName) - .def("allreduce", - [](distributed::ProcessGroup &self, py::handle py_tensor, - distributed::ReduceOp op) { - auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); - distributed::AllreduceOptions opts; - opts.reduce_op = op; - auto dense = - std::dynamic_pointer_cast(tensor.impl()); - std::vector tensors = {*dense}; - return self.AllReduce(tensors, tensors, opts); - }, - py::arg("tensor"), py::arg("op") = distributed::ReduceOp::SUM, - py::call_guard()) - - .def("broadcast", - [](distributed::ProcessGroup &self, py::handle py_tensor, - int source_rank) { - auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); - distributed::BroadcastOptions opts; - opts.source_rank = source_rank; - auto dense = - std::dynamic_pointer_cast(tensor.impl()); - std::vector tensors = {*dense}; - return self.Broadcast(tensors, tensors, opts); - }, - py::arg("tensor"), py::arg("source_rank"), - py::call_guard()) - - .def("barrier", - [](distributed::ProcessGroup &self, std::vector place_ids) { - distributed::BarrierOptions opts; - opts.place_ids = place_ids; - return self.Barrier(opts); - }, - py::arg("place_ids") = std::vector{}, - py::call_guard()) - - .def("send", - [](distributed::ProcessGroup &self, py::handle py_tensor, - int dst) { - auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); - auto dense = - std::dynamic_pointer_cast(tensor.impl()); - std::vector tensors = {*dense}; - return self.Send(tensors, dst); - }, - py::arg("tensor"), py::arg("dst"), - py::call_guard()) - - .def("recv", - [](distributed::ProcessGroup &self, py::handle py_tensor, - int src) { - auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); - auto dense = - std::dynamic_pointer_cast(tensor.impl()); - std::vector tensors = {*dense}; - return self.Recv(tensors, src); - }, - py::arg("tensor"), py::arg("src"), - py::call_guard()) - - .def("all_gather", - [](distributed::ProcessGroup &self, py::handle py_in_tensor, - py::handle py_out_tensor) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - auto out_dense = std::dynamic_pointer_cast( - out_tensor.impl()); - std::vector in_tensors = {*in_dense}; - std::vector out_tensors = {*out_dense}; - return self.AllGather(in_tensors, out_tensors); - }, - py::arg("in"), py::arg("out"), - py::call_guard()) - - .def("alltoall", - [](distributed::ProcessGroup &self, py::handle py_in_tensor, - py::handle py_out_tensor) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - auto out_dense = std::dynamic_pointer_cast( - out_tensor.impl()); - std::vector in_tensors = {*in_dense}; - std::vector out_tensors = {*out_dense}; - return self.AllToAll(in_tensors, out_tensors); - }, - py::arg("in"), py::arg("out"), - py::call_guard()) - - .def("reduce", - [](distributed::ProcessGroup &self, py::handle py_in_tensor, - int dst, distributed::ReduceOp op) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - distributed::ReduceOptions opts; - opts.reduce_op = op; - opts.root_rank = dst; - auto dense = std::dynamic_pointer_cast( - in_tensor.impl()); - std::vector tensors = {*dense}; - return self.Reduce(tensors, tensors, opts); - }, - py::arg("tensor"), py::arg("dst"), - py::arg("op") = distributed::ReduceOp::SUM, - py::call_guard()) - - .def("scatter", - [](distributed::ProcessGroup &self, py::handle py_in_tensor, - py::handle py_out_tensor, int src) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - distributed::ScatterOptions opts; - opts.root_rank = src; - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - auto out_dense = std::dynamic_pointer_cast( - out_tensor.impl()); - std::vector in_tensors = {*in_dense}; - std::vector out_tensors = {*out_dense}; - return self.Scatter(in_tensors, out_tensors, opts); - }, - py::arg("in"), py::arg("out"), py::arg("src"), - py::call_guard()); + .def( + "allreduce", + [](distributed::ProcessGroup &self, py::handle py_tensor, + distributed::ReduceOp op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::AllreduceOptions opts; + opts.reduce_op = op; + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.AllReduce(tensors, tensors, opts); + }, + py::arg("tensor"), py::arg("op") = distributed::ReduceOp::SUM, + py::call_guard()) + + .def( + "broadcast", + [](distributed::ProcessGroup &self, py::handle py_tensor, + int source_rank) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::BroadcastOptions opts; + opts.source_rank = source_rank; + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Broadcast(tensors, tensors, opts); + }, + py::arg("tensor"), py::arg("source_rank"), + py::call_guard()) + + .def( + "barrier", + [](distributed::ProcessGroup &self, std::vector place_ids) { + distributed::BarrierOptions opts; + opts.place_ids = place_ids; + return self.Barrier(opts); + }, + py::arg("place_ids") = std::vector{}, + py::call_guard()) + + .def( + "send", + [](distributed::ProcessGroup &self, py::handle py_tensor, + int dst) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Send(tensors, dst); + }, + py::arg("tensor"), py::arg("dst"), + py::call_guard()) + + .def( + "recv", + [](distributed::ProcessGroup &self, py::handle py_tensor, + int src) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Recv(tensors, src); + }, + py::arg("tensor"), py::arg("src"), + py::call_guard()) + + .def( + "all_gather", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + py::handle py_out_tensor) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; + return self.AllGather(in_tensors, out_tensors); + }, + py::arg("in"), py::arg("out"), + py::call_guard()) + + .def( + "alltoall", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + py::handle py_out_tensor) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; + return self.AllToAll(in_tensors, out_tensors); + }, + py::arg("in"), py::arg("out"), + py::call_guard()) + + .def( + "reduce", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + int dst, distributed::ReduceOp op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + distributed::ReduceOptions opts; + opts.reduce_op = op; + opts.root_rank = dst; + auto dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector tensors = {*dense}; + return self.Reduce(tensors, tensors, opts); + }, + py::arg("tensor"), py::arg("dst"), + py::arg("op") = distributed::ReduceOp::SUM, + py::call_guard()) + + .def( + "scatter", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + py::handle py_out_tensor, int src) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + distributed::ScatterOptions opts; + opts.root_rank = src; + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; + return self.Scatter(in_tensors, out_tensors, opts); + }, + py::arg("in"), py::arg("out"), py::arg("src"), + py::call_guard()); #if defined(PADDLE_WITH_NCCL) py::class_def("eager_assign_group_by_size", - [](py::handle py_tensors, std::vector is_sparse_gradient, - std::vector group_size_limits, - std::vector tensor_indices) { - auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); - return distributed::Eager_AssignGroupBySize( - tensors, is_sparse_gradient, group_size_limits, tensor_indices); - }, - py::arg("tensors"), py::arg("is_sparse_gradient"), - py::arg("group_size_limits") = std::vector{25 * 1024 * 1024}, - py::arg("tensor_indices") = std::vector{}, - py::call_guard()); + m->def( + "eager_assign_group_by_size", + [](py::handle py_tensors, std::vector is_sparse_gradient, + std::vector group_size_limits, + std::vector tensor_indices) { + auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); + return distributed::Eager_AssignGroupBySize( + tensors, is_sparse_gradient, group_size_limits, tensor_indices); + }, + py::arg("tensors"), py::arg("is_sparse_gradient"), + py::arg("group_size_limits") = std::vector{25 * 1024 * 1024}, + py::arg("tensor_indices") = std::vector{}, + py::call_guard()); py::class_>(*m, "EagerReducer", R"DOC()DOC") .def(py::init(&CreateEagerReducer)) - .def("prepare_for_backward", - [](distributed::EagerReducer &self, py::handle py_tensors) { - auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); - self.PrepareForBackward(params); - }, - py::arg("tensors"), py::call_guard()); + .def( + "prepare_for_backward", + [](distributed::EagerReducer &self, py::handle py_tensors) { + auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); + self.PrepareForBackward(params); + }, + py::arg("tensors"), py::call_guard()); } } // end namespace pybind diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index c1b26ee0b792d..f9325d1b9ca53 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -9,6 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // disable numpy compile error +#include "paddle/fluid/pybind/eager.h" + #include #include @@ -22,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" @@ -488,45 +489,45 @@ void AutoInitStringTensorByStringTensor( } /** We should have init function with signature: - * 1. - * def __init__ () - * 2. - * def __init__ ( - * ** dtype: paddle::framework::proto::VarType::Type, - * ** dims: vector, - * ** name: std::string, - * ** type: paddle::framework::proto::VarType::LodTensor, - * ** persistable: bool) - * 3. (multi-place) - * (should have at least one parameter, one parameter equals to case 4, zero - * parameter equals to case 1) - * def __init__ ( - * ** value: ndarray, - * ** place: paddle::platform::Place, - * ** persistable: bool, - * ** zero_copy: bool, - * ** name: std::string, - * ** stop_gradient: bool) - * 4. - * def __init__ ( - * ** value: ndarray) - * 5. - * def __init__ ( - * ** tensor: Tensor) - * 6. (multi-place) - * (should have at least one parameter, one parameter equals to case 5, zero - * parameter equals to case 1.) - * def __init__ ( - * ** tensor: Tensor, - * ** place: paddle::platform::Place, - * ** name: std::string) - * 7. (multi-place) (should have at least one parameter, one parameter similar - * to case 5, zero parameter equals to case 1.) - * def __init__ ( - * ** tensor: FrameworkTensor, - * ** place: paddle::platform::Place, - * ** name: std::string) - * **/ + * 1. + * def __init__ () + * 2. + * def __init__ ( + * ** dtype: paddle::framework::proto::VarType::Type, + * ** dims: vector, + * ** name: std::string, + * ** type: paddle::framework::proto::VarType::LodTensor, + * ** persistable: bool) + * 3. (multi-place) + * (should have at least one parameter, one parameter equals to case 4, zero + * parameter equals to case 1) + * def __init__ ( + * ** value: ndarray, + * ** place: paddle::platform::Place, + * ** persistable: bool, + * ** zero_copy: bool, + * ** name: std::string, + * ** stop_gradient: bool) + * 4. + * def __init__ ( + * ** value: ndarray) + * 5. + * def __init__ ( + * ** tensor: Tensor) + * 6. (multi-place) + * (should have at least one parameter, one parameter equals to case 5, zero + * parameter equals to case 1.) + * def __init__ ( + * ** tensor: Tensor, + * ** place: paddle::platform::Place, + * ** name: std::string) + * 7. (multi-place) (should have at least one parameter, one parameter similar + * to case 5, zero parameter equals to case 1.) + * def __init__ ( + * ** tensor: FrameworkTensor, + * ** place: paddle::platform::Place, + * ** name: std::string) + * **/ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY // set a flag to record use kwargs or not @@ -828,37 +829,37 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { } /** We should have init function with signature: - * 1. - * def __init__ () - * - * 2. - * def __init__ ( - * ** dims: vector, - * ** name: std::string) - * - * 3. - * (should have at least one parameter, one parameter equals to case 4, zero - * parameter equals to case 1) - * def __init__ ( - * ** value: ndarray, - * ** zero_copy: bool, - * ** name: std::string) - * - * 4. - * def __init__ ( - * ** value: ndarray) - * - * 5. - * def __init__ ( - * ** tensor: Tensor) - * - * 6. - * (should have at least one parameter, one parameter equals to case 5, zero - * parameter equals to case 1.) - * def __init__ ( - * ** tensor: Tensor, - * ** name: std::string) - * **/ + * 1. + * def __init__ () + * + * 2. + * def __init__ ( + * ** dims: vector, + * ** name: std::string) + * + * 3. + * (should have at least one parameter, one parameter equals to case 4, zero + * parameter equals to case 1) + * def __init__ ( + * ** value: ndarray, + * ** zero_copy: bool, + * ** name: std::string) + * + * 4. + * def __init__ ( + * ** value: ndarray) + * + * 5. + * def __init__ ( + * ** tensor: Tensor) + * + * 6. + * (should have at least one parameter, one parameter equals to case 5, zero + * parameter equals to case 1.) + * def __init__ ( + * ** tensor: Tensor, + * ** name: std::string) + * **/ int StringTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { // set a flag to record use kwargs or not bool flag_kwargs = false; @@ -916,8 +917,9 @@ int StringTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { // case 1 VLOG(6) << "Calling case1's string initializer."; EmptyStringTensorInitializer( - py_tensor_ptr, egr::Controller::Instance().GenerateUniqueName( - "generated_string_tensor"), + py_tensor_ptr, + egr::Controller::Instance().GenerateUniqueName( + "generated_string_tensor"), egr::Controller::Instance().GetExpectedPlace()); return 0; } else { diff --git a/paddle/fluid/pybind/eager.h b/paddle/fluid/pybind/eager.h index a3eac7ab47043..db2b438c3bd94 100644 --- a/paddle/fluid/pybind/eager.h +++ b/paddle/fluid/pybind/eager.h @@ -11,11 +11,11 @@ limitations under the License. */ #pragma once #include -#include "pybind11/pybind11.h" -#include "pybind11/stl.h" #include "paddle/fluid/eager/pylayer/py_layer_node.h" #include "paddle/phi/core/dense_tensor.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_custom_python_api.h b/paddle/fluid/pybind/eager_custom_python_api.h index 99ec4212918de..df4920a5e690f 100644 --- a/paddle/fluid/pybind/eager_custom_python_api.h +++ b/paddle/fluid/pybind/eager_custom_python_api.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/phi/core/enforce.h" static PyObject *eager_api_run_program(PyObject *self, PyObject *args, @@ -27,7 +28,8 @@ static PyObject *eager_api_run_program(PyObject *self, PyObject *args, GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false); auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true); framework::AttributeMap attrs; - ConstructAttrMapFromPyArgs("run_program", args, 5, PyTuple_GET_SIZE(args), + // TODO(zengjinle): support CUDA Graph on eager mode + ConstructAttrMapFromPyArgs("run_program", args, 6, PyTuple_GET_SIZE(args), attrs); tstate = PyEval_SaveThread(); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 628e808ef99ac..c75ac0b52c52c 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -20,9 +20,6 @@ typedef SSIZE_T ssize_t; #include #include -#include "pybind11/numpy.h" -#include "pybind11/pybind11.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" @@ -51,6 +48,8 @@ typedef SSIZE_T ssize_t; #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index b54f4e1416c35..ab6b8edd52eae 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -21,9 +21,6 @@ typedef SSIZE_T ssize_t; #include #include -#include "pybind11/numpy.h" -#include "pybind11/pybind11.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" @@ -47,12 +44,15 @@ typedef SSIZE_T ssize_t; #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "pybind11/detail/internals.h" +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace pybind { @@ -518,7 +518,10 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, } else if (grad->is_dense_tensor()) { if (grad->initialized()) { if (set_to_zero) { - grad->set_impl(paddle::experimental::zeros_like(*grad).impl()); + auto* grad_t = static_cast(grad->impl().get()); + auto* dev_ctx = + platform::DeviceContextPool::Instance().Get(grad_t->place()); + phi::funcs::set_constant(*dev_ctx, grad_t, 0.0); if (is_leaf) { std::static_pointer_cast( egr::EagerUtils::grad_node(self->tensor)) @@ -555,13 +558,26 @@ static PyObject* tensor__zero_grads(TensorObject* self, PyObject* args, "Please check if you have manually cleared" "the grad inside autograd_meta")); if (grad->initialized()) { - grad->set_impl(paddle::experimental::zeros_like(*(grad)).impl()); + if (grad->is_dense_tensor()) { + auto* t = static_cast(grad->impl().get()); + auto* dev_ctx = platform::DeviceContextPool::Instance().Get(t->place()); + phi::funcs::set_constant(*dev_ctx, t, 0.0); + } else { + grad->set_impl(paddle::experimental::zeros_like(*(grad)).impl()); + } } } else { auto meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor); if (meta->MutableGrad()->initialized()) { - meta->MutableGrad()->set_impl( - paddle::experimental::zeros_like(*(meta->MutableGrad())).impl()); + if (meta->MutableGrad()->is_dense_tensor()) { + auto* t = + static_cast(meta->MutableGrad()->impl().get()); + auto* dev_ctx = platform::DeviceContextPool::Instance().Get(t->place()); + phi::funcs::set_constant(*dev_ctx, t, 0.0); + } else { + meta->MutableGrad()->set_impl( + paddle::experimental::zeros_like(*(meta->MutableGrad())).impl()); + } } } @@ -990,10 +1006,11 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, PADDLE_ENFORCE_EQ( egr::egr_utils_api::IsLeafTensor(self->tensor) && !egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient(), - false, platform::errors::InvalidArgument( - "Leaf Tensor (%s) that doesn't stop gradient can't use " - "inplace strategy.", - self->tensor.name())); + false, + platform::errors::InvalidArgument( + "Leaf Tensor (%s) that doesn't stop gradient can't use " + "inplace strategy.", + self->tensor.name())); } paddle::experimental::Tensor value_tensor; @@ -1215,9 +1232,10 @@ static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args, "Only can register backward hook for leaf Tensor.")); PADDLE_ENFORCE_EQ( !egr::EagerUtils::unsafe_autograd_meta(self->tensor)->StopGradient(), - true, platform::errors::InvalidArgument( - "Cannot register backward hook on a Tensor that stop " - "gradient.")); + true, + platform::errors::InvalidArgument( + "Cannot register backward hook on a Tensor that stop " + "gradient.")); PADDLE_ENFORCE( grad_node.get() != nullptr, paddle::platform::errors::Fatal("Detected NULL grad_node," @@ -1650,8 +1668,8 @@ PyMethodDef variable_methods[] = { (PyCFunction)(void (*)(void))tensor_method__is_initialized, METH_VARARGS | METH_KEYWORDS, NULL}, {"_is_dense_tensor_hold_allocation", - (PyCFunction)( - void (*)(void))tensor_method__is_dense_tensor_hold_allocation, + (PyCFunction)(void (*)( + void))tensor_method__is_dense_tensor_hold_allocation, METH_VARARGS | METH_KEYWORDS, NULL}, {"_copy_to", (PyCFunction)(void (*)(void))tensor_method__copy_to, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -1776,8 +1794,8 @@ PyMethodDef string_tensor_variable_methods[] = { (PyCFunction)(void (*)(void))tensor_method__is_initialized, METH_VARARGS | METH_KEYWORDS, NULL}, {"_is_string_tensor_hold_allocation", - (PyCFunction)( - void (*)(void))tensor_method__is_string_tensor_hold_allocation, + (PyCFunction)(void (*)( + void))tensor_method__is_string_tensor_hold_allocation, METH_VARARGS | METH_KEYWORDS, NULL}, // TODO(zhoushunjie): Need to add _copy_to, copy_ for StringTensor. {NULL, NULL, 0, NULL}}; diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index b546aa2d76bcd..f58f3ce94537e 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -486,7 +486,8 @@ int main(int argc, char* argv[]) { "\"paddle/fluid/pybind/op_function_common.h\"", "\"paddle/fluid/eager/api/generated/fluid_generated/" "dygraph_forward_api.h\"", - "\"paddle/fluid/pybind/exception.h\"", ""}; + "\"paddle/fluid/pybind/exception.h\"", + ""}; std::ofstream out(argv[1], std::ios::out); diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc index 47a5309d691f5..a0cef6388c13f 100644 --- a/paddle/fluid/pybind/eager_py_layer.cc +++ b/paddle/fluid/pybind/eager_py_layer.cc @@ -16,8 +16,6 @@ limitations under the License. */ #include #pragma GCC diagnostic ignored "-Wattributes" -#include "pybind11/pytypes.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" @@ -34,6 +32,7 @@ limitations under the License. */ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "pybind11/detail/internals.h" +#include "pybind11/pytypes.h" #pragma GCC diagnostic ignored "-Wwrite-strings" #pragma GCC diagnostic ignored "-Wmissing-field-initializers" @@ -323,10 +322,11 @@ PyObject* pylayer_method_apply(PyObject* cls, PyObject* args, egr::EagerUtils::autograd_meta(dirty_tensor); PADDLE_ENFORCE_EQ(!dirty_tensor_autograd_meta->StopGradient() && egr::egr_utils_api::IsLeafTensor(*dirty_tensor), - false, paddle::platform::errors::InvalidArgument( - "Leaf Var (%s) that doesn't stop gradient " - "can't use inplace strategy.", - dirty_tensor->name())); + false, + paddle::platform::errors::InvalidArgument( + "Leaf Var (%s) that doesn't stop gradient " + "can't use inplace strategy.", + dirty_tensor->name())); dirty_tensor->bump_inplace_version(); VLOG(3) << "Tensor(" << dirty_tensor->name() << ") uses Inplace Strategy."; @@ -466,16 +466,19 @@ PyMethodDef pylayer_methods[] = { METH_O, NULL}, {NULL, NULL, 0, NULL}}; -struct PyGetSetDef pylayer_properties[]{ - {"container", (getter)tensor_properties_get_container, - (setter)tensor_properties_set_container, nullptr, nullptr}, - {"non_differentiable", (getter)tensor_properties_get_non_differentiable, - (setter)tensor_properties_set_non_differentiable, nullptr, nullptr}, - {"dirty_tensors", (getter)tensor_properties_get_dirty_tensors, - (setter)tensor_properties_set_dirty_tensors, nullptr, nullptr}, - {"materialize_grads", nullptr, - (setter)tensor_properties_set_materialize_grads, nullptr, nullptr}, - {nullptr, nullptr, nullptr, nullptr, nullptr}}; +struct PyGetSetDef pylayer_properties[] { + {"container", (getter)tensor_properties_get_container, + (setter)tensor_properties_set_container, nullptr, nullptr}, + {"non_differentiable", (getter)tensor_properties_get_non_differentiable, + (setter)tensor_properties_set_non_differentiable, nullptr, nullptr}, + {"dirty_tensors", (getter)tensor_properties_get_dirty_tensors, + (setter)tensor_properties_set_dirty_tensors, nullptr, nullptr}, + {"materialize_grads", nullptr, + (setter)tensor_properties_set_materialize_grads, nullptr, nullptr}, + { + nullptr, nullptr, nullptr, nullptr, nullptr + } +}; void BindEagerPyLayer(PyObject* module) { auto heap_type = reinterpret_cast( diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index efa0fe2cb582e..9bcac35037d04 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -14,6 +14,9 @@ limitations under the License. */ #include #include +// clang-format will try to move eager_utils.h in front of other headers +// according to google c++ style, and that cause compiling problems. +// clang-format off #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/framework/convert_utils.h" @@ -31,6 +34,7 @@ limitations under the License. */ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +// clang-format on namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 7f94f6c90e5a0..beab99877bd79 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -16,12 +16,12 @@ typedef SSIZE_T ssize_t; #endif #include + #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" - #include "pybind11/pybind11.h" #include "pybind11/stl.h" namespace paddle { @@ -112,8 +112,9 @@ struct TupleTensorResult { PyObject* args, ssize_t arg_idx) { TupleTensorResult::Run(out, result, value_idx, args, arg_idx); if (N - 1 == value_idx) { - PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out), - value_idx, args, arg_idx)); + PyTuple_SET_ITEM( + result, N - 1, + ToPyObject(std::get(out), value_idx, args, arg_idx)); } else { PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); } diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc index 4f25a6f1a5ca8..934a9ef97fb15 100644 --- a/paddle/fluid/pybind/exception.cc +++ b/paddle/fluid/pybind/exception.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/exception.h" + #include "paddle/phi/api/ext/exception.h" namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 4ffb513671c56..25f2c91002844 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -18,8 +18,6 @@ limitations under the License. */ #undef _XOPEN_SOURCE #endif -#include "paddle/fluid/pybind/fleet_py.h" - #include #include #include @@ -35,17 +33,18 @@ limitations under the License. */ #include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" #include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" +#include "paddle/fluid/pybind/fleet_py.h" namespace py = pybind11; using paddle::distributed::CommContext; using paddle::distributed::Communicator; +using paddle::distributed::FeatureNode; using paddle::distributed::FleetWrapper; -using paddle::distributed::HeterClient; -using paddle::distributed::GraphPyService; using paddle::distributed::GraphNode; -using paddle::distributed::GraphPyServer; using paddle::distributed::GraphPyClient; -using paddle::distributed::FeatureNode; +using paddle::distributed::GraphPyServer; +using paddle::distributed::GraphPyService; +using paddle::distributed::HeterClient; namespace paddle { namespace pybind { @@ -246,13 +245,13 @@ void BindGraphPyClient(py::module* m) { .def("bind_local_server", &GraphPyClient::bind_local_server); } -using paddle::distributed::TreeIndex; -using paddle::distributed::IndexWrapper; using paddle::distributed::IndexNode; +using paddle::distributed::IndexWrapper; +using paddle::distributed::TreeIndex; #ifdef PADDLE_WITH_HETERPS using paddle::framework::GraphGpuWrapper; -using paddle::framework::NeighborSampleResult; using paddle::framework::NeighborSampleQuery; +using paddle::framework::NeighborSampleResult; using paddle::framework::NodeQueryResult; #endif diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index af1c3da727d41..0e1d4cd76add2 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -46,10 +46,10 @@ void BindFleetWrapper(py::module* m) { .def("push_dense", &framework::FleetWrapper::PushDenseVarsSync) .def("pull_dense", &framework::FleetWrapper::PullDenseVarsSync) .def("init_server", &framework::FleetWrapper::InitServer) - .def("run_server", (uint64_t (framework::FleetWrapper::*)(void)) & + .def("run_server", (uint64_t(framework::FleetWrapper::*)(void)) & framework::FleetWrapper::RunServer) - .def("run_server", (uint64_t (framework::FleetWrapper::*)( // NOLINT - const std::string&, uint32_t)) & // NOLINT + .def("run_server", (uint64_t(framework::FleetWrapper::*)( // NOLINT + const std::string&, uint32_t)) & // NOLINT framework::FleetWrapper::RunServer) .def("init_worker", &framework::FleetWrapper::InitWorker) .def("init_model", &framework::FleetWrapper::PushDenseParamSync) diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 6bb85da8c466f..e456526f8441c 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -8,9 +8,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/core/generator.h" #include +#include "paddle/phi/core/generator.h" + #ifdef _POSIX_C_SOURCE #undef _POSIX_C_SOURCE #endif diff --git a/paddle/fluid/pybind/gloo_context_py.cc b/paddle/fluid/pybind/gloo_context_py.cc index 2314ceac76e5b..b4ee1bcd02bd7 100644 --- a/paddle/fluid/pybind/gloo_context_py.cc +++ b/paddle/fluid/pybind/gloo_context_py.cc @@ -43,13 +43,14 @@ void BindGlooContext(py::module *m) { py::class_ gloo_parallel_strategy( *m, "GlooParallelStrategy", ""); gloo_parallel_strategy.def(py::init()) - .def_property("rank_num", - [](const platform::GlooParallelStrategy &self) { - return self.rank_num; - }, - [](platform::GlooParallelStrategy &self, int nranks) { - self.rank_num = nranks; - }) + .def_property( + "rank_num", + [](const platform::GlooParallelStrategy &self) { + return self.rank_num; + }, + [](platform::GlooParallelStrategy &self, int nranks) { + self.rank_num = nranks; + }) .def_property( "rank", [](const platform::GlooParallelStrategy &self) { return self.rank; }, @@ -62,20 +63,22 @@ void BindGlooContext(py::module *m) { [](platform::GlooParallelStrategy &self, const std::string &iface) { self.iface = iface; }) - .def_property("init_seconds", - [](const platform::GlooParallelStrategy &self) { - return self.init_seconds; - }, - [](platform::GlooParallelStrategy &self, int init_seconds) { - self.init_seconds = init_seconds; - }) - .def_property("run_seconds", - [](const platform::GlooParallelStrategy &self) { - return self.run_seconds; - }, - [](platform::GlooParallelStrategy &self, int run_seconds) { - self.run_seconds = run_seconds; - }) + .def_property( + "init_seconds", + [](const platform::GlooParallelStrategy &self) { + return self.init_seconds; + }, + [](platform::GlooParallelStrategy &self, int init_seconds) { + self.init_seconds = init_seconds; + }) + .def_property( + "run_seconds", + [](const platform::GlooParallelStrategy &self) { + return self.run_seconds; + }, + [](platform::GlooParallelStrategy &self, int run_seconds) { + self.run_seconds = run_seconds; + }) .def_property( "ip_address", [](const platform::GlooParallelStrategy &self) { @@ -83,13 +86,14 @@ void BindGlooContext(py::module *m) { }, [](platform::GlooParallelStrategy &self, const std::string &ip_address) { self.ip_address = ip_address; }) - .def_property("ip_port", - [](const platform::GlooParallelStrategy &self) { - return self.ip_port; - }, - [](platform::GlooParallelStrategy &self, int ip_port) { - self.ip_port = ip_port; - }); + .def_property( + "ip_port", + [](const platform::GlooParallelStrategy &self) { + return self.ip_port; + }, + [](platform::GlooParallelStrategy &self, int ip_port) { + self.ip_port = ip_port; + }); py::class_ gloo_ctx(*m, "GlooParallelContext"); gloo_ctx.def(py::init()) diff --git a/paddle/fluid/pybind/gloo_context_py.h b/paddle/fluid/pybind/gloo_context_py.h index 89bd183097b75..51f736ed060ce 100644 --- a/paddle/fluid/pybind/gloo_context_py.h +++ b/paddle/fluid/pybind/gloo_context_py.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "pybind11/pybind11.h" #include "pybind11/stl.h" diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index d24c0355c2493..3de6c64617ddd 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -159,10 +159,9 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { // only initialize varbase, but not its tensor. static void InitVarBaseOnly(imperative::VarBase *self, const std::string &name, bool persistable = false, int stop_gradient = -1) { - auto name_ = name == "" - ? imperative::GetCurrentTracer()->GenerateUniqueName( - "generated_tensor") - : name; + auto name_ = name == "" ? imperative::GetCurrentTracer()->GenerateUniqueName( + "generated_tensor") + : name; VLOG(5) << "Init Tensor as: / name: " << name_ << " / persistable: " << persistable @@ -274,10 +273,9 @@ static void InitVarBaseFromTensorWithArgDefault(imperative::VarBase *self, const std::string &name) { VLOG(4) << "Init VarBase"; auto place = imperative::GetCurrentTracer()->ExpectedPlace(); - auto name_ = name == "" - ? imperative::GetCurrentTracer()->GenerateUniqueName( - "generated_tensor") - : name; + auto name_ = name == "" ? imperative::GetCurrentTracer()->GenerateUniqueName( + "generated_tensor") + : name; new (self) imperative::VarBase(name_); self->SetPersistable(false); self->SetType(framework::proto::VarType::LOD_TENSOR); @@ -299,10 +297,9 @@ static void InitVarBaseFromTensorWithArg(imperative::VarBase *self, const P &place, const std::string &name) { VLOG(4) << "Init VarBase"; - auto name_ = name == "" - ? imperative::GetCurrentTracer()->GenerateUniqueName( - "generated_tensor") - : name; + auto name_ = name == "" ? imperative::GetCurrentTracer()->GenerateUniqueName( + "generated_tensor") + : name; new (self) imperative::VarBase(name_); self->SetPersistable(false); self->SetType(framework::proto::VarType::LOD_TENSOR); @@ -556,38 +553,39 @@ void BindImperative(py::module *m_ptr) { }, py::return_value_policy::take_ownership); - m.def("_array_to_share_memory_tensor", - [](py::object &obj) { - // 1. cast to python array - auto array = obj.cast(); - PADDLE_ENFORCE_NE( - string::Sprintf("%s", array.dtype()).compare("object"), 0, - platform::errors::InvalidArgument( - "Faild to convert input data to a regular ndarray.\n * " - "Usually this means the input data contains nested " - "lists with different lengths.\n * Check the reader " - "function passed to 'set_(sample/sample_list/batch)" - "_generator' to locate the data causes this issue.")); - // 2. construcct LoDTensor - framework::LoDTensor t; - SetTensorFromPyArray(&t, array, - platform::CPUPlace(), true); - // 3. allocate shared memory - void *data_ptr = t.data(); - size_t data_size = t.numel() * framework::DataTypeSize(t.dtype()); - auto shared_writer_holder = - memory::allocation::AllocateMemoryMapWriterAllocation(data_size); - // 4. maintain mmap fd set & backup ipc_name - const std::string &ipc_name = shared_writer_holder->ipc_name(); - memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); - // 5. copy data & reset holder - memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), - platform::CPUPlace(), data_ptr, data_size); - t.ResetHolder(shared_writer_holder); - - return t; - }, - py::return_value_policy::take_ownership); + m.def( + "_array_to_share_memory_tensor", + [](py::object &obj) { + // 1. cast to python array + auto array = obj.cast(); + PADDLE_ENFORCE_NE( + string::Sprintf("%s", array.dtype()).compare("object"), 0, + platform::errors::InvalidArgument( + "Faild to convert input data to a regular ndarray.\n * " + "Usually this means the input data contains nested " + "lists with different lengths.\n * Check the reader " + "function passed to 'set_(sample/sample_list/batch)" + "_generator' to locate the data causes this issue.")); + // 2. construcct LoDTensor + framework::LoDTensor t; + SetTensorFromPyArray(&t, array, + platform::CPUPlace(), true); + // 3. allocate shared memory + void *data_ptr = t.data(); + size_t data_size = t.numel() * framework::DataTypeSize(t.dtype()); + auto shared_writer_holder = + memory::allocation::AllocateMemoryMapWriterAllocation(data_size); + // 4. maintain mmap fd set & backup ipc_name + const std::string &ipc_name = shared_writer_holder->ipc_name(); + memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); + // 5. copy data & reset holder + memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), + platform::CPUPlace(), data_ptr, data_size); + t.ResetHolder(shared_writer_holder); + + return t; + }, + py::return_value_policy::take_ownership); m.def("_remove_tensor_list_mmap_fds", [](py::list &tensor_list) { for (size_t i = 0; i < tensor_list.size(); ++i) { @@ -1089,31 +1087,32 @@ void BindImperative(py::module *m_ptr) { self.Name())); return var->CurrentInplaceVersion(); }) - .def("_bump_inplace_version", - [](std::shared_ptr &self) { - // NOTE(liym27): _bump_inplace_version is only used for inplace - // operation - self->BumpInplaceVersion(); - }, - R"DOC( + .def( + "_bump_inplace_version", + [](std::shared_ptr &self) { + // NOTE(liym27): _bump_inplace_version is only used for inplace + // operation + self->BumpInplaceVersion(); + }, + R"DOC( **Notes**: **This API is ONLY available in Dygraph mode.** **This is a very low level API. Users should not use it directly. ** Bump the version whenever the Tensor is modified through an inplace operation. )DOC") - .def("numpy", + .def( + "numpy", - [](imperative::VarBase &self) -> py::array { - const auto &tensor = - self.MutableVar()->Get(); - PADDLE_ENFORCE_EQ( - tensor.IsInitialized(), true, - platform::errors::InvalidArgument( - "Tensor of %s is Empty, please check if it has no data.", - self.Name())); - return TensorToPyArray(tensor, true); - }, - R"DOC( + [](imperative::VarBase &self) -> py::array { + const auto &tensor = self.MutableVar()->Get(); + PADDLE_ENFORCE_EQ( + tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "Tensor of %s is Empty, please check if it has no data.", + self.Name())); + return TensorToPyArray(tensor, true); + }, + R"DOC( Returns a numpy array shows the value of current Tensor. Returns: @@ -1133,68 +1132,69 @@ void BindImperative(py::module *m_ptr) { x = linear(data) print(x.numpy()) )DOC") - .def("detach", - [](const imperative::VarBase - &self) -> std::shared_ptr { - PADDLE_ENFORCE_EQ( - self.Var().IsInitialized(), true, - platform::errors::InvalidArgument( - "Tensor %s has not been initialized!", self.Name())); + .def( + "detach", + [](const imperative::VarBase &self) + -> std::shared_ptr { + PADDLE_ENFORCE_EQ( + self.Var().IsInitialized(), true, + platform::errors::InvalidArgument( + "Tensor %s has not been initialized!", self.Name())); - PADDLE_ENFORCE_EQ( - self.Var().IsType() || - self.Var().IsType(), - true, - platform::errors::InvalidArgument( - "Type of Tensor[%s] must be LoDTensor or SelectedRows!", - self.Name())); + PADDLE_ENFORCE_EQ( + self.Var().IsType() || + self.Var().IsType(), + true, + platform::errors::InvalidArgument( + "Type of Tensor[%s] must be LoDTensor or SelectedRows!", + self.Name())); - auto detach_var = std::make_shared( - true, "detach_" + self.Name()); + auto detach_var = std::make_shared( + true, "detach_" + self.Name()); - detach_var->SetPersistable(self.Persistable()); - detach_var->SetType(self.Type()); - detach_var->SetDataType(self.DataType()); + detach_var->SetPersistable(self.Persistable()); + detach_var->SetType(self.Type()); + detach_var->SetDataType(self.DataType()); - if (self.Var().IsType()) { - const auto &origin_tensor = - self.Var().Get(); - PADDLE_ENFORCE_EQ( - origin_tensor.IsInitialized(), true, - platform::errors::InvalidArgument( - "Tensor %s has not been initialized!", self.Name())); - - auto *detach_tensor = - detach_var->MutableVar()->GetMutable(); - detach_tensor->ShareDataWith(origin_tensor); - // NOTE(liym27): Call ShareInplaceVersionCounterWith to share the - // same TensorInplaceVersion, which is used to check whether - // inplace - // operations are correct. - detach_tensor->ShareInplaceVersionCounterWith(origin_tensor); - } else { - const auto &origin_selected_rows = - self.Var().Get(); - PADDLE_ENFORCE_EQ( - origin_selected_rows.value().IsInitialized(), true, - platform::errors::InvalidArgument( - "Tensor %s has not been initialized!", self.Name())); - - auto *detach_selected_rows = - detach_var->MutableVar()->GetMutable(); - detach_selected_rows->set_height(origin_selected_rows.height()); - detach_selected_rows->set_rows(origin_selected_rows.rows()); - detach_selected_rows->mutable_value()->ShareDataWith( - origin_selected_rows.value()); - detach_selected_rows->mutable_value() - ->ShareInplaceVersionCounterWith( - origin_selected_rows.value()); - } - VLOG(3) << "The detached Tensor(" << detach_var->Name() - << ") share data with " << self.Name(); - return detach_var; - }, - py::return_value_policy::take_ownership, R"DOC( + if (self.Var().IsType()) { + const auto &origin_tensor = + self.Var().Get(); + PADDLE_ENFORCE_EQ( + origin_tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "Tensor %s has not been initialized!", self.Name())); + + auto *detach_tensor = + detach_var->MutableVar()->GetMutable(); + detach_tensor->ShareDataWith(origin_tensor); + // NOTE(liym27): Call ShareInplaceVersionCounterWith to share the + // same TensorInplaceVersion, which is used to check whether + // inplace + // operations are correct. + detach_tensor->ShareInplaceVersionCounterWith(origin_tensor); + } else { + const auto &origin_selected_rows = + self.Var().Get(); + PADDLE_ENFORCE_EQ( + origin_selected_rows.value().IsInitialized(), true, + platform::errors::InvalidArgument( + "Tensor %s has not been initialized!", self.Name())); + + auto *detach_selected_rows = + detach_var->MutableVar()->GetMutable(); + detach_selected_rows->set_height(origin_selected_rows.height()); + detach_selected_rows->set_rows(origin_selected_rows.rows()); + detach_selected_rows->mutable_value()->ShareDataWith( + origin_selected_rows.value()); + detach_selected_rows->mutable_value() + ->ShareInplaceVersionCounterWith( + origin_selected_rows.value()); + } + VLOG(3) << "The detached Tensor(" << detach_var->Name() + << ") share data with " << self.Name(); + return detach_var; + }, + py::return_value_policy::take_ownership, R"DOC( Returns a new Tensor, detached from the current graph. It will share data with origin Tensor and always doesn't have a Tensor copy. @@ -1256,23 +1256,23 @@ void BindImperative(py::module *m_ptr) { .def("_gradient_set_empty", &imperative::VarBase::_GradientSetEmpty, py::arg("set_is_empty") = true) .def("_is_gradient_set_empty", &imperative::VarBase::_IsGradientSetEmpty) - .def("clone", - [](std::shared_ptr &self) { - const auto &tensor = self->Var().Get(); - PADDLE_ENFORCE_EQ( - tensor.IsInitialized(), true, - platform::errors::InvalidArgument( - "%s has not been initialized", self->Name())); - auto tracer = imperative::GetCurrentTracer(); - auto new_var = std::make_shared( - true, tracer->GenerateUniqueName(self->Name() + "_clone")); - framework::AttributeMap attrs; - imperative::NameVarBaseMap ins = {{"X", {self}}}; - imperative::NameVarBaseMap outs = {{"Out", {new_var}}}; - tracer->TraceOp("assign", ins, outs, attrs); - return new_var; - }, - py::return_value_policy::copy, R"DOC( + .def( + "clone", + [](std::shared_ptr &self) { + const auto &tensor = self->Var().Get(); + PADDLE_ENFORCE_EQ(tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "%s has not been initialized", self->Name())); + auto tracer = imperative::GetCurrentTracer(); + auto new_var = std::make_shared( + true, tracer->GenerateUniqueName(self->Name() + "_clone")); + framework::AttributeMap attrs; + imperative::NameVarBaseMap ins = {{"X", {self}}}; + imperative::NameVarBaseMap outs = {{"Out", {new_var}}}; + tracer->TraceOp("assign", ins, outs, attrs); + return new_var; + }, + py::return_value_policy::copy, R"DOC( Returns a new Tensor, which is clone of origin Tensor, and it remains in the current graph. It will always have a Tensor copy. @@ -1305,11 +1305,12 @@ void BindImperative(py::module *m_ptr) { print(x.grad) # None )DOC") .def("_grad_name", &imperative::VarBase::GradVarName) - .def("_grad_value", - [](imperative::VarBase &self) { - return self.MutableGradVar()->Get(); - }, - py::return_value_policy::reference) + .def( + "_grad_value", + [](imperative::VarBase &self) { + return self.MutableGradVar()->Get(); + }, + py::return_value_policy::reference) .def("_set_grad_type", [](imperative::VarBase &self, framework::proto::VarType::Type type) { self.MutableGradVarBase()->SetType(type); @@ -1337,26 +1338,27 @@ void BindImperative(py::module *m_ptr) { } } }) - .def("_grad_ivar", - [](const imperative::VarBase &self) { - auto &grad_var = self.GradVarBase(); - - if (grad_var && grad_var->Var().IsInitialized()) { - auto *tensor = - grad_var->MutableVar()->IsType() - ? grad_var->MutableVar() - ->GetMutable() - : grad_var->MutableVar() - ->GetMutable() - ->mutable_value(); - - if (tensor->IsInitialized()) { - return grad_var; - } - } - return std::shared_ptr(nullptr); - }, - py::return_value_policy::copy) + .def( + "_grad_ivar", + [](const imperative::VarBase &self) { + auto &grad_var = self.GradVarBase(); + + if (grad_var && grad_var->Var().IsInitialized()) { + auto *tensor = + grad_var->MutableVar()->IsType() + ? grad_var->MutableVar() + ->GetMutable() + : grad_var->MutableVar() + ->GetMutable() + ->mutable_value(); + + if (tensor->IsInitialized()) { + return grad_var; + } + } + return std::shared_ptr(nullptr); + }, + py::return_value_policy::copy) .def("_set_grad_ivar", [](imperative::VarBase &self, imperative::VarBase &grad) { self.SetGradVarBase(grad); @@ -1365,13 +1367,14 @@ void BindImperative(py::module *m_ptr) { [](imperative::VarBase &self) { return self.Var().IsType(); }) - .def("_allreduce", - [](imperative::VarBase &self, - const imperative::ParallelStrategy &strategy) { - if (strategy.nranks_ > 1) { + .def( + "_allreduce", + [](imperative::VarBase &self, + const imperative::ParallelStrategy &strategy) { + if (strategy.nranks_ > 1) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if NCCL_VERSION_CODE >= 2212 - imperative::AllReduce(self.Var(), self.MutableVar(), strategy); + imperative::AllReduce(self.Var(), self.MutableVar(), strategy); #else if (!self.Var().IsType()) { imperative::AllReduce(self.Var(), self.MutableVar(), strategy); @@ -1388,9 +1391,9 @@ void BindImperative(py::module *m_ptr) { "Imperative allreduce is not supported when paddle is " "not compiled with NCCL.")); #endif // PADDLE_WITH_NCCL or PADDLE_WITH_RCCL - } - }, - py::call_guard()) + } + }, + py::call_guard()) .def("_register_grad_hook", [](imperative::VarBase &self, const py::handle &hook) { PADDLE_ENFORCE_EQ( @@ -1425,22 +1428,23 @@ void BindImperative(py::module *m_ptr) { std::make_shared>(py_func)); } }) - .def("_register_backward_hook", - [](imperative::VarBase &self, const py::handle &hook) { - PADDLE_ENFORCE_EQ( - self.IsLeaf(), true, - platform::errors::InvalidArgument( - "Only can register backward hook for leaf Tensor.")); - PADDLE_ENFORCE_EQ( - !self.OverridedStopGradient() && self.HasGradVar(), true, - platform::errors::InvalidArgument( - "Cannot register backward hook on a Tensor that stop " - "gradient or without gradient.")); - auto py_func = PyObjectCast>(hook.ptr()); - self.GradVarBase()->AddVoidHook( - std::make_shared>(py_func)); - }, - R"DOC( + .def( + "_register_backward_hook", + [](imperative::VarBase &self, const py::handle &hook) { + PADDLE_ENFORCE_EQ( + self.IsLeaf(), true, + platform::errors::InvalidArgument( + "Only can register backward hook for leaf Tensor.")); + PADDLE_ENFORCE_EQ( + !self.OverridedStopGradient() && self.HasGradVar(), true, + platform::errors::InvalidArgument( + "Cannot register backward hook on a Tensor that stop " + "gradient or without gradient.")); + auto py_func = PyObjectCast>(hook.ptr()); + self.GradVarBase()->AddVoidHook( + std::make_shared>(py_func)); + }, + R"DOC( Registers a backward hook for current Tensor. This hook will be called every time the gradient of current Tensor has been fully calculated. @@ -1461,17 +1465,18 @@ void BindImperative(py::module *m_ptr) { Returns: None )DOC") - .def("cpu", - [](const std::shared_ptr &self) { - if (platform::is_cpu_place(self->Place())) { - return self; - } else { - auto new_var = self->NewVarBase(platform::CPUPlace(), true); - new_var->SetOverridedStopGradient(self->OverridedStopGradient()); - return new_var; - } - }, - R"DOC( + .def( + "cpu", + [](const std::shared_ptr &self) { + if (platform::is_cpu_place(self->Place())) { + return self; + } else { + auto new_var = self->NewVarBase(platform::CPUPlace(), true); + new_var->SetOverridedStopGradient(self->OverridedStopGradient()); + return new_var; + } + }, + R"DOC( Returns a copy of this Tensor in CPU memory. If this Tensor is already in CPU memory, then no copy is performed and the original Tensor is returned. @@ -1487,24 +1492,25 @@ void BindImperative(py::module *m_ptr) { print(y.place) # CPUPlace )DOC") - .def("pin_memory", - [](const std::shared_ptr &self) { + .def( + "pin_memory", + [](const std::shared_ptr &self) { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot copy this Tensor to pinned memory in CPU version " - "Paddle, " - "Please recompile or reinstall Paddle with CUDA support.")); + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot copy this Tensor to pinned memory in CPU version " + "Paddle, " + "Please recompile or reinstall Paddle with CUDA support.")); #endif - if (platform::is_cuda_pinned_place(self->Place())) { - return self; - } else { - auto new_var = - self->NewVarBase(platform::CUDAPinnedPlace(), true); - new_var->SetOverridedStopGradient(self->OverridedStopGradient()); - return new_var; - } - }, - R"DOC( + if (platform::is_cuda_pinned_place(self->Place())) { + return self; + } else { + auto new_var = + self->NewVarBase(platform::CUDAPinnedPlace(), true); + new_var->SetOverridedStopGradient(self->OverridedStopGradient()); + return new_var; + } + }, + R"DOC( Returns a copy of this Tensor in pin memory. If this Tensor is already in pin memory, then no copy is performed and the original Tensor is returned. @@ -1520,13 +1526,14 @@ void BindImperative(py::module *m_ptr) { print(y.place) # CUDAPinnedPlace )DOC") - .def("cuda", - [](const std::shared_ptr &self, - py::handle &handle, bool blocking) { + .def( + "cuda", + [](const std::shared_ptr &self, + py::handle &handle, bool blocking) { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot copy this Tensor to GPU in CPU version Paddle, " - "Please recompile or reinstall Paddle with CUDA support.")); + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot copy this Tensor to GPU in CPU version Paddle, " + "Please recompile or reinstall Paddle with CUDA support.")); #else int device_count = platform::GetGPUDeviceCount(); int device_id = 0; @@ -1563,8 +1570,8 @@ void BindImperative(py::module *m_ptr) { return new_var; } #endif - }, - py::arg("device_id") = py::none(), py::arg("blocking") = true, R"DOC( + }, + py::arg("device_id") = py::none(), py::arg("blocking") = true, R"DOC( Returns a copy of this Tensor in GPU memory. If this Tensor is already in GPU memory and device_id is default, @@ -1592,49 +1599,51 @@ void BindImperative(py::module *m_ptr) { y = x.cuda(1) print(y.place) # CUDAPlace(1) )DOC") - .def("_share_memory", - [](const std::shared_ptr &self) { + .def( + "_share_memory", + [](const std::shared_ptr &self) { #ifndef _WIN32 - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(self->Place()), true, - platform::errors::InvalidArgument( - "Sharing memory only support CPU Tensor currently")); - // 1. get LoDTensor - auto *t = self->MutableVar()->GetMutable(); - // 2. allocate shared memory - void *data_ptr = t->data(); - size_t data_size = - t->numel() * framework::SizeOfType( - framework::TransToProtoVarType(t->dtype())); - auto shared_writer_holder = - memory::allocation::AllocateMemoryMapWriterAllocation( - data_size); - // 3. maintain mmap fd set & backup ipc_name - const std::string &ipc_name = shared_writer_holder->ipc_name(); - memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); - // 4. copy data & reset holder - memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), - platform::CPUPlace(), data_ptr, data_size); - t->ResetHolder(shared_writer_holder); - return *t; + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(self->Place()), true, + platform::errors::InvalidArgument( + "Sharing memory only support CPU Tensor currently")); + // 1. get LoDTensor + auto *t = self->MutableVar()->GetMutable(); + // 2. allocate shared memory + void *data_ptr = t->data(); + size_t data_size = + t->numel() * framework::SizeOfType( + framework::TransToProtoVarType(t->dtype())); + auto shared_writer_holder = + memory::allocation::AllocateMemoryMapWriterAllocation( + data_size); + // 3. maintain mmap fd set & backup ipc_name + const std::string &ipc_name = shared_writer_holder->ipc_name(); + memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); + // 4. copy data & reset holder + memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), + platform::CPUPlace(), data_ptr, data_size); + t->ResetHolder(shared_writer_holder); + return *t; #else PADDLE_THROW(platform::errors::PermissionDenied( "Sharing memory in Windows OS is not supported currently")); #endif - }, - py::return_value_policy::reference) + }, + py::return_value_policy::reference) #if defined(PADDLE_WITH_CUDA) - .def("_uva", - [](const std::shared_ptr &self, int device_id) { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->Place()), true, - platform::errors::InvalidArgument( - "Unified virtual addressing only support " - "CPU Tensor currently.")); - auto *self_tensor = - self->MutableVar()->GetMutable(); - tensor_uva(self_tensor, device_id); - }, - py::arg("device_id") = 0, py::return_value_policy::reference, R"DOC( + .def( + "_uva", + [](const std::shared_ptr &self, int device_id) { + PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->Place()), true, + platform::errors::InvalidArgument( + "Unified virtual addressing only support " + "CPU Tensor currently.")); + auto *self_tensor = + self->MutableVar()->GetMutable(); + tensor_uva(self_tensor, device_id); + }, + py::arg("device_id") = 0, py::return_value_policy::reference, R"DOC( Returns self tensor with the UVA(unified virtual addressing). Args: @@ -1651,86 +1660,94 @@ void BindImperative(py::module *m_ptr) { )DOC") #endif .def("copy_", &imperative::VarBase::CopyFrom) - .def("_copy_to", - [](const std::shared_ptr &self, - const platform::CPUPlace &place, bool blocking) { - auto new_var = self->NewVarBase(place, blocking); - // Note(zhiqiu): Since NewVarBase may use GpuCopyAsync to - // copy data from the tensor of self to the tensor of new varbase, - // we need to ensure that the varbase self is not destructed until - // the GpuCopyAsync is completed. Otherwise, the memory may be - // freed - // when varbase self is destructed. - // To do that, we increase the reference count of self by 1 and - // add a cuda event to wait the GpuCopyAsync's completion. - if (!blocking) { - IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); - } - return new_var; - }, - py::return_value_policy::copy) - .def("_copy_to", - [](const std::shared_ptr &self, - const platform::CUDAPinnedPlace &place, bool blocking) { - auto new_var = self->NewVarBase(place, blocking); - if (!blocking) { - IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); - } - return new_var; - }, - py::return_value_policy::copy) - .def("_copy_to", - [](const std::shared_ptr &self, - const platform::XPUPlace &place, bool blocking) { - auto new_var = self->NewVarBase(place, blocking); - if (!blocking) { - IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); - } - return new_var; - }, - py::return_value_policy::copy) - .def("_copy_to", - [](const std::shared_ptr &self, - const platform::CUDAPlace &place, bool blocking) { - auto new_var = self->NewVarBase(place, blocking); - if (!blocking) { - IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); - } - return new_var; - }, - py::return_value_policy::copy) - .def("_copy_to", - [](const std::shared_ptr &self, - const platform::NPUPlace &place, bool blocking) { - auto new_var = self->NewVarBase(place, blocking); - if (!blocking) { - IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); - } - return new_var; - }, - py::return_value_policy::copy) - .def("_copy_to", - [](const std::shared_ptr &self, - const platform::MLUPlace &place, bool blocking) { - auto new_var = self->NewVarBase(place, blocking); - if (!blocking) { - IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); - } - return new_var; - }, - py::return_value_policy::copy) - .def("_copy_to", - [](const std::shared_ptr &self, - const platform::Place &place, bool blocking) { - auto new_var = self->NewVarBase(place, blocking); - if (!blocking) { - IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); - } - return new_var; - }, - py::return_value_policy::copy) - .def("value", [](imperative::VarBase &self) { return self.MutableVar(); }, - py::return_value_policy::reference) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::CPUPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + // Note(zhiqiu): Since NewVarBase may use GpuCopyAsync to + // copy data from the tensor of self to the tensor of new varbase, + // we need to ensure that the varbase self is not destructed until + // the GpuCopyAsync is completed. Otherwise, the memory may be + // freed + // when varbase self is destructed. + // To do that, we increase the reference count of self by 1 and + // add a cuda event to wait the GpuCopyAsync's completion. + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::CUDAPinnedPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::XPUPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::CUDAPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::NPUPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::MLUPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::Place &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) + .def( + "value", [](imperative::VarBase &self) { return self.MutableVar(); }, + py::return_value_policy::reference) .def("_clear", [](const std::shared_ptr &self) { auto *t = self->MutableVar()->GetMutable(); @@ -1842,39 +1859,28 @@ void BindImperative(py::module *m_ptr) { &imperative::VarBase::SetOverridedStopGradient) .def_property("persistable", &imperative::VarBase::Persistable, &imperative::VarBase::SetPersistable) - .def_property_readonly("shape", - [](imperative::VarBase &self) { - if (self.Var().IsType()) { - return phi::vectorize( - self.Var() - .Get() - .dims()); - } else if (self.Var() - .IsType()) { - return phi::vectorize( - self.Var() - .Get() - .value() - .dims()); - } else if (self.Var() - .IsType()) { - return std::vector{static_cast( - self.Var() - .Get() - .size())}; - } else if (self.Var() - .IsType()) { - return std::vector{static_cast( - self.Var() - .Get() - .size())}; - } else { - VLOG(2) << "It is meaningless to get shape of " - "variable type " - << GetTypeName(self); - return std::vector(); - } - }) + .def_property_readonly( + "shape", + [](imperative::VarBase &self) { + if (self.Var().IsType()) { + return phi::vectorize( + self.Var().Get().dims()); + } else if (self.Var().IsType()) { + return phi::vectorize( + self.Var().Get().value().dims()); + } else if (self.Var().IsType()) { + return std::vector{static_cast( + self.Var().Get().size())}; + } else if (self.Var().IsType()) { + return std::vector{ + static_cast(self.Var().Get().size())}; + } else { + VLOG(2) << "It is meaningless to get shape of " + "variable type " + << GetTypeName(self); + return std::vector(); + } + }) .def_property_readonly("is_leaf", &imperative::VarBase::IsLeaf, R"DOC( Whether a Tensor is leaf Tensor. @@ -2157,13 +2163,14 @@ void BindImperative(py::module *m_ptr) { [](imperative::ParallelStrategy &self, int nranks) { self.nranks_ = nranks; }) - .def_property("local_rank", - [](const imperative::ParallelStrategy &self) { - return self.local_rank_; - }, - [](imperative::ParallelStrategy &self, int local_rank) { - self.local_rank_ = local_rank; - }) + .def_property( + "local_rank", + [](const imperative::ParallelStrategy &self) { + return self.local_rank_; + }, + [](imperative::ParallelStrategy &self, int local_rank) { + self.local_rank_ = local_rank; + }) .def_property( "trainer_endpoints", [](const imperative::ParallelStrategy &self) { @@ -2172,12 +2179,14 @@ void BindImperative(py::module *m_ptr) { [](imperative::ParallelStrategy &self, std::vector eps) { self.trainer_endpoints_ = eps; }) - .def_property("current_endpoint", - [](const imperative::ParallelStrategy &self) { - return self.current_endpoint_; - }, - [](imperative::ParallelStrategy &self, - const std::string &ep) { self.current_endpoint_ = ep; }) + .def_property( + "current_endpoint", + [](const imperative::ParallelStrategy &self) { + return self.current_endpoint_; + }, + [](imperative::ParallelStrategy &self, const std::string &ep) { + self.current_endpoint_ = ep; + }) .def_property( "nrings", [](const imperative::ParallelStrategy &self) { return self.nrings_; }, @@ -2224,9 +2233,9 @@ void BindImperative(py::module *m_ptr) { }, py::call_guard()); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ - defined(PADDLE_WITH_CNCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \ + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_CNCL) py::class_>(m, "ParallelContext"); @@ -2359,43 +2368,44 @@ void BindImperative(py::module *m_ptr) { }); #if defined(PADDLE_WITH_CUDA) - m.def("to_uva_tensor", - [](const py::object &obj, int device_id) { - const auto &tracer = imperative::GetCurrentTracer(); - auto new_tensor = std::shared_ptr( - new imperative::VarBase(tracer->GenerateUniqueName())); - auto array = obj.cast(); - if (py::isinstance>(array)) { - SetUVATensorFromPyArray(new_tensor, array, device_id); - } else if (py::isinstance>(array)) { - SetUVATensorFromPyArray(new_tensor, array, device_id); - } else if (py::isinstance>(array)) { - SetUVATensorFromPyArray(new_tensor, array, device_id); - } else if (py::isinstance>(array)) { - SetUVATensorFromPyArray(new_tensor, array, device_id); - } else if (py::isinstance>(array)) { - SetUVATensorFromPyArray(new_tensor, array, device_id); - } else if (py::isinstance>(array)) { - SetUVATensorFromPyArray(new_tensor, array, device_id); - } else if (py::isinstance>( - array)) { - SetUVATensorFromPyArray( - new_tensor, array, device_id); - } else if (py::isinstance>(array)) { - SetUVATensorFromPyArray(new_tensor, array, device_id); - } else { - // obj may be any type, obj.cast() may be failed, - // then the array.dtype will be string of unknown meaning. - PADDLE_THROW(platform::errors::InvalidArgument( - "Input object type error or incompatible array data type. " - "tensor.set() supports array with bool, float16, float32, " - "float64, int8, int16, int32, int64," - "please check your input or input array data type.")); - } - return new_tensor; - }, - py::arg("obj"), py::arg("device_id") = 0, - py::return_value_policy::reference, R"DOC( + m.def( + "to_uva_tensor", + [](const py::object &obj, int device_id) { + const auto &tracer = imperative::GetCurrentTracer(); + auto new_tensor = std::shared_ptr( + new imperative::VarBase(tracer->GenerateUniqueName())); + auto array = obj.cast(); + if (py::isinstance>(array)) { + SetUVATensorFromPyArray(new_tensor, array, device_id); + } else if (py::isinstance>(array)) { + SetUVATensorFromPyArray(new_tensor, array, device_id); + } else if (py::isinstance>(array)) { + SetUVATensorFromPyArray(new_tensor, array, device_id); + } else if (py::isinstance>(array)) { + SetUVATensorFromPyArray(new_tensor, array, device_id); + } else if (py::isinstance>(array)) { + SetUVATensorFromPyArray(new_tensor, array, device_id); + } else if (py::isinstance>(array)) { + SetUVATensorFromPyArray(new_tensor, array, device_id); + } else if (py::isinstance>( + array)) { + SetUVATensorFromPyArray(new_tensor, array, + device_id); + } else if (py::isinstance>(array)) { + SetUVATensorFromPyArray(new_tensor, array, device_id); + } else { + // obj may be any type, obj.cast() may be failed, + // then the array.dtype will be string of unknown meaning. + PADDLE_THROW(platform::errors::InvalidArgument( + "Input object type error or incompatible array data type. " + "tensor.set() supports array with bool, float16, float32, " + "float64, int8, int16, int32, int64," + "please check your input or input array data type.")); + } + return new_tensor; + }, + py::arg("obj"), py::arg("device_id") = 0, + py::return_value_policy::reference, R"DOC( Returns tensor with the UVA(unified virtual addressing) created from numpy array. Args: diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index 0e3e98512d60f..91b9294421529 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "pybind11/pybind11.h" #include "pybind11/stl.h" diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 944781484076b..d6ffbf010016a 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/pybind/inference_api.h" + #include #include + #include #include #include @@ -26,6 +28,7 @@ #include #include #include + #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_infer_contrib.h" @@ -75,8 +78,8 @@ using paddle::AnalysisPredictor; using paddle::NativeConfig; using paddle::NativePaddlePredictor; using paddle::PaddleBuf; -using paddle::PaddleDType; using paddle::PaddleDataLayout; +using paddle::PaddleDType; using paddle::PaddlePassBuilder; using paddle::PaddlePlace; using paddle::PaddlePredictor; @@ -379,13 +382,13 @@ void BindInferenceApi(py::module *m) { &paddle::CreatePaddlePredictor, py::arg("config")); m->def("create_paddle_predictor", &paddle::CreatePaddlePredictor, py::arg("config")); - m->def("create_predictor", [](const paddle_infer::Config &config) - -> std::unique_ptr { - auto pred = - std::unique_ptr( - new paddle_infer::Predictor(config)); - return pred; - }); + m->def("create_predictor", + [](const paddle_infer::Config &config) + -> std::unique_ptr { + auto pred = std::unique_ptr( + new paddle_infer::Predictor(config)); + return pred; + }); m->def("copy_tensor", &CopyPaddleInferTensor); m->def("paddle_dtype_size", &paddle::PaddleDtypeSize); m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes); @@ -578,11 +581,11 @@ void BindAnalysisConfig(py::module *m) { .def(py::init()) .def(py::init()) .def("summary", &AnalysisConfig::Summary) - .def("set_model", (void (AnalysisConfig::*)(const std::string &)) & - AnalysisConfig::SetModel) - .def("set_model", (void (AnalysisConfig::*)(const std::string &, - const std::string &)) & + .def("set_model", (void(AnalysisConfig::*)(const std::string &)) & AnalysisConfig::SetModel) + .def("set_model", + (void(AnalysisConfig::*)(const std::string &, const std::string &)) & + AnalysisConfig::SetModel) .def("set_prog_file", &AnalysisConfig::SetProgFile) .def("set_params_file", &AnalysisConfig::SetParamsFile) .def("model_dir", &AnalysisConfig::model_dir) @@ -657,8 +660,9 @@ void BindAnalysisConfig(py::module *m) { py::arg("disable_trt_plugin_fp16") = false) .def("tensorrt_dynamic_shape_enabled", &AnalysisConfig::tensorrt_dynamic_shape_enabled) - .def("enable_tensorrt_oss", &AnalysisConfig::EnableTensorRtOSS) - .def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled) + .def("enable_tensorrt_varseqlen", &AnalysisConfig::EnableVarseqlen) + .def("tensorrt_varseqlen_enabled", + &AnalysisConfig::tensorrt_varseqlen_enabled) .def("collect_shape_range_info", &AnalysisConfig::CollectShapeRangeInfo) .def("shape_range_info_path", &AnalysisConfig::shape_range_info_path) .def("shape_range_info_collected", @@ -715,11 +719,12 @@ void BindAnalysisConfig(py::module *m) { [](AnalysisConfig &self, const std::string &pass) { self.pass_builder()->DeletePass(pass); }) - .def("pass_builder", - [](AnalysisConfig &self) { - return dynamic_cast(self.pass_builder()); - }, - py::return_value_policy::reference) + .def( + "pass_builder", + [](AnalysisConfig &self) { + return dynamic_cast(self.pass_builder()); + }, + py::return_value_policy::reference) .def("nnadapter", &AnalysisConfig::NNAdapter) .def("set_dist_config", &AnalysisConfig::SetDistConfig) .def("dist_config", &AnalysisConfig::dist_config); diff --git a/paddle/fluid/pybind/io.cc b/paddle/fluid/pybind/io.cc index a7222abf45c50..c880696242126 100644 --- a/paddle/fluid/pybind/io.cc +++ b/paddle/fluid/pybind/io.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/io.h" + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/pybind/io.h b/paddle/fluid/pybind/io.h index 942c93deccf99..7f10306e919e9 100644 --- a/paddle/fluid/pybind/io.h +++ b/paddle/fluid/pybind/io.h @@ -20,6 +20,7 @@ typedef SSIZE_T ssize_t; #endif #include + #include "paddle/fluid/pybind/pybind_boost_headers.h" namespace paddle { diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index ecbacd37d5666..ef005ee8b10fc 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -13,12 +13,14 @@ // limitations under the License. #include "paddle/fluid/pybind/ir.h" + #include #include #include #include #include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -31,18 +33,18 @@ #include "pybind11/stl.h" namespace py = pybind11; -using paddle::framework::ir::Graph; -using paddle::framework::ir::Node; -using paddle::framework::ir::NodeComp; -using paddle::framework::ir::GraphSafeRemoveNodes; -using paddle::framework::ir::HasCircle; -using paddle::framework::ir::GraphNum; -using paddle::framework::ir::TopologySortOperations; -using paddle::framework::ir::BuildOperationAdjList; using paddle::framework::OpDesc; using paddle::framework::ProgramDesc; using paddle::framework::Scope; using paddle::framework::VarDesc; +using paddle::framework::ir::BuildOperationAdjList; +using paddle::framework::ir::Graph; +using paddle::framework::ir::GraphNum; +using paddle::framework::ir::GraphSafeRemoveNodes; +using paddle::framework::ir::HasCircle; +using paddle::framework::ir::Node; +using paddle::framework::ir::NodeComp; +using paddle::framework::ir::TopologySortOperations; using pybind11::return_value_policy; namespace paddle { @@ -104,16 +106,18 @@ void BindGraph(py::module *m) { }) .def("erase", &Graph::Erase) .def("nodes", &Graph::Nodes, return_value_policy::reference) - .def("create_var_node", - [](Graph &self, VarDesc &var_desc) { - return self.CreateVarNode(&var_desc); - }, - return_value_policy::reference) - .def("create_op_node", - [](Graph &self, OpDesc &op_desc) { - return self.CreateOpNode(&op_desc); - }, - return_value_policy::reference) + .def( + "create_var_node", + [](Graph &self, VarDesc &var_desc) { + return self.CreateVarNode(&var_desc); + }, + return_value_policy::reference) + .def( + "create_op_node", + [](Graph &self, OpDesc &op_desc) { + return self.CreateOpNode(&op_desc); + }, + return_value_policy::reference) .def("create_control_dep_var", &Graph::CreateControlDepVar, return_value_policy::reference) .def("create_empty_node", &Graph::CreateEmptyNode, diff --git a/paddle/fluid/pybind/ir.h b/paddle/fluid/pybind/ir.h index 2cc1459bbe0fe..ad2d6aa11bfef 100644 --- a/paddle/fluid/pybind/ir.h +++ b/paddle/fluid/pybind/ir.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/graph.h" namespace paddle { diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 0e9c08cff2859..a3c6fa14765aa 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/pybind/op_function_common.h" + #include #include #include @@ -28,7 +30,6 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/pybind/imperative.h" -#include "paddle/fluid/pybind/op_function_common.h" namespace py = pybind11; namespace paddle { @@ -640,10 +641,11 @@ void CastPyArg2AttrBlock(PyObject* obj, void ConstructAttrMapFromPyArgs( const std::string& op_type, PyObject* args, ssize_t attr_start, ssize_t attr_end, paddle::framework::AttributeMap& attrs) { // NOLINT - PADDLE_ENFORCE_EQ( - (attr_end - attr_start) % 2, 0, - platform::errors::InvalidArgument( - "The number of arguments for attributes should be even.")); + PADDLE_ENFORCE_EQ((attr_end - attr_start) % 2, 0, + platform::errors::InvalidArgument( + "The number of arguments for attributes should be even " + "but attr_start = %d, attr_end = %d.", + attr_start, attr_end)); auto attr_type_map = &(OpAttrTypeMap::Instance().Map()[op_type]); diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 972e8aafab758..a6fd06f5d7059 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -182,7 +182,7 @@ std::map> op_outs_map = { {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, - {"run_program", {"DOut"}}, + {"run_program", {"DOut", "CUDAGraph"}}, {"adam", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, @@ -267,7 +267,7 @@ std::map> op_passing_outs_map = { {"moving_average_abs_max_scale", {"Out", "OutScale", "OutAccum", "OutState"}}, {"rnn", {"DropoutState"}}, - {"run_program", {"Out", "DOut", "OutScope"}}, + {"run_program", {"Out", "DOut", "OutScope", "CUDAGraph"}}, {"clear_float_status", {"FloatStatusOut"}}, {"get_float_status", {"FloatStatusOut"}}, {"assign", {"Out"}}, diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 66bf8c95179af..329b3b83337dc 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -76,11 +76,12 @@ void BindProgramDesc(pybind11::module *m) { platform::errors::InvalidArgument( "Failed to parse ProgramDesc from binary string.")); }) - .def("_set_version", - [](pd::ProgramDesc &self, int64_t version) { - return self.SetVersion(version); - }, - pybind11::arg("version") = pd::kCurProgramVersion) + .def( + "_set_version", + [](pd::ProgramDesc &self, int64_t version) { + return self.SetVersion(version); + }, + pybind11::arg("version") = pd::kCurProgramVersion) .def("_version", [](pd::ProgramDesc &self) -> int64_t { return self.Version(); }) .def("get_op_deps", [](const framework::ProgramDesc &program) { @@ -113,18 +114,20 @@ void BindBlockDesc(pybind11::module *m) { .def("_insert_op", &pd::BlockDesc::InsertOp, pybind11::return_value_policy::reference) .def("_remove_op", &pd::BlockDesc::RemoveOp) - .def("var", - [](pd::BlockDesc &self, pybind11::bytes byte_name) { - std::string name = byte_name; - return self.Var(name); - }, - pybind11::return_value_policy::reference) - .def("has_var", - [](pd::BlockDesc &self, pybind11::bytes byte_name) { - std::string name = byte_name; - return self.HasVar(name); - }, - pybind11::return_value_policy::reference) + .def( + "var", + [](pd::BlockDesc &self, pybind11::bytes byte_name) { + std::string name = byte_name; + return self.Var(name); + }, + pybind11::return_value_policy::reference) + .def( + "has_var", + [](pd::BlockDesc &self, pybind11::bytes byte_name) { + std::string name = byte_name; + return self.HasVar(name); + }, + pybind11::return_value_policy::reference) .def("_rename_var", [](pd::BlockDesc &self, const pybind11::bytes &byte_name, const pybind11::bytes &byte_name_new) { @@ -137,24 +140,27 @@ void BindBlockDesc(pybind11::module *m) { std::string name = byte_name; return self.HasVarRecursive(name); }) - .def("find_var", - [](pd::BlockDesc &self, pybind11::bytes byte_name) { - std::string name = byte_name; - return self.FindVar(name); - }, - pybind11::return_value_policy::reference) - .def("find_var_recursive", - [](pd::BlockDesc &self, pybind11::bytes byte_name) { - std::string name = byte_name; - return self.FindVarRecursive(name); - }, - pybind11::return_value_policy::reference) - .def("_remove_var", - [](pd::BlockDesc &self, pybind11::bytes byte_name) { - std::string name = byte_name; - return self.RemoveVar(name); - }, - pybind11::return_value_policy::reference) + .def( + "find_var", + [](pd::BlockDesc &self, pybind11::bytes byte_name) { + std::string name = byte_name; + return self.FindVar(name); + }, + pybind11::return_value_policy::reference) + .def( + "find_var_recursive", + [](pd::BlockDesc &self, pybind11::bytes byte_name) { + std::string name = byte_name; + return self.FindVarRecursive(name); + }, + pybind11::return_value_policy::reference) + .def( + "_remove_var", + [](pd::BlockDesc &self, pybind11::bytes byte_name) { + std::string name = byte_name; + return self.RemoveVar(name); + }, + pybind11::return_value_policy::reference) .def("all_vars", &pd::BlockDesc::AllVars, pybind11::return_value_policy::reference) .def("op_size", &pd::BlockDesc::OpSize) @@ -258,8 +264,9 @@ void BindOpDesc(pybind11::module *m) { pybind11::class_ op_desc(*m, "OpDesc", ""); op_desc - .def("__init__", [](pd::OpDesc &self) { new (&self) pd::OpDesc(); }, - pybind11::return_value_policy::reference) + .def( + "__init__", [](pd::OpDesc &self) { new (&self) pd::OpDesc(); }, + pybind11::return_value_policy::reference) .def("copy_from", &pd::OpDesc::CopyFrom) .def("type", &pd::OpDesc::Type) .def("set_type", &pd::OpDesc::SetType) @@ -304,8 +311,9 @@ void BindOpDesc(pybind11::module *m) { .def("infer_var_type", &pd::OpDesc::InferVarType) .def("set_is_target", &pd::OpDesc::SetIsTarget) .def("serialize_to_string", SerializeMessage) - .def("block", [](pd::OpDesc &self) { return self.Block(); }, - pybind11::return_value_policy::reference) + .def( + "block", [](pd::OpDesc &self) { return self.Block(); }, + pybind11::return_value_policy::reference) .def("id", &pd::OpDesc::Id) .def("original_id", &pd::OpDesc::OriginalId) .def("set_original_id", &pd::OpDesc::SetOriginalId) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0e1271c1fe07f..cba7d03623516 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -604,6 +604,8 @@ PYBIND11_MODULE(core_noavx, m) { place, static_cast(mode)); }) .def_static("end_capture", &platform::EndCUDAGraphCapture) + .def_static("gen_new_memory_pool_id", + &platform::CUDAGraph::UniqueMemoryPoolID) .def("replay", &platform::CUDAGraph::Replay) .def("reset", &platform::CUDAGraph::Reset) .def("print_to_dot_files", &platform::CUDAGraph::PrintToDotFiles); @@ -691,56 +693,56 @@ PYBIND11_MODULE(core_noavx, m) { m.def("_get_use_default_grad_op_desc_maker_ops", [] { return OpInfoMap::Instance().GetUseDefaultGradOpDescMakerOps(); }); - m.def("_get_all_register_op_kernels", - [](const std::string &lib) { - std::unordered_map> - all_kernels_info; - if (lib == "fluid" || lib == "all") { - auto &all_kernels = - paddle::framework::OperatorWithKernel::AllOpKernels(); - - for (auto &kernel_pair : all_kernels) { - auto op_type = kernel_pair.first; - std::vector kernel_types; - for (auto &info_pair : kernel_pair.second) { - paddle::framework::OpKernelType kernel_type = info_pair.first; - kernel_types.emplace_back( - paddle::framework::KernelTypeToString(kernel_type)); - } - all_kernels_info.emplace(op_type, kernel_types); + m.def( + "_get_all_register_op_kernels", + [](const std::string &lib) { + std::unordered_map> + all_kernels_info; + if (lib == "fluid" || lib == "all") { + auto &all_kernels = + paddle::framework::OperatorWithKernel::AllOpKernels(); + + for (auto &kernel_pair : all_kernels) { + auto op_type = kernel_pair.first; + std::vector kernel_types; + for (auto &info_pair : kernel_pair.second) { + paddle::framework::OpKernelType kernel_type = info_pair.first; + kernel_types.emplace_back( + paddle::framework::KernelTypeToString(kernel_type)); } + all_kernels_info.emplace(op_type, kernel_types); } - if (lib == "phi" || lib == "all") { - auto phi_kernels = phi::KernelFactory::Instance().kernels(); - for (auto &kernel_pair : phi_kernels) { - auto op_type = phi::TransToFluidOpName(kernel_pair.first); - std::vector kernel_types; - for (auto &info_pair : kernel_pair.second) { - framework::OpKernelType kernel_type = - framework::TransPhiKernelKeyToOpKernelType(info_pair.first); - auto kernel_type_str = - framework::KernelTypeToString(kernel_type); - if (all_kernels_info.count(op_type)) { - if (std::find(all_kernels_info[op_type].begin(), - all_kernels_info[op_type].end(), - kernel_type_str) == - all_kernels_info[op_type].end()) { - all_kernels_info[op_type].emplace_back(kernel_type_str); - } - } else { - kernel_types.emplace_back(kernel_type_str); + } + if (lib == "phi" || lib == "all") { + auto phi_kernels = phi::KernelFactory::Instance().kernels(); + for (auto &kernel_pair : phi_kernels) { + auto op_type = phi::TransToFluidOpName(kernel_pair.first); + std::vector kernel_types; + for (auto &info_pair : kernel_pair.second) { + framework::OpKernelType kernel_type = + framework::TransPhiKernelKeyToOpKernelType(info_pair.first); + auto kernel_type_str = framework::KernelTypeToString(kernel_type); + if (all_kernels_info.count(op_type)) { + if (std::find(all_kernels_info[op_type].begin(), + all_kernels_info[op_type].end(), + kernel_type_str) == + all_kernels_info[op_type].end()) { + all_kernels_info[op_type].emplace_back(kernel_type_str); } - } - if (!kernel_types.empty()) { - all_kernels_info.emplace(op_type, kernel_types); + } else { + kernel_types.emplace_back(kernel_type_str); } } + if (!kernel_types.empty()) { + all_kernels_info.emplace(op_type, kernel_types); + } } + } - return all_kernels_info; - }, - py::arg("lib") = "all", - R"DOC( + return all_kernels_info; + }, + py::arg("lib") = "all", + R"DOC( Return the registered kernels in paddle. Args: @@ -1009,9 +1011,10 @@ PYBIND11_MODULE(core_noavx, m) { t.set(np.ndarray([5, 30]), fluid.CPUPlace()) )DOC") - .def("shape", - [](framework::Tensor &self) { return vectorize(self.dims()); }, - R"DOC( + .def( + "shape", + [](framework::Tensor &self) { return vectorize(self.dims()); }, + R"DOC( Return the shape of Tensor. Returns: @@ -1099,20 +1102,21 @@ PYBIND11_MODULE(core_noavx, m) { // avoid misuse. // The discussion is here: // https://github.com/PaddlePaddle/Paddle/issues/10855 - .def("set_lod", - [](framework::Tensor &self, - const std::vector> &lod) { - // the input lod is offset-based level-of-detail info - LoD new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - PADDLE_ENFORCE_EQ( - CheckLoD(new_lod, vectorize(self.dims()).front()), true, - platform::errors::InvalidArgument( - "The provided LoD is invalid, the LoD is %s", new_lod)); - self.set_lod(new_lod); - }, - py::arg("lod"), R"DOC( + .def( + "set_lod", + [](framework::Tensor &self, + const std::vector> &lod) { + // the input lod is offset-based level-of-detail info + LoD new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + PADDLE_ENFORCE_EQ( + CheckLoD(new_lod, vectorize(self.dims()).front()), true, + platform::errors::InvalidArgument( + "The provided LoD is invalid, the LoD is %s", new_lod)); + self.set_lod(new_lod); + }, + py::arg("lod"), R"DOC( Set LoD of the Tensor. Args: @@ -1132,28 +1136,29 @@ PYBIND11_MODULE(core_noavx, m) { t.set_lod([[0, 2, 5]]) print(t.lod()) # [[0, 2, 5]] )DOC") - .def("set_recursive_sequence_lengths", - [](framework::Tensor &self, const std::vector> - &recursive_sequence_lengths) { - // the input recursive_sequence_lengths is length-based - // level-of-detail info - LoD new_lod; - new_lod.reserve(recursive_sequence_lengths.size()); - std::copy(recursive_sequence_lengths.begin(), - recursive_sequence_lengths.end(), - std::back_inserter(new_lod)); - LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true, - platform::errors::InvalidArgument( - "The provided recursive_sequence_lengths info is " - "invalid, " - "the LoD converted by recursive_sequence_lengths is " - "%s", - new_lod)); - self.set_lod(new_offset_lod); - }, - py::arg("recursive_sequence_lengths"), R"DOC( + .def( + "set_recursive_sequence_lengths", + [](framework::Tensor &self, const std::vector> + &recursive_sequence_lengths) { + // the input recursive_sequence_lengths is length-based + // level-of-detail info + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE_EQ( + CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true, + platform::errors::InvalidArgument( + "The provided recursive_sequence_lengths info is " + "invalid, " + "the LoD converted by recursive_sequence_lengths is " + "%s", + new_lod)); + self.set_lod(new_offset_lod); + }, + py::arg("recursive_sequence_lengths"), R"DOC( Set LoD of the Tensor according to recursive sequence lengths. For example, if recursive_sequence_lengths=[[2, 3]], which means @@ -1178,16 +1183,17 @@ PYBIND11_MODULE(core_noavx, m) { print(t.recursive_sequence_lengths()) # [[2, 3]] print(t.lod()) # [[0, 2, 5]] )DOC") - .def("lod", - [](framework::Tensor &self) -> std::vector> { - // output the offset-based lod info - LoD lod = self.lod(); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return new_lod; - }, - R"DOC( + .def( + "lod", + [](framework::Tensor &self) -> std::vector> { + // output the offset-based lod info + LoD lod = self.lod(); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; + }, + R"DOC( Return the LoD of the Tensor. Returns: @@ -1205,16 +1211,17 @@ PYBIND11_MODULE(core_noavx, m) { print(t.lod()) # [[0, 2, 5]] )DOC") // Set above comments of set_lod. - .def("recursive_sequence_lengths", - [](framework::Tensor &self) -> std::vector> { - // output the length-based lod info - LoD lod = phi::ConvertToLengthBasedLoD(self.lod()); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return new_lod; - }, - R"DOC( + .def( + "recursive_sequence_lengths", + [](framework::Tensor &self) -> std::vector> { + // output the length-based lod info + LoD lod = phi::ConvertToLengthBasedLoD(self.lod()); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; + }, + R"DOC( Return the recursive sequence lengths corresponding to of the LodD of the Tensor. @@ -1232,13 +1239,14 @@ PYBIND11_MODULE(core_noavx, m) { t.set_recursive_sequence_lengths([[2, 3]]) print(t.recursive_sequence_lengths()) # [[2, 3]] )DOC") - .def("has_valid_recursive_sequence_lengths", - [](framework::Tensor &self) -> bool { - // Check that the lod info is valid and match the outermost - // dimension of the Tensor data - return CheckLoD(self.lod(), vectorize(self.dims()).front()); - }, - R"DOC( + .def( + "has_valid_recursive_sequence_lengths", + [](framework::Tensor &self) -> bool { + // Check that the lod info is valid and match the outermost + // dimension of the Tensor data + return CheckLoD(self.lod(), vectorize(self.dims()).front()); + }, + R"DOC( Check whether the LoD of the Tensor is valid. Returns: @@ -1622,9 +1630,10 @@ PYBIND11_MODULE(core_noavx, m) { const int64_t &height) { new (&instance) phi::SelectedRows(rows, height); }) - .def("get_tensor", - [](phi::SelectedRows &self) { return self.mutable_value(); }, - py::return_value_policy::reference) + .def( + "get_tensor", + [](phi::SelectedRows &self) { return self.mutable_value(); }, + py::return_value_policy::reference) .def("numel", [](phi::SelectedRows &self) -> int64_t { return self.value().numel(); @@ -1666,11 +1675,12 @@ All parameter, weight, gradient are variables in Paddle. }) .def("get_float", [](const Variable &var) -> float { return var.Get(); }) - .def("get_tensor", - [](Variable &self) -> LoDTensor * { - return self.GetMutable(); - }, - py::return_value_policy::reference) + .def( + "get_tensor", + [](Variable &self) -> LoDTensor * { + return self.GetMutable(); + }, + py::return_value_policy::reference) .def("get_bytes", [](Variable &self) { return py::bytes(*self.GetMutable()); @@ -1681,53 +1691,60 @@ All parameter, weight, gradient are variables in Paddle. }) .def("set_vocab", [](Variable &self, Vocab vocab) { *self.GetMutable() = vocab; }) - .def("get_string_tensor", - [](Variable &self) { return self.GetMutable(); }, - py::return_value_policy::reference) - .def("get_map_tensor", - [](Variable &self) { return self.GetMutable(); }, - py::return_value_policy::reference) - .def("get_lod_rank_table", - [](Variable &self) { return self.GetMutable(); }, - py::return_value_policy::reference) - .def("get_selected_rows", - [](Variable &self) -> phi::SelectedRows * { - return self.GetMutable(); - }, - py::return_value_policy::reference) - .def("get_lod_tensor_array", - [](Variable &self) { return self.GetMutable(); }, - py::return_value_policy::reference) - .def("get_fetch_list", - [](Variable &self) { return self.GetMutable(); }, - py::return_value_policy::reference) + .def( + "get_string_tensor", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) + .def( + "get_map_tensor", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) + .def( + "get_lod_rank_table", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) + .def( + "get_selected_rows", + [](Variable &self) -> phi::SelectedRows * { + return self.GetMutable(); + }, + py::return_value_policy::reference) + .def( + "get_lod_tensor_array", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) + .def( + "get_fetch_list", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - .def("get_communicator", - [](Variable &self) -> platform::Communicator * { - return self.GetMutable(); - }, - py::return_value_policy::reference) + .def( + "get_communicator", + [](Variable &self) -> platform::Communicator * { + return self.GetMutable(); + }, + py::return_value_policy::reference) #endif - .def("get_reader", - [](Variable &self) -> framework::ReaderHolder * { - PADDLE_ENFORCE_EQ( - self.IsType(), true, - platform::errors::InvalidArgument( - "The variable is not type of ReaderHolder.")); - return self.GetMutable(); - }, - py::return_value_policy::reference) - .def("get_scope", - [](Variable &self) -> Scope * { - auto scope_vec = - self.GetMutable>(); - PADDLE_ENFORCE_GT( - scope_vec->size(), 0, - platform::errors::InvalidArgument( - "The size of scope_vec should be greater than 0")); - return scope_vec->front(); - }, - py::return_value_policy::reference) + .def( + "get_reader", + [](Variable &self) -> framework::ReaderHolder * { + PADDLE_ENFORCE_EQ(self.IsType(), true, + platform::errors::InvalidArgument( + "The variable is not type of ReaderHolder.")); + return self.GetMutable(); + }, + py::return_value_policy::reference) + .def( + "get_scope", + [](Variable &self) -> Scope * { + auto scope_vec = self.GetMutable>(); + PADDLE_ENFORCE_GT( + scope_vec->size(), 0, + platform::errors::InvalidArgument( + "The size of scope_vec should be greater than 0")); + return scope_vec->front(); + }, + py::return_value_policy::reference) .def("set_scope", [](Variable &self, Scope &scope) { auto scope_vec = self.GetMutable>(); scope_vec->emplace_back(&scope); @@ -1760,12 +1777,13 @@ All parameter, weight, gradient are variables in Paddle. _Scope .def("_remove_from_pool", [](Scope &self) { ScopePool::Instance().Remove(&self); }) - .def("var", - [](Scope &self, const std::string &name) -> Variable * { - return self.Var(name); - }, - py::arg("name"), - R"DOC( + .def( + "var", + [](Scope &self, const std::string &name) -> Variable * { + return self.Var(name); + }, + py::arg("name"), + R"DOC( Find or create variable named :code:`name` in the current scope. If the variable named :code:`name` does not exist in the @@ -1778,7 +1796,7 @@ All parameter, weight, gradient are variables in Paddle. Returns: out (core.Variable): the found or created variable. )DOC", - py::return_value_policy::reference) + py::return_value_policy::reference) .def("find_var", &Scope::FindVar, py::arg("name"), R"DOC( Find variable named :code:`name` in the current scope or @@ -1804,33 +1822,35 @@ All parameter, weight, gradient are variables in Paddle. None )DOC", py::return_value_policy::reference) - .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, - R"DOC( + .def( + "new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, + R"DOC( Create a new sub-scope of the current scope. Returns: out (core._Scope): the created sub-scope. )DOC", - py::return_value_policy::reference) + py::return_value_policy::reference) .def("drop_kids", &Scope::DropKids, R"DOC( Delete all sub-scopes of the current scope. )DOC") .def("_kids", &Scope::kids); - m.def("Scope", - []() -> Scope * { - auto *s = new Scope(); - ScopePool::Instance().Insert(std::unique_ptr(s)); - return s; - }, - R"DOC( + m.def( + "Scope", + []() -> Scope * { + auto *s = new Scope(); + ScopePool::Instance().Insert(std::unique_ptr(s)); + return s; + }, + R"DOC( Create a new scope. Returns: out (core._Scope): the created scope. )DOC", - py::return_value_policy::reference); + py::return_value_policy::reference); //! @note: Be careful! PyBind will return std::string as an unicode, not //! Python str. If you want a str object, you should cast them in Python. @@ -1917,11 +1937,12 @@ All parameter, weight, gradient are variables in Paddle. return std::make_tuple(ProgramDesc(pruned_desc), pruned_origin_block_id_map); }); - m.def("prune_backward", - [](const framework::ProgramDesc &program) { - return PruneBackward(program); - }, - R"DOC( + m.def( + "prune_backward", + [](const framework::ProgramDesc &program) { + return PruneBackward(program); + }, + R"DOC( Prune the backward part of a program, mostly called in program.clone(for_test=True). @@ -2788,8 +2809,8 @@ All parameter, weight, gradient are variables in Paddle. .def("outputs", [](const OperatorBase &op) -> std::map> { - return op.Outputs(); - }) + return op.Outputs(); + }) .def("output_vars", [](const OperatorBase &op) { return op.OutputVars(true); }) .def("inputs", [](const OperatorBase &op) { return op.Inputs(); }) @@ -2804,11 +2825,12 @@ All parameter, weight, gradient are variables in Paddle. py::class_>( m, "TrainerBase") - .def("get_worker_scope", - [](TrainerBase &self, int thread_id) -> Scope * { - return self.GetWorkerScope(thread_id); - }, - py::return_value_policy::reference) + .def( + "get_worker_scope", + [](TrainerBase &self, int thread_id) -> Scope * { + return self.GetWorkerScope(thread_id); + }, + py::return_value_policy::reference) .def("finalize", &TrainerBase::Finalize) .def("ResetDataset", &TrainerBase::ResetDataset); @@ -3008,21 +3030,23 @@ All parameter, weight, gradient are variables in Paddle. m.def("device_memory_stat_current_value", memory::DeviceMemoryStatCurrentValue); m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); - m.def("run_cmd", - [](const std::string &cmd, int time_out = -1, - int sleep_inter = -1) -> const std::string { - return paddle::framework::shell_get_command_output(cmd, time_out, - sleep_inter); - }, - py::arg("cmd"), py::arg("time_out") = -1, py::arg("sleep_inter") = -1); - m.def("shell_execute_cmd", - [](const std::string &cmd, int time_out = 0, int sleep_inter = 0, - bool redirect_stderr = false) -> std::vector { - return paddle::framework::shell_execute_cmd( - cmd, time_out, sleep_inter, redirect_stderr); - }, - py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0, - py::arg("redirect_stderr") = false); + m.def( + "run_cmd", + [](const std::string &cmd, int time_out = -1, + int sleep_inter = -1) -> const std::string { + return paddle::framework::shell_get_command_output(cmd, time_out, + sleep_inter); + }, + py::arg("cmd"), py::arg("time_out") = -1, py::arg("sleep_inter") = -1); + m.def( + "shell_execute_cmd", + [](const std::string &cmd, int time_out = 0, int sleep_inter = 0, + bool redirect_stderr = false) -> std::vector { + return paddle::framework::shell_execute_cmd(cmd, time_out, sleep_inter, + redirect_stderr); + }, + py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0, + py::arg("redirect_stderr") = false); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { @@ -3090,9 +3114,10 @@ All parameter, weight, gradient are variables in Paddle. pylodtensorarray .def("__init__", [](LoDTensorArray &instance) { new (&instance) LoDTensorArray(); }) - .def("__getitem__", - [](LoDTensorArray &self, size_t i) { return &self.at(i); }, - py::return_value_policy::reference) + .def( + "__getitem__", + [](LoDTensorArray &self, size_t i) { return &self.at(i); }, + py::return_value_policy::reference) .def("__len__", [](LoDTensorArray &self) { return self.size(); }) .def("__setitem__", [](LoDTensorArray &self, size_t i, const LoDTensor &t) { @@ -3103,13 +3128,14 @@ All parameter, weight, gradient are variables in Paddle. self[i].ShareDataWith(t); self[i].set_lod(t.lod()); }) - .def("append", - [](LoDTensorArray &self, const LoDTensor &t) { - self.emplace_back(); - self.back().ShareDataWith(t); - self.back().set_lod(t.lod()); - }, - py::arg("tensor"), R"DOC( + .def( + "append", + [](LoDTensorArray &self, const LoDTensor &t) { + self.emplace_back(); + self.back().ShareDataWith(t); + self.back().set_lod(t.lod()); + }, + py::arg("tensor"), R"DOC( Append a LoDensor to LoDTensorArray. Args: @@ -3129,89 +3155,94 @@ All parameter, weight, gradient are variables in Paddle. t.set(np.ndarray([5, 30]), fluid.CPUPlace()) arr.append(t) )DOC") - .def("_move_to_list", - [](LoDTensorArray &self) -> py::list { - py::list res(self.size()); - for (size_t i = 0; i < self.size(); ++i) { - res[i] = py::cast(std::move(self[i])); - } - self.clear(); - return res; - }, - py::return_value_policy::take_ownership); + .def( + "_move_to_list", + [](LoDTensorArray &self) -> py::list { + py::list res(self.size()); + for (size_t i = 0; i < self.size(); ++i) { + res[i] = py::cast(std::move(self[i])); + } + self.clear(); + return res; + }, + py::return_value_policy::take_ownership); py::class_(m, "FetchList", R"DOC( FetchList is a vector of boost::variant. )DOC") - .def("_move_to_list", - [](FetchList &self) -> py::list { - py::list res(self.size()); - for (size_t i = 0; i < self.size(); ++i) { - if (data_is_lod_tensor(self[i])) { - auto &data = BOOST_GET(LoDTensor, self[i]); - res[i] = py::cast(std::move(data)); - } else { - auto &data = BOOST_GET(LoDTensorArray, self[i]); - py::list tmp(data.size()); - for (size_t j = 0; j < data.size(); ++j) { - tmp[j] = py::cast(std::move(data[j])); - } - res[i] = std::move(tmp); - } - } - self.clear(); - return res; - }, - py::return_value_policy::take_ownership) + .def( + "_move_to_list", + [](FetchList &self) -> py::list { + py::list res(self.size()); + for (size_t i = 0; i < self.size(); ++i) { + if (data_is_lod_tensor(self[i])) { + auto &data = BOOST_GET(LoDTensor, self[i]); + res[i] = py::cast(std::move(data)); + } else { + auto &data = BOOST_GET(LoDTensorArray, self[i]); + py::list tmp(data.size()); + for (size_t j = 0; j < data.size(); ++j) { + tmp[j] = py::cast(std::move(data[j])); + } + res[i] = std::move(tmp); + } + } + self.clear(); + return res; + }, + py::return_value_policy::take_ownership) - .def("append", - [](FetchList &self, const LoDTensor &t) { - self.emplace_back(); - auto &lod_tensor = BOOST_GET(LoDTensor, self.back()); - lod_tensor.ShareDataWith(t); - lod_tensor.set_lod(t.lod()); - }, - py::arg("var")) - - .def("append", - [](FetchList &self, const LoDTensorArray &t) { - self.emplace_back(); - auto &lod_tensor_array = BOOST_GET(LoDTensorArray, self.back()); - for (size_t i = 0; i < t.size(); ++i) { - lod_tensor_array[i].ShareDataWith(t[i]); - lod_tensor_array[i].set_lod(t[i].lod()); - } - }, - py::arg("var")); + .def( + "append", + [](FetchList &self, const LoDTensor &t) { + self.emplace_back(); + auto &lod_tensor = BOOST_GET(LoDTensor, self.back()); + lod_tensor.ShareDataWith(t); + lod_tensor.set_lod(t.lod()); + }, + py::arg("var")) + + .def( + "append", + [](FetchList &self, const LoDTensorArray &t) { + self.emplace_back(); + auto &lod_tensor_array = BOOST_GET(LoDTensorArray, self.back()); + for (size_t i = 0; i < t.size(); ++i) { + lod_tensor_array[i].ShareDataWith(t[i]); + lod_tensor_array[i].set_lod(t[i].lod()); + } + }, + py::arg("var")); py::class_(m, "FetchUnmergedList", R"DOC( FetchUnmergedList is 2-D array of FetchType(boost::variant(LoDTensor, LoDTensorArray)). )DOC") - .def("_move_to_list", - [](FetchUnmergedList &self) -> py::list { - py::list res(self.size()); - for (size_t i = 0; i < self.size(); ++i) { - py::list tmp(self[i].size()); - for (size_t j = 0; j < self[i].size(); ++j) { - if (data_is_lod_tensor(self[i][j])) { - auto &var = BOOST_GET(LoDTensor, self[i][j]); - tmp[j] = py::cast(std::move(var)); - } else { - auto &var = BOOST_GET(LoDTensorArray, self[i][j]); - py::list tmp_array(var.size()); - for (size_t k = 0; k < var.size(); ++k) { - tmp_array[k] = std::move(var[k]); - } - tmp[j] = std::move(tmp_array); - } - } - res[i] = std::move(tmp); - self[i].clear(); - } - self.clear(); - return res; - }, - py::return_value_policy::take_ownership); + .def( + "_move_to_list", + [](FetchUnmergedList &self) -> py::list { + py::list res(self.size()); + for (size_t i = 0; i < self.size(); ++i) { + py::list tmp(self[i].size()); + for (size_t j = 0; j < self[i].size(); ++j) { + if (data_is_lod_tensor(self[i][j])) { + auto &var = BOOST_GET(LoDTensor, self[i][j]); + tmp[j] = py::cast(std::move(var)); + } else { + auto &var = BOOST_GET(LoDTensorArray, self[i][j]); + py::list tmp_array(var.size()); + for (size_t k = 0; k < var.size(); ++k) { + tmp_array[k] = std::move(var[k]); + } + tmp[j] = std::move(tmp_array); + } + } + res[i] = std::move(tmp); + self[i].clear(); + } + self.clear(); + return res; + }, + py::return_value_policy::take_ownership); m.def("op_support_gpu", OpSupportGPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -3225,11 +3256,12 @@ All parameter, weight, gradient are variables in Paddle. } platform::EmptyCache(); }); - m.def("get_device_properties", - [](int id) -> const gpuDeviceProp & { - return platform::GetDeviceProperties(id); - }, - py::return_value_policy::copy); + m.def( + "get_device_properties", + [](int id) -> const gpuDeviceProp & { + return platform::GetDeviceProperties(id); + }, + py::return_value_policy::copy); py::class_(m, "_gpuDeviceProperties") .def_property_readonly( @@ -3407,15 +3439,16 @@ All parameter, weight, gradient are variables in Paddle. profiler->Prepare(); }) .def("start", &paddle::platform::Profiler::Start) - .def("stop", - [](paddle::platform::Profiler *profiler) { - platform::DisableHostEventRecorder(); - auto result = profiler->Stop(); - framework::StaticGraphExecutorPerfStatistics( - result->GetNodeTrees()); - return result; - }, - py::return_value_policy::automatic_reference); + .def( + "stop", + [](paddle::platform::Profiler *profiler) { + platform::DisableHostEventRecorder(); + auto result = profiler->Stop(); + framework::StaticGraphExecutorPerfStatistics( + result->GetNodeTrees()); + return result; + }, + py::return_value_policy::automatic_reference); py::class_(m, "ProfilerOptions") .def(py::init<>()) @@ -3664,11 +3697,12 @@ All parameter, weight, gradient are variables in Paddle. }, R"DOC(This config that the this is distributed training with parameter server )DOC") - .def_property("_dry_run", - [](const ExecutionStrategy &self) { return self.dry_run_; }, - [](ExecutionStrategy &self, bool dry_run) { - self.dry_run_ = dry_run; - }); + .def_property( + "_dry_run", + [](const ExecutionStrategy &self) { return self.dry_run_; }, + [](ExecutionStrategy &self, bool dry_run) { + self.dry_run_ = dry_run; + }); exec_strategy.def_property( "use_experimental_executor", @@ -3916,11 +3950,12 @@ All parameter, weight, gradient are variables in Paddle. const std::vector &trainers_endpoints) { self.trainers_endpoints_ = trainers_endpoints; }) - .def_property("trainer_id", - [](const BuildStrategy &self) { return self.trainer_id_; }, - [](BuildStrategy &self, int trainer_id) { - self.trainer_id_ = trainer_id; - }) + .def_property( + "trainer_id", + [](const BuildStrategy &self) { return self.trainer_id_; }, + [](BuildStrategy &self, int trainer_id) { + self.trainer_id_ = trainer_id; + }) .def_property( "nccl_comm_num", [](const BuildStrategy &self) { return self.nccl_comm_num_; }, @@ -3933,20 +3968,22 @@ All parameter, weight, gradient are variables in Paddle. [](BuildStrategy &self, int bkcl_comm_num) { self.bkcl_comm_num_ = bkcl_comm_num; }) - .def_property("use_hierarchical_allreduce", - [](const BuildStrategy &self) { - return self.use_hierarchical_allreduce_; - }, - [](BuildStrategy &self, bool use) { - self.use_hierarchical_allreduce_ = use; - }) - .def_property("hierarchical_allreduce_inter_nranks", - [](const BuildStrategy &self) { - return self.hierarchical_allreduce_inter_nranks_; - }, - [](BuildStrategy &self, int nranks) { - self.hierarchical_allreduce_inter_nranks_ = nranks; - }) + .def_property( + "use_hierarchical_allreduce", + [](const BuildStrategy &self) { + return self.use_hierarchical_allreduce_; + }, + [](BuildStrategy &self, bool use) { + self.use_hierarchical_allreduce_ = use; + }) + .def_property( + "hierarchical_allreduce_inter_nranks", + [](const BuildStrategy &self) { + return self.hierarchical_allreduce_inter_nranks_; + }, + [](BuildStrategy &self, int nranks) { + self.hierarchical_allreduce_inter_nranks_ = nranks; + }) .def_property( "fuse_elewise_add_act_ops", @@ -4105,19 +4142,20 @@ All parameter, weight, gradient are variables in Paddle. build_strategy = static.BuildStrategy() build_strategy.fuse_relu_depthwise_conv = True )DOC") - .def_property("fuse_broadcast_ops", - [](const BuildStrategy &self) { - return self.fuse_broadcast_ops_ == true || - self.fuse_broadcast_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, " - "cannot be configured again.")); - self.fuse_broadcast_ops_ = b; - }, - R"DOC((bool, optional): fuse_broadcast_op indicates whether + .def_property( + "fuse_broadcast_ops", + [](const BuildStrategy &self) { + return self.fuse_broadcast_ops_ == true || + self.fuse_broadcast_ops_ == paddle::none; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, " + "cannot be configured again.")); + self.fuse_broadcast_ops_ = b; + }, + R"DOC((bool, optional): fuse_broadcast_op indicates whether to fuse the broadcast ops. Note that, in Reduce mode, fusing broadcast ops may make the program faster. Because fusing broadcast OP equals delaying the execution of all @@ -4135,18 +4173,19 @@ All parameter, weight, gradient are variables in Paddle. build_strategy = static.BuildStrategy() build_strategy.fuse_broadcast_ops = True )DOC") - .def_property("fuse_all_optimizer_ops", - [](const BuildStrategy &self) { - return self.fuse_all_optimizer_ops_ == true || - self.fuse_all_optimizer_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, " - "cannot be configured again.")); - self.fuse_all_optimizer_ops_ = b; - }) + .def_property( + "fuse_all_optimizer_ops", + [](const BuildStrategy &self) { + return self.fuse_all_optimizer_ops_ == true || + self.fuse_all_optimizer_ops_ == paddle::none; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, " + "cannot be configured again.")); + self.fuse_all_optimizer_ops_ = b; + }) .def_property( "sync_batch_norm", [](const BuildStrategy &self) { return self.sync_batch_norm_; }, @@ -4229,9 +4268,10 @@ All parameter, weight, gradient are variables in Paddle. self.is_distribution_ = b; #endif }) - .def_property("async_mode", - [](const BuildStrategy &self) { return self.async_mode_; }, - [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) + .def_property( + "async_mode", + [](const BuildStrategy &self) { return self.async_mode_; }, + [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) .def_property( "enable_inplace", [](const BuildStrategy &self) { return self.enable_inplace_; }, @@ -4247,13 +4287,14 @@ All parameter, weight, gradient are variables in Paddle. self.fuse_all_reduce_ops_ == paddle::none; }, [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) - .def_property("enable_backward_optimizer_op_deps", - [](const BuildStrategy &self) { - return self.enable_backward_optimizer_op_deps_; - }, - [](BuildStrategy &self, bool b) { - self.enable_backward_optimizer_op_deps_ = b; - }) + .def_property( + "enable_backward_optimizer_op_deps", + [](const BuildStrategy &self) { + return self.enable_backward_optimizer_op_deps_; + }, + [](BuildStrategy &self, bool b) { + self.enable_backward_optimizer_op_deps_ = b; + }) .def_property( "cache_runtime_context", [](const BuildStrategy &self) { return self.cache_runtime_context_; }, @@ -4273,24 +4314,26 @@ All parameter, weight, gradient are variables in Paddle. [](BuildStrategy &self, bool fix_op_run_order) { self.fix_op_run_order_ = fix_op_run_order; }) - .def_property("allow_cuda_graph_capture", - [](const BuildStrategy &self) { - return self.allow_cuda_graph_capture_; - }, - [](BuildStrategy &self, bool allow_cuda_graph_capture) { - self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; - }) + .def_property( + "allow_cuda_graph_capture", + [](const BuildStrategy &self) { + return self.allow_cuda_graph_capture_; + }, + [](BuildStrategy &self, bool allow_cuda_graph_capture) { + self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; + }) .def("_copy", [](const BuildStrategy &self) { auto new_bs = self; new_bs.ClearFinalized(); return new_bs; }) - .def("_finalize_strategy_and_create_passes", - [](BuildStrategy &self) -> std::shared_ptr { - return self.CreatePassesFromStrategy(true); - }, - R"DOC(Allow user to customized passes. Normally model-specific + .def( + "_finalize_strategy_and_create_passes", + [](BuildStrategy &self) -> std::shared_ptr { + return self.CreatePassesFromStrategy(true); + }, + R"DOC(Allow user to customized passes. Normally model-specific optimization passes should be defined in this way. BuildStrategy cannot be updated after being finalized.)DOC"); @@ -4308,11 +4351,12 @@ All parameter, weight, gradient are variables in Paddle. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. We can only return Scope* // one by one and mark them as reference. - .def("local_scopes", - [](ParallelExecutor &self) -> std::vector * { - return &self.GetLocalScopes(); - }, - py::return_value_policy::reference) + .def( + "local_scopes", + [](ParallelExecutor &self) -> std::vector * { + return &self.GetLocalScopes(); + }, + py::return_value_policy::reference) .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes) .def("_need_create_local_exe_scopes", &ParallelExecutor::NeedCreateLocalExeScope) @@ -4344,12 +4388,13 @@ All parameter, weight, gradient are variables in Paddle. std::unique_ptr>( m, "IpuBackend") // manage IpuBackend in C++ - .def("get_instance", - []() { - return std::unique_ptr( - platform::ipu::IpuBackend::GetInstance()); - }, - py::return_value_policy::reference) + .def( + "get_instance", + []() { + return std::unique_ptr( + platform::ipu::IpuBackend::GetInstance()); + }, + py::return_value_policy::reference) .def("weights_to_host", &platform::ipu::IpuBackend::WeightsToHost) .def("detach", &platform::ipu::IpuBackend::Detach) .def("reset", &platform::ipu::IpuBackend::Reset) diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index e0aab0dd06ecb..3e779ba41c0eb 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -13,12 +13,14 @@ // limitations under the License. #include "paddle/fluid/pybind/reader_py.h" + #include #include #include #include #include #include + #include "Python.h" #include "boost/optional.hpp" #include "gflags/gflags.h" @@ -337,32 +339,33 @@ void BindMultiDeviceReader(py::module *module, const char *reader_name) { py::call_guard()) .def("read_next_list", &ReaderType::ReadNextList, py::call_guard()) - .def("read_next_var_list", - [](ReaderType &self) { - auto result_list = self.ReadNextList(); - auto &tensor_list = result_list[0]; - std::vector> var_list; - var_list.reserve(tensor_list.size()); - auto func = [](framework::LoDTensor &lod_tensor) { - std::string act_name = - imperative::GetCurrentTracer()->GenerateUniqueName( - "generated_var"); - auto new_var = std::make_shared(act_name); - new_var->SetPersistable(false); - new_var->SetType(framework::proto::VarType::LOD_TENSOR); - new_var->SetDataType( - framework::TransToProtoVarType(lod_tensor.dtype())); - auto *tensor = - new_var->MutableVar()->GetMutable(); - *tensor = std::move(lod_tensor); - return new_var; - }; - for (auto &tensor : tensor_list) { - var_list.emplace_back(func(tensor)); - } - return var_list; - }, - py::call_guard()) + .def( + "read_next_var_list", + [](ReaderType &self) { + auto result_list = self.ReadNextList(); + auto &tensor_list = result_list[0]; + std::vector> var_list; + var_list.reserve(tensor_list.size()); + auto func = [](framework::LoDTensor &lod_tensor) { + std::string act_name = + imperative::GetCurrentTracer()->GenerateUniqueName( + "generated_var"); + auto new_var = std::make_shared(act_name); + new_var->SetPersistable(false); + new_var->SetType(framework::proto::VarType::LOD_TENSOR); + new_var->SetDataType( + framework::TransToProtoVarType(lod_tensor.dtype())); + auto *tensor = + new_var->MutableVar()->GetMutable(); + *tensor = std::move(lod_tensor); + return new_var; + }; + for (auto &tensor : tensor_list) { + var_list.emplace_back(func(tensor)); + } + return var_list; + }, + py::call_guard()) .def("reset", &ReaderType::Reset, py::call_guard()) .def("shutdown", &ReaderType::Shutdown, @@ -372,34 +375,35 @@ void BindMultiDeviceReader(py::module *module, const char *reader_name) { void BindReader(py::module *module) { auto &m = *module; - m.def("diff_tensor_shape", [](const framework::LoDTensor &tensor, - const framework::VarDesc &var_desc, - size_t num_places) -> py::object { - auto diff = DiffTensorShapeWithVarDesc(tensor, var_desc, num_places); - if (diff) { - return py::cast(std::move(diff.get())); - } else { - return py::cast(nullptr); - } - }); - - m.def("init_lod_tensor_blocking_queue", - [](framework::Variable &var, size_t capacity, - bool is_ordered) -> py::object { - VLOG(1) << "init_lod_tensor_blocking_queue"; - if (is_ordered) { - auto *holder = var.GetMutable< - reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder>(); - holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); - return py::cast(holder->GetQueue()); - } else { - auto *holder = - var.GetMutable(); - holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); - return py::cast(holder->GetQueue()); - } - }, - py::return_value_policy::copy); + m.def( + "diff_tensor_shape", + [](const framework::LoDTensor &tensor, const framework::VarDesc &var_desc, + size_t num_places) -> py::object { + auto diff = DiffTensorShapeWithVarDesc(tensor, var_desc, num_places); + if (diff) { + return py::cast(std::move(diff.get())); + } else { + return py::cast(nullptr); + } + }); + + m.def( + "init_lod_tensor_blocking_queue", + [](framework::Variable &var, size_t capacity, + bool is_ordered) -> py::object { + VLOG(1) << "init_lod_tensor_blocking_queue"; + if (is_ordered) { + auto *holder = var.GetMutable< + reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder>(); + holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); + return py::cast(holder->GetQueue()); + } else { + auto *holder = var.GetMutable(); + holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); + return py::cast(holder->GetQueue()); + } + }, + py::return_value_policy::copy); py::class_(m, "Reader", "") .def("start", &framework::ReaderHolder::Start) @@ -408,12 +412,13 @@ void BindReader(py::module *module) { py::class_>( m, "LoDTensorBlockingQueue", "") - .def("push", - [](reader::LoDTensorBlockingQueue &self, - const std::vector &lod_tensor_vec) { - return self.Push(lod_tensor_vec); - }, - py::call_guard()) + .def( + "push", + [](reader::LoDTensorBlockingQueue &self, + const std::vector &lod_tensor_vec) { + return self.Push(lod_tensor_vec); + }, + py::call_guard()) .def("size", &reader::LoDTensorBlockingQueue::Size) .def("capacity", &reader::LoDTensorBlockingQueue::Cap) .def("close", &reader::LoDTensorBlockingQueue::Close) @@ -424,12 +429,13 @@ void BindReader(py::module *module) { py::class_>( m, "OrderedMultiDeviceLoDTensorBlockingQueue", "") - .def("push", - [](reader::OrderedMultiDeviceLoDTensorBlockingQueue &self, - const std::vector &lod_tensor_vec) { - return self.Push(lod_tensor_vec); - }, - py::call_guard()) + .def( + "push", + [](reader::OrderedMultiDeviceLoDTensorBlockingQueue &self, + const std::vector &lod_tensor_vec) { + return self.Push(lod_tensor_vec); + }, + py::call_guard()) .def("size", &reader::OrderedMultiDeviceLoDTensorBlockingQueue::Size) .def("capacity", &reader::OrderedMultiDeviceLoDTensorBlockingQueue::Cap) .def("close", &reader::OrderedMultiDeviceLoDTensorBlockingQueue::Close) @@ -444,19 +450,20 @@ void BindReader(py::module *module) { BindMultiDeviceReader( module, "OrderedMultiDeviceFeedReader"); - m.def("create_py_reader", - [](const std::shared_ptr &queue, - const std::vector &names, - const std::vector> &shapes, - const std::vector &dtypes, - const std::vector &need_check_feed, - const std::vector &dst_places, - bool use_double_buffer, bool drop_last, bool pin_memory) { - return new MultiDeviceFeedReader( - queue, names, shapes, dtypes, need_check_feed, dst_places, - use_double_buffer, drop_last, pin_memory); - }, - py::return_value_policy::take_ownership); + m.def( + "create_py_reader", + [](const std::shared_ptr &queue, + const std::vector &names, + const std::vector> &shapes, + const std::vector &dtypes, + const std::vector &need_check_feed, + const std::vector &dst_places, bool use_double_buffer, + bool drop_last, bool pin_memory) { + return new MultiDeviceFeedReader( + queue, names, shapes, dtypes, need_check_feed, dst_places, + use_double_buffer, drop_last, pin_memory); + }, + py::return_value_policy::take_ownership); m.def( "create_py_reader", diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index add332abd30ea..109f3e5705b60 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/scope_guard.h" #include "paddle/fluid/operators/utils.h" diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 63b36bd917390..ed7ce64032b0e 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -15,12 +15,14 @@ limitations under the License. */ #pragma once #include + #include #include #include #include #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/memory/memcpy.h" diff --git a/paddle/fluid/pybind/uva_utils.h b/paddle/fluid/pybind/uva_utils.h index 94f55769b7356..3ea3d7ee1a742 100644 --- a/paddle/fluid/pybind/uva_utils.h +++ b/paddle/fluid/pybind/uva_utils.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/operators/utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h index 45fe89e8b5b14..d161b2a912fca 100644 --- a/paddle/fluid/string/pretty_log.h +++ b/paddle/fluid/string/pretty_log.h @@ -17,6 +17,6 @@ #include #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/utils/string/pretty_log.h" diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index e5f224bf6ad99..3846acbde4819 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -1,10 +1,10 @@ -if (NOT WITH_INFRT) - return() +if(NOT WITH_INFRT) + return() endif() -option(INFRT_WITH_PHI "Compile INFRT with PHI" ON) -option(INFRT_WITH_GPU "Compile INFRT with GPU" OFF) -option(INFRT_WITH_TRT "Compile INFRT with TensorRT" OFF) +option(INFRT_WITH_PHI "Compile INFRT with PHI" ON) +option(INFRT_WITH_GPU "Compile INFRT with GPU" OFF) +option(INFRT_WITH_TRT "Compile INFRT with TensorRT" OFF) #TODO(xiaowei) remove fluid include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) @@ -13,13 +13,13 @@ if(WITH_GPU) set(INFRT_WITH_GPU ON) endif() -if (INFRT_WITH_PHI) +if(INFRT_WITH_PHI) add_definitions("-DINFRT_WITH_PHI") # TODO(wilber): Now Infrt gpu/trt depends on phi's components, Modify compile dependency options later. - if (INFRT_WITH_GPU) + if(INFRT_WITH_GPU) add_definitions("-DINFRT_WITH_GPU") - if (INFRT_WITH_TRT) + if(INFRT_WITH_TRT) add_definitions("-DINFRT_WITH_TRT") endif() endif() @@ -32,8 +32,8 @@ foreach(flag ${INFRT_FLAGS}) safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) endforeach() -set(INFRT_SOURCE_DIR "${PADDLE_SOURCE_DIR}/paddle/infrt" ) -set(INFRT_BINARY_DIR "${PADDLE_BINARY_DIR}/paddle/infrt" ) +set(INFRT_SOURCE_DIR "${PADDLE_SOURCE_DIR}/paddle/infrt") +set(INFRT_BINARY_DIR "${PADDLE_BINARY_DIR}/paddle/infrt") set(INFRT_TEST_TARGETS CACHE INTERNAL "") include(infrt_lib) @@ -41,21 +41,29 @@ set(infrt_src CACHE INTERNAL "" FORCE) # Gather headers for library publish. function(core_gather_headers) - file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) - - foreach(header ${includes}) - set(core_includes "${core_includes};${header}" CACHE INTERNAL "") - endforeach() + file( + GLOB includes + LIST_DIRECTORIES false + RELATIVE ${CMAKE_SOURCE_DIR} + *.h) + + foreach(header ${includes}) + set(core_includes + "${core_includes};${header}" + CACHE INTERNAL "") + endforeach() endfunction() function(gather_srcs SRC_GROUP) - set(options) - set(oneValueArgs) - set(multiValueArgs "SRCS") - cmake_parse_arguments(prefix "" "" "${multiValueArgs}" ${ARGN}) - foreach(cpp ${prefix_SRCS}) - set(${SRC_GROUP} "${${SRC_GROUP}};${CMAKE_CURRENT_SOURCE_DIR}/${cpp}" CACHE INTERNAL "") - endforeach() + set(options) + set(oneValueArgs) + set(multiValueArgs "SRCS") + cmake_parse_arguments(prefix "" "" "${multiValueArgs}" ${ARGN}) + foreach(cpp ${prefix_SRCS}) + set(${SRC_GROUP} + "${${SRC_GROUP}};${CMAKE_CURRENT_SOURCE_DIR}/${cpp}" + CACHE INTERNAL "") + endforeach() endfunction() # This method is similar to the global cc_test, but discard the huge amount default dependencies those are @@ -65,28 +73,36 @@ function(cc_test_tiny TARGET_NAME) set(options SERIAL) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS) - cmake_parse_arguments(cc_test_tiny "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cmake_parse_arguments(cc_test_tiny "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_tiny_SRCS}) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - target_link_libraries(${TARGET_NAME} ${cc_test_tiny_DEPS} ${os_dependency_modules} infrt_gtest_main gtest ) - add_dependencies(${TARGET_NAME} ${cc_test_tiny_DEPS} infrt_gtest_main gtest extern_gtest) + target_link_libraries(${TARGET_NAME} ${cc_test_tiny_DEPS} + ${os_dependency_modules} infrt_gtest_main gtest) + add_dependencies(${TARGET_NAME} ${cc_test_tiny_DEPS} infrt_gtest_main gtest + extern_gtest) - add_test(NAME ${TARGET_NAME} + add_test( + NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} "${cc_test_tiny_ARGS}" - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - if (${cc_test_tiny_SERIAL}) + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + if(${cc_test_tiny_SERIAL}) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) endif() - set(INFRT_TEST_TARGETS ${INFRT_TEST_TARGETS} ${TARGET_NAME} CACHE INTERNAL "") + set(INFRT_TEST_TARGETS + ${INFRT_TEST_TARGETS} ${TARGET_NAME} + CACHE INTERNAL "") endif() endfunction() -if (WITH_TESTING) - cc_library(infrt_gtest_main SRCS gtest_main.cc DEPS gtest glog gflags) +if(WITH_TESTING) + cc_library( + infrt_gtest_main + SRCS gtest_main.cc + DEPS gtest glog gflags) endif() - add_subdirectory(api) add_subdirectory(backends) add_subdirectory(common) @@ -99,27 +115,24 @@ add_subdirectory(external_kernels) add_subdirectory(paddle) add_subdirectory(tests) - # MLIR td file generations -set(infrt_mlir_incs - basic_kernels_inc - test_kernels_inc - tensor_shape_inc - dense_tensor_inc - pd_extra_ops_inc - trt_ops_inc - ) - -if (INFRT_WITH_PHI) - set(phi_libs phi) - set(infrt_mlir_incs ${infrt_mlir_incs} - MLIRinfrt_phi_tensorIncGen - MLIRinfrt_phi_baseIncGen - ) +set(infrt_mlir_incs basic_kernels_inc test_kernels_inc tensor_shape_inc + dense_tensor_inc pd_extra_ops_inc trt_ops_inc) + +if(INFRT_WITH_PHI) + set(phi_libs phi) + set(infrt_mlir_incs ${infrt_mlir_incs} MLIRinfrt_phi_tensorIncGen + MLIRinfrt_phi_baseIncGen) endif() -cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive) -cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto) +cc_library( + infrt SHARED + SRCS ${infrt_src} + DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive) +cc_library( + infrt_static + SRCS ${infrt_src} + DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto) add_dependencies(infrt ${infrt_mlir_incs} mlir-headers) add_custom_target(test_infrt_exec DEPENDS ${INFRT_TEST_TARGETS}) diff --git a/paddle/infrt/api/CMakeLists.txt b/paddle/infrt/api/CMakeLists.txt index 6d4604edee6a0..2d88af7d5b5c8 100644 --- a/paddle/infrt/api/CMakeLists.txt +++ b/paddle/infrt/api/CMakeLists.txt @@ -1,9 +1,8 @@ core_gather_headers() -gather_srcs(infrt_src SRCS - infrt_api.cc - ) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/infrt_api_test.cc.in ${CMAKE_CURRENT_SOURCE_DIR}/infrt_api_test.cc) +gather_srcs(infrt_src SRCS infrt_api.cc) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/infrt_api_test.cc.in + ${CMAKE_CURRENT_SOURCE_DIR}/infrt_api_test.cc) # Disable temporarily for the external-kernel's mkldnn is outdate cc_test_tiny(test_infrt_api SRCS infrt_api_test.cc DEPS infrt ${MLIR_IR_LIBS}) diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index f0bf46567a5bf..2f4bbd5df352c 100644 --- a/paddle/infrt/api/infrt_api.cc +++ b/paddle/infrt/api/infrt_api.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// clang-format off #include "paddle/infrt/api/infrt_api.h" #include @@ -61,6 +62,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h" #endif +// clang-format on using namespace infrt::host_context; // NOLINT using namespace infrt::tensor; // NOLINT diff --git a/paddle/infrt/backends/CMakeLists.txt b/paddle/infrt/backends/CMakeLists.txt index b639f89292568..606fec5d92dae 100644 --- a/paddle/infrt/backends/CMakeLists.txt +++ b/paddle/infrt/backends/CMakeLists.txt @@ -1,3 +1,5 @@ -if (INFRT_WITH_PHI AND WITH_GPU AND WITH_TENSORRT) +if(INFRT_WITH_PHI + AND WITH_GPU + AND WITH_TENSORRT) add_subdirectory(tensorrt) endif() diff --git a/paddle/infrt/backends/host/phi_context.h b/paddle/infrt/backends/host/phi_context.h index 2af1fab100821..880d1f03d8766 100644 --- a/paddle/infrt/backends/host/phi_context.h +++ b/paddle/infrt/backends/host/phi_context.h @@ -35,12 +35,12 @@ class CpuPhiContext : public ::phi::CPUContext { class GpuPhiContext : public ::phi::GPUContext { public: using Base = ::phi::GPUContext; - using ::phi::GPUContext::SetStream; - using ::phi::GPUContext::SetEigenDevice; using ::phi::GPUContext::SetBlasHandle; using ::phi::GPUContext::SetDnnHandle; + using ::phi::GPUContext::SetEigenDevice; using ::phi::GPUContext::SetSolverHandle; using ::phi::GPUContext::SetSparseHandle; + using ::phi::GPUContext::SetStream; }; } // namespace backends diff --git a/paddle/infrt/backends/tensorrt/CMakeLists.txt b/paddle/infrt/backends/tensorrt/CMakeLists.txt index 672515ea4b7f8..9a9db6b737c10 100644 --- a/paddle/infrt/backends/tensorrt/CMakeLists.txt +++ b/paddle/infrt/backends/tensorrt/CMakeLists.txt @@ -4,4 +4,11 @@ core_gather_headers() gather_srcs(infrt_src SRCS trt_engine.cc) -cc_test_tiny(test_infrt_trt SRCS test_trt_engine.cc DEPS infrt phi_dynload_cuda tensorrt_converter) +cc_test_tiny( + test_infrt_trt + SRCS + test_trt_engine.cc + DEPS + infrt + phi_dynload_cuda + tensorrt_converter) diff --git a/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.cu b/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.cu index 5a53777c8e30f..f3e2fe35074a6 100644 --- a/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.cu @@ -199,8 +199,8 @@ bool PoolPlugin::isOutputBroadcastAcrossBatch(int32_t outputIndex, return false; } -bool PoolPlugin::canBroadcastInputAcrossBatch(int32_t inputIndex) const - noexcept { +bool PoolPlugin::canBroadcastInputAcrossBatch( + int32_t inputIndex) const noexcept { return false; } diff --git a/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.h b/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.h index 0da1d15845330..34189f95438bf 100644 --- a/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.h +++ b/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.h @@ -114,10 +114,10 @@ class PoolPlugin : public nvinfer1::IPluginV2IOExt { char const* getPluginNamespace() const noexcept override; // IPluginV2Ext methods - nvinfer1::DataType getOutputDataType(int32_t index, - nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const - noexcept override; + nvinfer1::DataType getOutputDataType( + int32_t index, + nvinfer1::DataType const* inputTypes, + int32_t nbInputs) const noexcept override; bool isOutputBroadcastAcrossBatch(int32_t outputIndex, bool const* inputIsBroadcasted, int32_t nbInputs) const noexcept override; diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc index 89dd3b0dc7abf..7e081362f9c62 100644 --- a/paddle/infrt/backends/tensorrt/test_trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include - #include #include #include #include #include +#include + #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc index a2d4954618986..a539078e4af4d 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/trt_engine.cc @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/phi/backends/dynload/tensorrt.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h index 41d11a7111709..44f36a84cb5dc 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.h +++ b/paddle/infrt/backends/tensorrt/trt_engine.h @@ -17,6 +17,7 @@ #include #include + #include "paddle/infrt/backends/tensorrt/trt_options.h" #include "paddle/infrt/backends/tensorrt/trt_utils.h" #include "paddle/phi/backends/dynload/tensorrt.h" diff --git a/paddle/infrt/backends/tensorrt/trt_options.h b/paddle/infrt/backends/tensorrt/trt_options.h index d5190f5e6220e..b4e36da2058ed 100644 --- a/paddle/infrt/backends/tensorrt/trt_options.h +++ b/paddle/infrt/backends/tensorrt/trt_options.h @@ -15,12 +15,12 @@ #pragma once +#include + #include #include #include -#include - namespace infrt { namespace backends { namespace tensorrt { diff --git a/paddle/infrt/common/CMakeLists.txt b/paddle/infrt/common/CMakeLists.txt index 931e3e42307eb..c77f099aef4a4 100644 --- a/paddle/infrt/common/CMakeLists.txt +++ b/paddle/infrt/common/CMakeLists.txt @@ -1,14 +1,17 @@ core_gather_headers() -set(core_includes "${core_includes};infrt/common/dtype.def" CACHE INTERNAL "") +set(core_includes + "${core_includes};infrt/common/dtype.def" + CACHE INTERNAL "") -gather_srcs(infrt_src SRCS - dtype.cc - global.cc - target.cc - type.cc - shared.cc - object.cc - string.cc - buffer.cc - memory.cc - ) +gather_srcs( + infrt_src + SRCS + dtype.cc + global.cc + target.cc + type.cc + shared.cc + object.cc + string.cc + buffer.cc + memory.cc) diff --git a/paddle/infrt/common/global.h b/paddle/infrt/common/global.h index e6586cb3a3c60..2d7735d525244 100644 --- a/paddle/infrt/common/global.h +++ b/paddle/infrt/common/global.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/infrt/tensor/dense_host_tensor.h" namespace infrt { diff --git a/paddle/infrt/common/memory.h b/paddle/infrt/common/memory.h index 678529b8b785c..643b21477615d 100644 --- a/paddle/infrt/common/memory.h +++ b/paddle/infrt/common/memory.h @@ -15,9 +15,9 @@ #pragma once #include -#include #include +#include #include "paddle/infrt/common/macros.h" #include "paddle/infrt/common/target.h" diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index cf3906c32e559..33206dbd56b6e 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -1,13 +1,14 @@ core_gather_headers() -gather_srcs(infrt_src SRCS - dialect.cc - init_dialects.cc - tensor_shape.cc - dense_tensor.cc - mlir_loader.cc - diagnostic_utils.cc - ) +gather_srcs( + infrt_src + SRCS + dialect.cc + init_dialects.cc + tensor_shape.cc + dense_tensor.cc + mlir_loader.cc + diagnostic_utils.cc) mlir_tablegen_on(tensor_shape DIALECT ts) mlir_tablegen_on(dense_tensor DIALECT dt) @@ -18,12 +19,13 @@ target_link_libraries(infrtopt infrt) add_executable(print-ir print_ir.cc) target_link_libraries(print-ir infrt ${mlir_libs}) -cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt + ${MLIR_IR_LIBS}) add_subdirectory(infrt) add_subdirectory(pd) add_subdirectory(tensorrt) -if (INFRT_WITH_PHI) - add_subdirectory(phi) +if(INFRT_WITH_PHI) + add_subdirectory(phi) endif() diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h index 7fbd1e8a4efe1..8dec818a80a27 100644 --- a/paddle/infrt/dialect/dense_tensor.h +++ b/paddle/infrt/dialect/dense_tensor.h @@ -13,6 +13,8 @@ // limitations under the License. #pragma once + +// clang-format off #include #include #include @@ -25,3 +27,4 @@ #define GET_OP_CLASSES #include "paddle/infrt/dialect/dense_tensor.hpp.inc" +// clang-format on diff --git a/paddle/infrt/dialect/diagnostic_utils.cc b/paddle/infrt/dialect/diagnostic_utils.cc index 4151001067ecb..8785ce69b8e8f 100644 --- a/paddle/infrt/dialect/diagnostic_utils.cc +++ b/paddle/infrt/dialect/diagnostic_utils.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/dialect/diagnostic_utils.h" #include + #include namespace infrt { diff --git a/paddle/infrt/dialect/infrt/common/CMakeLists.txt b/paddle/infrt/dialect/infrt/common/CMakeLists.txt index f693c82b5060e..593030be0a5bd 100644 --- a/paddle/infrt/dialect/infrt/common/CMakeLists.txt +++ b/paddle/infrt/dialect/infrt/common/CMakeLists.txt @@ -1,6 +1,3 @@ core_gather_headers() -gather_srcs(infrt_src SRCS - types.cc - utils.cc - ) +gather_srcs(infrt_src SRCS types.cc utils.cc) diff --git a/paddle/infrt/dialect/infrt/ir/CMakeLists.txt b/paddle/infrt/dialect/infrt/ir/CMakeLists.txt index 7c009bdb267e6..103c603e765c3 100644 --- a/paddle/infrt/dialect/infrt/ir/CMakeLists.txt +++ b/paddle/infrt/dialect/infrt/ir/CMakeLists.txt @@ -1,10 +1,6 @@ core_gather_headers() -gather_srcs(infrt_src SRCS - infrt_dialect.cc - basic_kernels.cc - test_kernels.cc - ) +gather_srcs(infrt_src SRCS infrt_dialect.cc basic_kernels.cc test_kernels.cc) add_mlir_dialect(infrt_ops infrt) diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index c4f20cb4d35c5..0e3a10270cde2 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// clang-format off #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include @@ -60,6 +61,7 @@ void InfrtDialect::initialize() { #include "paddle/infrt/dialect/infrt/ir/test_kernels.cpp.inc" >(); } +// clang-format on /// Parse a type registered to this dialect. mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.h b/paddle/infrt/dialect/infrt/ir/infrt_dialect.h index e2e9b9348eb46..5a7c45b320547 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.h +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.h @@ -23,8 +23,8 @@ #include #include #include -#include "paddle/infrt/dialect/infrt/common/types.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/dialect/infrt/ir/infrt_opsDialect.h.inc" #define GET_TYPEDEF_CLASSES #include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.h.inc" diff --git a/paddle/infrt/dialect/infrt/pass/CMakeLists.txt b/paddle/infrt/dialect/infrt/pass/CMakeLists.txt index ab06c00d143a7..7fa0ee1c716c9 100644 --- a/paddle/infrt/dialect/infrt/pass/CMakeLists.txt +++ b/paddle/infrt/dialect/infrt/pass/CMakeLists.txt @@ -1,8 +1,5 @@ core_gather_headers() -gather_srcs(infrt_src SRCS - infrt_op_fuse_pass.cc - infrt_weights_unfold_pass.cc - ) +gather_srcs(infrt_src SRCS infrt_op_fuse_pass.cc infrt_weights_unfold_pass.cc) mlir_add_rewriter(infrt_op_fuse) diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc index 63be5ca909563..309e0f8b94040 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" #include + #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace { diff --git a/paddle/infrt/dialect/init_dialects.cc b/paddle/infrt/dialect/init_dialects.cc index 8da34bd404be6..c204f9ea62669 100644 --- a/paddle/infrt/dialect/init_dialects.cc +++ b/paddle/infrt/dialect/init_dialects.cc @@ -19,12 +19,10 @@ #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" - #include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/phi/ir/phi_kernels.h" - #include "paddle/infrt/dialect/tensor_shape.h" #include "paddle/infrt/dialect/tensorrt/trt_ops.h" diff --git a/paddle/infrt/dialect/mlir_loader.cc b/paddle/infrt/dialect/mlir_loader.cc index 19b8cba12df86..ab533a25c4173 100644 --- a/paddle/infrt/dialect/mlir_loader.cc +++ b/paddle/infrt/dialect/mlir_loader.cc @@ -20,10 +20,10 @@ #include #include #include -#include #include #include +#include #include #include diff --git a/paddle/infrt/dialect/mlir_loader.h b/paddle/infrt/dialect/mlir_loader.h index 5e50ad9e5a271..b4faba8068e44 100644 --- a/paddle/infrt/dialect/mlir_loader.h +++ b/paddle/infrt/dialect/mlir_loader.h @@ -16,9 +16,9 @@ #include #include -#include #include +#include namespace infrt { namespace dialect { diff --git a/paddle/infrt/dialect/opt.cc b/paddle/infrt/dialect/opt.cc index 2006530958f0b..e57666ffca080 100644 --- a/paddle/infrt/dialect/opt.cc +++ b/paddle/infrt/dialect/opt.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/infrt/dialect/init_dialects.h" int main(int argc, char **argv) { diff --git a/paddle/infrt/dialect/pd/common/CMakeLists.txt b/paddle/infrt/dialect/pd/common/CMakeLists.txt index ee1b0d4c30deb..d253a84755713 100644 --- a/paddle/infrt/dialect/pd/common/CMakeLists.txt +++ b/paddle/infrt/dialect/pd/common/CMakeLists.txt @@ -1,4 +1,3 @@ core_gather_headers() -gather_srcs(infrt_src SRCS - ) +gather_srcs(infrt_src SRCS) diff --git a/paddle/infrt/dialect/pd/ir/CMakeLists.txt b/paddle/infrt/dialect/pd/ir/CMakeLists.txt index 8aacfc97623c0..7c1c99a97a02a 100644 --- a/paddle/infrt/dialect/pd/ir/CMakeLists.txt +++ b/paddle/infrt/dialect/pd/ir/CMakeLists.txt @@ -1,7 +1,5 @@ core_gather_headers() -gather_srcs(infrt_src SRCS - pd_ops.cc - ) +gather_srcs(infrt_src SRCS pd_ops.cc) add_mlir_dialect(pd_ops pd) mlir_tablegen_on(pd_extra_ops) diff --git a/paddle/infrt/dialect/pd/pass/CMakeLists.txt b/paddle/infrt/dialect/pd/pass/CMakeLists.txt index 827df597b76e2..be87052794ebc 100644 --- a/paddle/infrt/dialect/pd/pass/CMakeLists.txt +++ b/paddle/infrt/dialect/pd/pass/CMakeLists.txt @@ -1,8 +1,5 @@ - core_gather_headers() -gather_srcs(infrt_src SRCS - pd_op_fuse_pass.cc - ) +gather_srcs(infrt_src SRCS pd_op_fuse_pass.cc) mlir_add_rewriter(pd_op_fuse) diff --git a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc index 8bdf957db27d8..c9247abe695ae 100644 --- a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h" // NOLINT #include + #include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace { diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt index 67f6bb8a2d7bb..f07c6f70fb609 100644 --- a/paddle/infrt/dialect/phi/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/CMakeLists.txt @@ -1,5 +1,5 @@ -if (NOT INFRT_WITH_PHI) - return() +if(NOT INFRT_WITH_PHI) + return() endif() add_subdirectory(ir) @@ -8,5 +8,4 @@ add_subdirectory(pass) add_executable(phi-exec phi_exec.cc) target_link_libraries(phi-exec infrt) -gather_srcs(infrt_src SRCS - data_type.cc) +gather_srcs(infrt_src SRCS data_type.cc) diff --git a/paddle/infrt/dialect/phi/ir/CMakeLists.txt b/paddle/infrt/dialect/phi/ir/CMakeLists.txt index 0497b9832118f..e038da564be1a 100644 --- a/paddle/infrt/dialect/phi/ir/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/ir/CMakeLists.txt @@ -6,7 +6,4 @@ add_mlir_dialect(phi_gpu_kernels phi_gpu) #mlir_tablegen_on(infrt_phi_tensor) -gather_srcs(infrt_src SRCS - phi_base.cc - infrt_phi_tensor.cc - phi_kernels.cc) +gather_srcs(infrt_src SRCS phi_base.cc infrt_phi_tensor.cc phi_kernels.cc) diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h index 9a92558daab03..f7358db5bf356 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h @@ -14,6 +14,7 @@ #pragma once +// clang-format off #include #include #include @@ -37,3 +38,4 @@ // NOLINT #define GET_OP_CLASSES #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h.inc" +// clang-format on diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc index 1bd6068d3fb96..39a23529ac3d1 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.cc +++ b/paddle/infrt/dialect/phi/ir/phi_base.cc @@ -21,6 +21,7 @@ #include #include #include + #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" #include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.cpp.inc" diff --git a/paddle/infrt/dialect/phi/ir/phi_base.h b/paddle/infrt/dialect/phi/ir/phi_base.h index 64cd08cc05ed4..2cbdef5af906e 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.h +++ b/paddle/infrt/dialect/phi/ir/phi_base.h @@ -18,8 +18,8 @@ #include #include -#include "paddle/infrt/dialect/infrt/common/types.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc" #define GET_OP_CLASSES diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.cc b/paddle/infrt/dialect/phi/ir/phi_kernels.cc index c7a837b83fc24..69c3f96339117 100644 --- a/paddle/infrt/dialect/phi/ir/phi_kernels.cc +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.cc @@ -13,12 +13,12 @@ // limitations under the License. #include "paddle/infrt/dialect/phi/ir/phi_kernels.h" + #include #include "paddle/infrt/dialect/phi/ir/phi_gpu_kernelsDialect.cpp.inc" #define GET_OP_CLASSES #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.cpp.inc" // NOLINT - #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.cpp.inc" #define GET_OP_CLASSES #include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.h b/paddle/infrt/dialect/phi/ir/phi_kernels.h index 4f8b41852cc67..9321ebb148f86 100644 --- a/paddle/infrt/dialect/phi/ir/phi_kernels.h +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.h @@ -32,11 +32,9 @@ #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" - #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.h.inc" #define GET_OP_CLASSES #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.h.inc" - #include "paddle/infrt/dialect/phi/ir/phi_gpu_kernelsDialect.h.inc" #define GET_OP_CLASSES #include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.h.inc" diff --git a/paddle/infrt/dialect/phi/pass/CMakeLists.txt b/paddle/infrt/dialect/phi/pass/CMakeLists.txt index dc60ecf63fe2e..e664e05f9dde7 100644 --- a/paddle/infrt/dialect/phi/pass/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/pass/CMakeLists.txt @@ -1,9 +1,9 @@ core_gather_headers() -gather_srcs(infrt_src SRCS - proto_arg_map_context.cc - phi_op_convert_pass.cc - kernel_op_desc.cc - ) +gather_srcs(infrt_src SRCS proto_arg_map_context.cc phi_op_convert_pass.cc + kernel_op_desc.cc) -cc_test(test_kernel_op_desc SRCS kernel_op_desc_test.cc DEPS infrt) +cc_test( + test_kernel_op_desc + SRCS kernel_op_desc_test.cc + DEPS infrt) diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index 9425a290142da..ff870a06752e5 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" + #include + #include "paddle/infrt/dialect/phi/data_type.h" #include "paddle/phi/core/type_defs.h" #include "paddle/phi/kernels/declarations.h" diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h index cdc8f7cbff553..4385d3c941727 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/infrt/dialect/infrt/common/types.h" namespace infrt { diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc index bd5f0799a60d5..24af0ea437875 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +// clang-format off #include #include #include #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" #include "paddle/phi/kernels/declarations.h" +// clang-format on namespace infrt { diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc index 862c9ae4ee5af..f4de56b42a683 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc @@ -20,6 +20,7 @@ #include #include #include + #include #include #include diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h index a0e74426a4097..9748e1679d3f1 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/infrt/dialect/infrt/common/types.h" namespace infrt { diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h index 7cb2651ccf6a2..30bde83cd8199 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include + #include + #include "paddle/infrt/dialect/pd/common/pd_ops_info.h" #include "paddle/phi/core/compat/arg_map_context.h" diff --git a/paddle/infrt/dialect/phi/phi_exec.cc b/paddle/infrt/dialect/phi/phi_exec.cc index a2808a00cb67d..0aae8cc93377d 100644 --- a/paddle/infrt/dialect/phi/phi_exec.cc +++ b/paddle/infrt/dialect/phi/phi_exec.cc @@ -41,7 +41,9 @@ bool parse_inputs(int argc, *params_file_name = argv[2]; return true; } - default: { return false; } + default: { + return false; + } } } diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc index b118a5f7a9caf..a240cebe73655 100644 --- a/paddle/infrt/dialect/print_ir.cc +++ b/paddle/infrt/dialect/print_ir.cc @@ -28,6 +28,7 @@ #include #include #include + #include #include "paddle/infrt/common/global.h" @@ -74,8 +75,8 @@ void printOperation(mlir::Operation *op, int indent) { if (!op->getAttrs().empty()) { printIndent(indent) << op->getAttrs().size() << " attributes:\n"; for (mlir::NamedAttribute attr : op->getAttrs()) { - printIndent(indent + 1) << "- {" << attr.getName() << " : " - << attr.getValue() << "}\n"; + printIndent(indent + 1) + << "- {" << attr.getName() << " : " << attr.getValue() << "}\n"; } } diff --git a/paddle/infrt/dialect/tensor_shape.cc b/paddle/infrt/dialect/tensor_shape.cc index 92c03818264ee..9a825224f1d30 100644 --- a/paddle/infrt/dialect/tensor_shape.cc +++ b/paddle/infrt/dialect/tensor_shape.cc @@ -66,5 +66,4 @@ void TensorShapeDialect::printType(mlir::Type type, #define GET_OP_CLASSES #include "paddle/infrt/dialect/tensor_shape.cpp.inc" // NOLINT - #include "paddle/infrt/dialect/tensor_shape_dialect.cpp.inc" diff --git a/paddle/infrt/dialect/tensorrt/CMakeLists.txt b/paddle/infrt/dialect/tensorrt/CMakeLists.txt index 5b62b78e4dab1..68c6da2746433 100755 --- a/paddle/infrt/dialect/tensorrt/CMakeLists.txt +++ b/paddle/infrt/dialect/tensorrt/CMakeLists.txt @@ -1,13 +1,14 @@ core_gather_headers() -gather_srcs(infrt_src SRCS - trt_ops.cc - trt_op_converter_pass.cc - trt_op_teller_pass.cc - trt_graph_fuse_pass.cc - trt_graph_split_pass.cc - trt_type_convert_pass.cc - ) +gather_srcs( + infrt_src + SRCS + trt_ops.cc + trt_op_converter_pass.cc + trt_op_teller_pass.cc + trt_graph_fuse_pass.cc + trt_graph_split_pass.cc + trt_type_convert_pass.cc) mlir_tablegen_on(trt_ops) mlir_add_rewriter(pd_lower_to_trt) diff --git a/paddle/infrt/dialect/tensorrt/convert.h b/paddle/infrt/dialect/tensorrt/convert.h index 2a242ca285ba8..2dcd86486f51d 100644 --- a/paddle/infrt/dialect/tensorrt/convert.h +++ b/paddle/infrt/dialect/tensorrt/convert.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd/ir/pd_ops.h" diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index dcb84ceb50edf..899e71f1c990f 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -11,10 +11,14 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + +// clang-format off #include #include + #include #include + #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/infrt/pass/infrt_weights_unfold_pass.h" #include "paddle/infrt/dialect/mlir_loader.h" @@ -44,6 +48,7 @@ #endif #include +// clang-format on int main(int argc, char** argv) { static llvm::cl::opt input_file( diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index bbe9a76e87b00..7109fc772ec86 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -18,6 +18,7 @@ #include #include #include + #include #include #include diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index d5ce871edd1a3..d74fe3e5e9c2f 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" #include + #include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index d7b917385cf14..35b869fb30788 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -16,6 +16,7 @@ #include #include + #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index 415a78a6967ab..161fbbbcc65a5 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -11,6 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + +// clang-format off #include "paddle/infrt/dialect/tensorrt/trt_ops.h" #include #include @@ -24,6 +26,7 @@ #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" +// clang-format on namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index 76768037dbdb3..e851c26c43c8c 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -28,6 +28,7 @@ #include #include #include + #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd/ir/pd_ops.h" diff --git a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc index 35c81d0230161..1cb7c4155b987 100644 --- a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h" #include + #include #include "llvm/ADT/StringRef.h" diff --git a/paddle/infrt/external_kernels/CMakeLists.txt b/paddle/infrt/external_kernels/CMakeLists.txt index 9e90c1896c79f..96cfe2b73d8cd 100644 --- a/paddle/infrt/external_kernels/CMakeLists.txt +++ b/paddle/infrt/external_kernels/CMakeLists.txt @@ -8,6 +8,8 @@ set(external_kernels_lib "${CMAKE_CURRENT_BINARY_DIR}/libexternal_kernels.so") message(STATUS "basic_mlir: ${basic_mlir}") message(STATUS "external_kernels_lib: ${external_kernels_lib}") add_test( - NAME run_and_check_external_kernels - COMMAND sh -c "${CMAKE_BINARY_DIR}/infrt/host_context/infrtexec -i ${basic_mlir} --shared_libs=${external_kernels_lib} | ${LLVM_PATH}/bin/FileCheck ${basic_mlir}" + NAME run_and_check_external_kernels + COMMAND + sh -c + "${CMAKE_BINARY_DIR}/infrt/host_context/infrtexec -i ${basic_mlir} --shared_libs=${external_kernels_lib} | ${LLVM_PATH}/bin/FileCheck ${basic_mlir}" ) diff --git a/paddle/infrt/host_context/CMakeLists.txt b/paddle/infrt/host_context/CMakeLists.txt index 14cbea70ca841..2901a282cda7d 100644 --- a/paddle/infrt/host_context/CMakeLists.txt +++ b/paddle/infrt/host_context/CMakeLists.txt @@ -1,26 +1,33 @@ core_gather_headers() -gather_srcs(infrt_src SRCS - kernel_frame.cc - kernel_registry.cc - value.cc - kernel_utils.cc - symbol_table.cc - op_executable.cc - core_runtime.cc - mlir_to_runtime_translate.cc - function.cc - mlir_function_executable.cc - mlir_program_executor.cc - paddle_mlir.cc - ) +gather_srcs( + infrt_src + SRCS + kernel_frame.cc + kernel_registry.cc + value.cc + kernel_utils.cc + symbol_table.cc + op_executable.cc + core_runtime.cc + mlir_to_runtime_translate.cc + function.cc + mlir_function_executable.cc + mlir_program_executor.cc + paddle_mlir.cc) -cc_test_tiny(test_infrt_host_context_value SRCS value_test.cc DEPS infrt ${MLIR_IR_LIBS}) -cc_test_tiny(test_infrt_kernel_utils SRCS kernel_utils_test.cc DEPS infrt ${MLIR_IR_LIBS}) -cc_test_tiny(test_infrt_kernel_registry SRCS kernel_registry_test.cc DEPS infrt ${MLIR_IR_LIBS}) -cc_test_tiny(test_infrt_op_executable SRCS op_executable_test.cc DEPS infrt ${MLIR_IR_LIBS}) -cc_test_tiny(test_infrt_core_runtime SRCS core_runtime_test.cc DEPS infrt ${MLIR_IR_LIBS}) -cc_test_tiny(test_infrt_mlir_to_runtime_translate SRCS mlir_to_runtime_translate_test.cc DEPS infrt ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_host_context_value SRCS value_test.cc DEPS infrt + ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_kernel_utils SRCS kernel_utils_test.cc DEPS infrt + ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_kernel_registry SRCS kernel_registry_test.cc DEPS infrt + ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_op_executable SRCS op_executable_test.cc DEPS infrt + ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_core_runtime SRCS core_runtime_test.cc DEPS infrt + ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_mlir_to_runtime_translate SRCS + mlir_to_runtime_translate_test.cc DEPS infrt ${MLIR_IR_LIBS}) add_executable(paddle-mlir-convert paddle_mlir_converter.cc) target_link_libraries(paddle-mlir-convert infrt ${MLIR_IR_LIBS}) diff --git a/paddle/infrt/host_context/core_runtime.cc b/paddle/infrt/host_context/core_runtime.cc index e3917bd07d242..3dbb0b41c9fb8 100644 --- a/paddle/infrt/host_context/core_runtime.cc +++ b/paddle/infrt/host_context/core_runtime.cc @@ -14,9 +14,8 @@ #include "paddle/infrt/host_context/core_runtime.h" -#include - #include +#include #include #include "paddle/infrt/host_context/kernel_registry.h" diff --git a/paddle/infrt/host_context/core_runtime.h b/paddle/infrt/host_context/core_runtime.h index acb6a66cac630..585369e249b22 100644 --- a/paddle/infrt/host_context/core_runtime.h +++ b/paddle/infrt/host_context/core_runtime.h @@ -46,7 +46,7 @@ class CoreRuntime : public std::enable_shared_from_this { //! Get the results of the execution. llvm::SmallVector // - GetResults(llvm::ArrayRef arg_names); + GetResults(llvm::ArrayRef arg_names); std::shared_ptr getptr() { return std::shared_ptr(this); diff --git a/paddle/infrt/host_context/kernel_registry.cc b/paddle/infrt/host_context/kernel_registry.cc index 5693e973a3f98..2518056ba9d29 100644 --- a/paddle/infrt/host_context/kernel_registry.cc +++ b/paddle/infrt/host_context/kernel_registry.cc @@ -39,8 +39,8 @@ const std::vector &KernelRegistry::GetAttrNameList( void KernelRegistry::AddKernel(const std::string &key, KernelImplementation fn, const std::vector &attr_order) { - CHECK(!impl_->data.count(key)) << "kernel [" << key - << "] is registered twice"; + CHECK(!impl_->data.count(key)) + << "kernel [" << key << "] is registered twice"; impl_->data.emplace( key, std::make_pair([fn]() { return fn; }, std::move(attr_order))); } @@ -48,8 +48,8 @@ void KernelRegistry::AddKernel(const std::string &key, void KernelRegistry::AddKernel(const std::string &key, KernelLauncher fn, const std::vector &attr_order) { - CHECK(!impl_->data.count(key)) << "kernel [" << key - << "] is registered twice"; + CHECK(!impl_->data.count(key)) + << "kernel [" << key << "] is registered twice"; impl_->data.emplace(key, std::make_pair(std::move(fn), std::move(attr_order))); } diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 6ad51a02bda29..1ae7cdc742afd 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -14,6 +14,7 @@ #include #include + #include #include diff --git a/paddle/infrt/host_context/mlir_program_executor.h b/paddle/infrt/host_context/mlir_program_executor.h index c2ccb90640b21..7808c460457aa 100644 --- a/paddle/infrt/host_context/mlir_program_executor.h +++ b/paddle/infrt/host_context/mlir_program_executor.h @@ -19,10 +19,10 @@ #include #include #include -#include #include #include +#include #include "paddle/infrt/host_context/core_runtime.h" #include "paddle/infrt/host_context/kernel_registry.h" diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 05bb28b7c5613..9292e593a708f 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -14,6 +14,7 @@ #include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include #include #include #include @@ -23,7 +24,6 @@ #include #include -#include #include #include #include @@ -591,8 +591,8 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, { // lookup the callee function auto it = table.find(callee_name.getValue().str()); - CHECK(it != table.end()) << "can't find function [" - << callee_name.getValue().str() << "]"; + CHECK(it != table.end()) + << "can't find function [" << callee_name.getValue().str() << "]"; auto* function = impl_->cur_op->CreateFunctionExecutable(it->second, &impl_->func_defs); impl_->cur_op->AppendAttribute(new Value(function)); diff --git a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc index 4d588a9c2b523..b53dc0545c72f 100644 --- a/paddle/infrt/host_context/op_executable.cc +++ b/paddle/infrt/host_context/op_executable.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/host_context/op_executable.h" #include + #include #include diff --git a/paddle/infrt/host_context/op_executable.h b/paddle/infrt/host_context/op_executable.h index 550f6ab6349ed..b80b99fd41405 100644 --- a/paddle/infrt/host_context/op_executable.h +++ b/paddle/infrt/host_context/op_executable.h @@ -16,6 +16,7 @@ #include #include #include + #include #include #include diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h index 57bdc1b48578b..629181cca3d6e 100644 --- a/paddle/infrt/host_context/paddle_mlir.h +++ b/paddle/infrt/host_context/paddle_mlir.h @@ -20,6 +20,7 @@ #include #include #include + #include #include #include diff --git a/paddle/infrt/host_context/paddle_mlir_converter.cc b/paddle/infrt/host_context/paddle_mlir_converter.cc index a2808a00cb67d..0aae8cc93377d 100644 --- a/paddle/infrt/host_context/paddle_mlir_converter.cc +++ b/paddle/infrt/host_context/paddle_mlir_converter.cc @@ -41,7 +41,9 @@ bool parse_inputs(int argc, *params_file_name = argv[2]; return true; } - default: { return false; } + default: { + return false; + } } } diff --git a/paddle/infrt/host_context/symbol_table.h b/paddle/infrt/host_context/symbol_table.h index 805215a78ce0d..8c79c78c690e8 100644 --- a/paddle/infrt/host_context/symbol_table.h +++ b/paddle/infrt/host_context/symbol_table.h @@ -14,9 +14,8 @@ #pragma once -#include - #include +#include #include "paddle/infrt/host_context/value.h" diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 1834cb4c0db05..af785c13349fd 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -159,15 +159,15 @@ class Value : public common::Object { template const T& get() const { - CHECK(data.template is()) << "typeid: " << data.index() - << " != " << ValueVariantType::IndexOf; + CHECK(data.template is()) + << "typeid: " << data.index() << " != " << ValueVariantType::IndexOf; return data.get(); } template T& get() { - CHECK(data.template is()) << "typeid: " << data.index() - << " != " << ValueVariantType::IndexOf; + CHECK(data.template is()) + << "typeid: " << data.index() << " != " << ValueVariantType::IndexOf; return data.get(); } diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt index f20344f6f6b84..6a18047885d48 100644 --- a/paddle/infrt/kernel/CMakeLists.txt +++ b/paddle/infrt/kernel/CMakeLists.txt @@ -3,11 +3,12 @@ add_subdirectory(tensorrt) core_gather_headers() -gather_srcs(infrt_src SRCS - basic_kernels.cc - # phi_kernels.cc - test_kernels.cc - tensor_shape_kernels.cc - tensor_kernels.cc - control_flow_kernels.cc - ) +gather_srcs( + infrt_src + SRCS + basic_kernels.cc + # phi_kernels.cc + test_kernels.cc + tensor_shape_kernels.cc + tensor_kernels.cc + control_flow_kernels.cc) diff --git a/paddle/infrt/kernel/phi/CMakeLists.txt b/paddle/infrt/kernel/phi/CMakeLists.txt index 22a59ab2faf8c..92e4a49cd849c 100644 --- a/paddle/infrt/kernel/phi/CMakeLists.txt +++ b/paddle/infrt/kernel/phi/CMakeLists.txt @@ -1,34 +1,39 @@ -if (NOT INFRT_WITH_PHI) - return() +if(NOT INFRT_WITH_PHI) + return() endif() core_gather_headers() -gather_srcs(infrt_src SRCS - registry.cc - dense_tensor_kernels.cc - context_kernels.cc -) +gather_srcs(infrt_src SRCS registry.cc dense_tensor_kernels.cc + context_kernels.cc) -set(infrt_register_phi_kernels_gen_source_file ${CMAKE_SOURCE_DIR}/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc) -set(infrt_register_phi_kernels_gen_file ${CMAKE_SOURCE_DIR}/tools/infrt/get_phi_kernel_function.sh) -set(wrapped_infermeta_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h) -set(wrapped_infermeta_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc) +set(infrt_register_phi_kernels_gen_source_file + ${CMAKE_SOURCE_DIR}/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc +) +set(infrt_register_phi_kernels_gen_file + ${CMAKE_SOURCE_DIR}/tools/infrt/get_phi_kernel_function.sh) +set(wrapped_infermeta_header_file + ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h) +set(wrapped_infermeta_source_file + ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc) add_custom_command( - OUTPUT ${infrt_register_phi_kernels_gen_source_file} - COMMAND bash ${infrt_register_phi_kernels_gen_file} - DEPENDS wrapped_infermeta - VERBATIM) -add_custom_target(infrt_register_phi_kernel - COMMAND bash ${infrt_register_phi_kernels_gen_file} - DEPENDS wrapped_infermeta - COMMENT "infrt generate ${infrt_register_phi_kernels_gen_source_file}" - VERBATIM) + OUTPUT ${infrt_register_phi_kernels_gen_source_file} + COMMAND bash ${infrt_register_phi_kernels_gen_file} + DEPENDS wrapped_infermeta + VERBATIM) +add_custom_target( + infrt_register_phi_kernel + COMMAND bash ${infrt_register_phi_kernels_gen_file} + DEPENDS wrapped_infermeta + COMMENT "infrt generate ${infrt_register_phi_kernels_gen_source_file}" + VERBATIM) -cc_library(infrt_naive SRCS infershaped/infershaped_kernel_launcher.cc - infershaped/infershaped_kernel_launchers.cc - DEPS phi wrapped_infermeta) +cc_library( + infrt_naive + SRCS infershaped/infershaped_kernel_launcher.cc + infershaped/infershaped_kernel_launchers.cc + DEPS phi wrapped_infermeta) cc_test_tiny(test_infrt_infershape_launchers SRCS -infershaped/infershape_launchers_test.cc DEPS infrt) + infershaped/infershape_launchers_test.cc DEPS infrt) diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index 95e25b243f3ab..8c49f47e7d873 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" + #include + #include "llvm/Support/ErrorHandling.h" #include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/common/string.h" diff --git a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc index 2e40261f27386..cb9640451f9b2 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/meta_tensor.h" diff --git a/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h b/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h index 277c4ad6b7afc..531d77ba952aa 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h +++ b/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/infrt/tensor/dense_host_tensor.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h index d87027847202b..bac25e0f437d8 100644 --- a/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h +++ b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h @@ -14,6 +14,7 @@ #pragma once #include + #include #include "paddle/infrt/backends/host/phi_context.h" diff --git a/paddle/infrt/kernel/tensorrt/CMakeLists.txt b/paddle/infrt/kernel/tensorrt/CMakeLists.txt index cd35fccbe2aa3..2cb595f7ba4f7 100644 --- a/paddle/infrt/kernel/tensorrt/CMakeLists.txt +++ b/paddle/infrt/kernel/tensorrt/CMakeLists.txt @@ -1,10 +1,10 @@ -if (NOT (INFRT_WITH_PHI AND INFRT_WITH_GPU AND INFRT_WITH_TRT)) +if(NOT + (INFRT_WITH_PHI + AND INFRT_WITH_GPU + AND INFRT_WITH_TRT)) return() endif() core_gather_headers() -gather_srcs(infrt_src SRCS - registry.cc - trt_kernels.cc -) +gather_srcs(infrt_src SRCS registry.cc trt_kernels.cc) diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc index c0f5ebb4a7657..0ea68f2e835f7 100644 --- a/paddle/infrt/kernel/tensorrt/trt_kernels.cc +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// clang-format off #include "paddle/infrt/kernel/tensorrt/trt_kernels.h" #include #include @@ -36,6 +37,7 @@ #include "paddle/infrt/host_context/symbol_table.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" +// clang-format on namespace infrt { namespace kernel { diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.h b/paddle/infrt/kernel/tensorrt/trt_kernels.h index bf23bd45c1341..bf41c124a299b 100644 --- a/paddle/infrt/kernel/tensorrt/trt_kernels.h +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.h @@ -19,7 +19,6 @@ #include #include "mlir/IR/Operation.h" - #include "paddle/infrt/backends/tensorrt/trt_engine.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc index bcf475d1bc09d..e00afa4b7901a 100644 --- a/paddle/infrt/kernel/test_kernels.cc +++ b/paddle/infrt/kernel/test_kernels.cc @@ -92,11 +92,11 @@ class BenchmarkStats { std::sort(run_times_walltime_.begin(), run_times_walltime_.end()); std::sort(run_times_cpu_.begin(), run_times_cpu_.end()); - auto percentile = []( - double p, const std::vector &run_times) { - assert(p >= 0.0 && p <= 1.0); - return run_times[run_times.size() * p]; - }; + auto percentile = + [](double p, const std::vector &run_times) { + assert(p >= 0.0 && p <= 1.0); + return run_times[run_times.size() * p]; + }; // BM: prefix is added to make grepping results from lit output easier. std::string prefix; diff --git a/paddle/infrt/paddle/CMakeLists.txt b/paddle/infrt/paddle/CMakeLists.txt index 21c117535fe70..5f894626f8015 100644 --- a/paddle/infrt/paddle/CMakeLists.txt +++ b/paddle/infrt/paddle/CMakeLists.txt @@ -5,14 +5,16 @@ add_subdirectory(pb) core_gather_headers() -gather_srcs(infrt_src SRCS - model_parser.cc - scope.cc - tensor.cc - ) +gather_srcs(infrt_src SRCS model_parser.cc scope.cc tensor.cc) -file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) +file( + GLOB includes + LIST_DIRECTORIES false + RELATIVE ${CMAKE_SOURCE_DIR} + *.h) foreach(header ${includes}) - set(core_includes "${core_includes};${header}" CACHE INTERNAL "") + set(core_includes + "${core_includes};${header}" + CACHE INTERNAL "") endforeach() diff --git a/paddle/infrt/paddle/cpp/CMakeLists.txt b/paddle/infrt/paddle/cpp/CMakeLists.txt index 8b48603bddf8e..9947747108494 100644 --- a/paddle/infrt/paddle/cpp/CMakeLists.txt +++ b/paddle/infrt/paddle/cpp/CMakeLists.txt @@ -1,5 +1,11 @@ -file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) +file( + GLOB includes + LIST_DIRECTORIES false + RELATIVE ${CMAKE_SOURCE_DIR} + *.h) foreach(header ${includes}) - set(core_includes "${core_includes};${header}" CACHE INTERNAL "") + set(core_includes + "${core_includes};${header}" + CACHE INTERNAL "") endforeach() diff --git a/paddle/infrt/paddle/pb/CMakeLists.txt b/paddle/infrt/paddle/pb/CMakeLists.txt index b3491cfe13618..3614201a95f65 100644 --- a/paddle/infrt/paddle/pb/CMakeLists.txt +++ b/paddle/infrt/paddle/pb/CMakeLists.txt @@ -1,12 +1,13 @@ -gather_srcs(infrt_src SRCS - var_desc.cc - op_desc.cc - block_desc.cc - program_desc.cc - ) +gather_srcs(infrt_src SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc) -file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) +file( + GLOB includes + LIST_DIRECTORIES false + RELATIVE ${CMAKE_SOURCE_DIR} + *.h) foreach(header ${includes}) - set(core_includes "${core_includes};${header}" CACHE INTERNAL "") + set(core_includes + "${core_includes};${header}" + CACHE INTERNAL "") endforeach() diff --git a/paddle/infrt/paddle/scope.h b/paddle/infrt/paddle/scope.h index 4ebf846374c6f..1f81d0914dfc6 100644 --- a/paddle/infrt/paddle/scope.h +++ b/paddle/infrt/paddle/scope.h @@ -13,10 +13,9 @@ // limitations under the License. #pragma once -#include - #include #include +#include #include #include "paddle/infrt/common/macros.h" diff --git a/paddle/infrt/support/type_traits.h b/paddle/infrt/support/type_traits.h index 341dabb7c1c4a..33a42fe37eaa6 100644 --- a/paddle/infrt/support/type_traits.h +++ b/paddle/infrt/support/type_traits.h @@ -115,7 +115,8 @@ struct nonesuch { template class Op, + template + class Op, class... Args> struct detector : std::false_type { using value_t = std::false_type; diff --git a/paddle/infrt/tensor/CMakeLists.txt b/paddle/infrt/tensor/CMakeLists.txt index 95d4090a9a3f7..b1c3149276c59 100644 --- a/paddle/infrt/tensor/CMakeLists.txt +++ b/paddle/infrt/tensor/CMakeLists.txt @@ -2,13 +2,14 @@ core_gather_headers() add_subdirectory(phi) -gather_srcs(infrt_src SRCS +gather_srcs( + infrt_src + SRCS tensor_map.cc tensor_metadata.cc dense_tensor_view.cc dense_host_tensor.cc - tensor_shape.cc - ) + tensor_shape.cc) # set(tensor_map_mlir "${CMAKE_SOURCE_DIR}/infrt/dialect/mlir_tests/tensor_map.mlir") # set(external_kernels_lib "${CMAKE_BINARY_DIR}/paddle/libexternal_kernels.so") diff --git a/paddle/infrt/tensor/phi/CMakeLists.txt b/paddle/infrt/tensor/phi/CMakeLists.txt index 97e26661266e9..94658e223e287 100644 --- a/paddle/infrt/tensor/phi/CMakeLists.txt +++ b/paddle/infrt/tensor/phi/CMakeLists.txt @@ -1,3 +1 @@ -gather_srcs(infrt_src SRCS - tensor_map.cc -) +gather_srcs(infrt_src SRCS tensor_map.cc) diff --git a/paddle/infrt/tests/CMakeLists.txt b/paddle/infrt/tests/CMakeLists.txt index a720ad824794e..22e5e232d5485 100644 --- a/paddle/infrt/tests/CMakeLists.txt +++ b/paddle/infrt/tests/CMakeLists.txt @@ -1,11 +1,21 @@ cc_test_tiny(test_abs_model SRCS models/test_abs.cc DEPS infrt ${MLIR_IR_LIBS}) -configure_file(lit.cfg.py.in "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py") +configure_file(lit.cfg.py.in + "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py") -add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\"" +add_test( + NAME test_infrt_by_lit + COMMAND + sh -c + "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\"" DEPENDS infrtopt infrtexec) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/resnet50.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/resnet50.mlir) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in + ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir.in + ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/resnet50.mlir.in + ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/resnet50.mlir) +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir.in + ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir) diff --git a/paddle/infrt/tests/models/test_abs.cc b/paddle/infrt/tests/models/test_abs.cc index 89bbe78ffe27a..aa5a2c6945b47 100644 --- a/paddle/infrt/tests/models/test_abs.cc +++ b/paddle/infrt/tests/models/test_abs.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// clang-format off #include #include #include @@ -49,6 +50,7 @@ #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/phi/ir/phi_kernels.h" +// clang-format on static llvm::cl::list cl_shared_libs( // NOLINT "shared_libs", diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 58ad42ddd1ff8..7f3dd1ddc38fb 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -23,14 +23,33 @@ add_subdirectory(tools) add_subdirectory(tests) # make an unity target for compile deps -set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor api_scalar api_int_array) +set(PHI_DEPS + convert_utils + dense_tensor + phi_context + kernel_factory + kernel_context + arg_map_context + infermeta + lod_utils + op_compat_infos + sparse_csr_tensor + sparse_coo_tensor + string_tensor + api_scalar + api_int_array) get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) create_dummy_static_lib(phi LIBS ${PHI_DEPS} LIMIT 100) -set(phi_extension_header_file ${CMAKE_CURRENT_SOURCE_DIR}/extension.h CACHE INTERNAL "phi/extension.h file") -file(WRITE ${phi_extension_header_file} "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n") +set(phi_extension_header_file + ${CMAKE_CURRENT_SOURCE_DIR}/extension.h + CACHE INTERNAL "phi/extension.h file") +file( + WRITE ${phi_extension_header_file} + "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n" +) # generate inner headers include dir for users generate_unify_header(backends) diff --git a/paddle/phi/api/CMakeLists.txt b/paddle/phi/api/CMakeLists.txt index d575759db32ee..b1d97cbc7fa2c 100644 --- a/paddle/phi/api/CMakeLists.txt +++ b/paddle/phi/api/CMakeLists.txt @@ -1,2 +1,6 @@ add_subdirectory(lib) -cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api sparse_api sparse_bw_api strings_api) +cc_library( + phi_api + SRCS all.cc + DEPS phi_function_api phi_bw_function_api sparse_api sparse_bw_api + strings_api) diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h index a9475db800816..fa19714dde7db 100644 --- a/paddle/phi/api/ext/op_meta_info.h +++ b/paddle/phi/api/ext/op_meta_info.h @@ -317,25 +317,24 @@ using InferShapeFunc = std::vector> (*)( const std::vector>>& vec_input_shapes, const std::vector& attrs); -#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type) \ - template \ - struct InferShapeCallHelper { \ - template \ - static Return InferShape( \ - const std::vector>& input_shapes, \ - const std::vector>>& \ - vec_input_shapes, \ - const std::vector& attrs, \ - const PreviousArgs&... pargs) { \ - input_type arg = input_shapes[in_idx]; \ - return InferShapeCallHelper::template InferShape( \ - input_shapes, vec_input_shapes, attrs, pargs..., arg); \ - } \ +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, \ + const PreviousArgs&... pargs) { \ + input_type arg = input_shapes[in_idx]; \ + return InferShapeCallHelper:: \ + template InferShape( \ + input_shapes, vec_input_shapes, attrs, pargs..., arg); \ + } \ } #define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type) \ @@ -397,10 +396,8 @@ struct InferShapeFuncImpl { const std::vector>& input_shapes, const std::vector>>& vec_input_shapes, const std::vector& attrs) { - return InferShapeCallHelper>::template InferShape<0, - 0, - 0>( - input_shapes, vec_input_shapes, attrs); + return InferShapeCallHelper>:: + template InferShape<0, 0, 0>(input_shapes, vec_input_shapes, attrs); } private: @@ -482,20 +479,19 @@ using InferDtypeFunc = std::vector (*)( } \ } -#define PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(input_type) \ - template \ - struct InferDtypeCallHelper { \ - template \ - static Return InferDtype( \ - const std::vector& input_dtypes, \ - const std::vector>& vec_input_dtypes, \ - const PreviousArgs&... pargs) { \ - input_type arg = vec_input_dtypes[vec_in_idx]; \ - return InferDtypeCallHelper::template InferDtype( \ - input_dtypes, vec_input_dtypes, pargs..., arg); \ - } \ +#define PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(input_type) \ + template \ + struct InferDtypeCallHelper { \ + template \ + static Return InferDtype( \ + const std::vector& input_dtypes, \ + const std::vector>& vec_input_dtypes, \ + const PreviousArgs&... pargs) { \ + input_type arg = vec_input_dtypes[vec_in_idx]; \ + return InferDtypeCallHelper:: \ + template InferDtype( \ + input_dtypes, vec_input_dtypes, pargs..., arg); \ + } \ } template diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 004ed8de520d9..a1c6989555f20 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -1,11 +1,20 @@ add_subdirectory(utils) -if (WITH_GPU) - nv_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils phi_enforce) -elseif (WITH_ROCM) - hip_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils phi_enforce) +if(WITH_GPU) + nv_library( + phi_tensor_raw + SRCS tensor.cc + DEPS tensor_base dense_tensor phi_api_utils phi_enforce) +elseif(WITH_ROCM) + hip_library( + phi_tensor_raw + SRCS tensor.cc + DEPS tensor_base dense_tensor phi_api_utils phi_enforce) else() - cc_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils phi_enforce) + cc_library( + phi_tensor_raw + SRCS tensor.cc + DEPS tensor_base dense_tensor phi_api_utils phi_enforce) endif() set(api_gen_base ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_base.py) @@ -13,71 +22,94 @@ set(api_gen_base ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_base.py) # forward api file set(api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_gen.py) set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml) -set(new_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/new_api.yaml) +set(new_api_yaml_file + ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/new_api.yaml) set(api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/api.h) set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/api.cc) set(api_header_file_tmp ${api_header_file}.tmp) set(api_source_file_tmp ${api_source_file}.tmp) # backward api file -set(bw_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward_api_gen.py) -set(bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml) -set(new_bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/new_backward.yaml) -set(bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/backward_api.h) +set(bw_api_gen_file + ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward_api_gen.py) +set(bw_api_yaml_file + ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml) +set(new_bw_api_yaml_file + ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/new_backward.yaml) +set(bw_api_header_file + ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/backward_api.h) set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/backward_api.cc) set(bw_api_header_file_tmp ${bw_api_header_file}.tmp) set(bw_api_source_file_tmp ${bw_api_source_file}.tmp) # dygraph(intermediate) api file -set(im_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/intermediate_api_gen.py) -set(dygraph_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/dygraph_api.h) -set(dygraph_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/dygraph_api.cc) +set(im_api_gen_file + ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/intermediate_api_gen.py) +set(dygraph_api_header_file + ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/dygraph_api.h) +set(dygraph_api_source_file + ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/dygraph_api.cc) set(dygraph_api_header_file_tmp ${dygraph_api_header_file}.tmp) set(dygraph_api_source_file_tmp ${dygraph_api_source_file}.tmp) # sparse api file -set(sparse_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api_gen.py) -set(sparse_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml) -set(sparse_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/sparse_api.h) +set(sparse_api_gen_file + ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api_gen.py) +set(sparse_api_yaml_file + ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml) +set(sparse_api_header_file + ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/sparse_api.h) set(sparse_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_api.cc) set(sparse_api_header_file_tmp ${sparse_api_header_file}.tmp) set(sparse_api_source_file_tmp ${sparse_api_source_file}.tmp) # sparse bw api file -set(sparse_bw_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api_gen.py) -set(sparse_bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml) -set(sparse_bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/sparse_bw_api.h) -set(sparse_bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_bw_api.cc) +set(sparse_bw_api_gen_file + ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api_gen.py) +set(sparse_bw_api_yaml_file + ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml) +set(sparse_bw_api_header_file + ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/sparse_bw_api.h) +set(sparse_bw_api_source_file + ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_bw_api.cc) set(sparse_bw_api_header_file_tmp ${sparse_bw_api_header_file}.tmp) set(sparse_bw_api_source_file_tmp ${sparse_bw_api_source_file}.tmp) # strings api file -set(strings_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api_gen.py) -set(strings_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api.yaml) -set(strings_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/strings_api.h) -set(strings_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/strings_api.cc) +set(strings_api_gen_file + ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api_gen.py) +set(strings_api_yaml_file + ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api.yaml) +set(strings_api_header_file + ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/strings_api.h) +set(strings_api_source_file + ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/strings_api.cc) set(strings_api_header_file_tmp ${strings_api_header_file}.tmp) set(strings_api_source_file_tmp ${strings_api_source_file}.tmp) # wrapped infermeta file -set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py) -set(wrapped_infermeta_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h) -set(wrapped_infermeta_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc) +set(wrapped_infermeta_gen_file + ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py) +set(wrapped_infermeta_header_file + ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h) +set(wrapped_infermeta_source_file + ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc) -if (NOT PYTHON_EXECUTABLE) +if(NOT PYTHON_EXECUTABLE) find_package(PythonInterp REQUIRED) endif() # install extra dependencies -execute_process( - COMMAND ${PYTHON_EXECUTABLE} -m pip install -U pyyaml jinja2 -) +execute_process(COMMAND ${PYTHON_EXECUTABLE} -m pip install -U pyyaml jinja2) # parse apis set(parsed_api_dir ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/parsed_apis) -set(generated_op_path ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_op.cc) -set(generated_argument_mapping_path ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_sig.cc) -message("parse api yamls: +set(generated_op_path + ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_op.cc) +set(generated_argument_mapping_path + ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_sig.cc) +message( + "parse api yamls: - ${api_yaml_file} - ${new_api_yaml_file} - ${bw_api_yaml_file} @@ -85,24 +117,18 @@ message("parse api yamls: execute_process( WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen COMMAND ${CMAKE_COMMAND} -E make_directory ${parsed_api_dir} - COMMAND ${PYTHON_EXECUTABLE} parse_api.py - --api_yaml_path ./api.yaml - --output_path ./parsed_apis/api.parsed.yaml - COMMAND ${PYTHON_EXECUTABLE} parse_api.py - --api_yaml_path ./new_api.yaml - --output_path ./parsed_apis/new_api.parsed.yaml - COMMAND ${PYTHON_EXECUTABLE} parse_api.py - --api_yaml_path ./backward.yaml - --output_path ./parsed_apis/backward_api.parsed.yaml - --backward - COMMAND ${PYTHON_EXECUTABLE} parse_api.py - --api_yaml_path ./new_backward.yaml - --output_path ./parsed_apis/new_backward_api.parsed.yaml - --backward - RESULTS_VARIABLE _results -) + COMMAND ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./api.yaml + --output_path ./parsed_apis/api.parsed.yaml + COMMAND ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./new_api.yaml + --output_path ./parsed_apis/new_api.parsed.yaml + COMMAND ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./backward.yaml + --output_path ./parsed_apis/backward_api.parsed.yaml --backward + COMMAND + ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./new_backward.yaml + --output_path ./parsed_apis/new_backward_api.parsed.yaml --backward + RESULTS_VARIABLE _results) foreach(_result in ${_results}) - if (${_result}) + if(${_result}) message(FATAL_ERROR "api yaml parsing failed, exiting.") endif() endforeach() @@ -113,52 +139,67 @@ message("validate api yaml: - ${parsed_api_dir}/new_backward_api.parsed.yaml") execute_process( WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen - COMMAND ${PYTHON_EXECUTABLE} cross_validate.py - --forward_yaml_paths ./parsed_apis/api.parsed.yaml ./parsed_apis/new_api.parsed.yaml - --backward_yaml_paths ./parsed_apis/backward_api.parsed.yaml ./parsed_apis/new_backward_api.parsed.yaml - RESULT_VARIABLE _result -) -if (${_result}) - message(FATAL_ERROR "api validation failed, exiting." ) + COMMAND + ${PYTHON_EXECUTABLE} cross_validate.py --forward_yaml_paths + ./parsed_apis/api.parsed.yaml ./parsed_apis/new_api.parsed.yaml + --backward_yaml_paths ./parsed_apis/backward_api.parsed.yaml + ./parsed_apis/new_backward_api.parsed.yaml + RESULT_VARIABLE _result) +if(${_result}) + message(FATAL_ERROR "api validation failed, exiting.") endif() # code generation for op, op makers, and argument mapping functions -message("create or remove auto-geneated operators: ${generated_op_path}.tmp -create or remove auto-geneated argument mappings: ${generated_argument_mapping_path}.tmp") +message( + "create or remove auto-geneated operators: ${generated_op_path}.tmp +create or remove auto-geneated argument mappings: ${generated_argument_mapping_path}.tmp" +) execute_process( WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen - COMMAND ${PYTHON_EXECUTABLE} generate_op.py - --api_yaml_path ./parsed_apis/new_api.parsed.yaml - --backward_api_yaml_path ./parsed_apis/new_backward_api.parsed.yaml - --output_op_path "${generated_op_path}.tmp" - --output_arg_map_path "${generated_argument_mapping_path}.tmp" - RESULT_VARIABLE _result -) -if (${_result}) - message(FATAL_ERROR "operator codegen failed, exiting." ) + COMMAND + ${PYTHON_EXECUTABLE} generate_op.py --api_yaml_path + ./parsed_apis/new_api.parsed.yaml --backward_api_yaml_path + ./parsed_apis/new_backward_api.parsed.yaml --output_op_path + "${generated_op_path}.tmp" --output_arg_map_path + "${generated_argument_mapping_path}.tmp" + RESULT_VARIABLE _result) +if(${_result}) + message(FATAL_ERROR "operator codegen failed, exiting.") endif() - if(EXISTS "${generated_op_path}.tmp" AND EXISTS "${generated_op_path}") - execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${generated_op_path}.tmp" "${generated_op_path}") + execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${generated_op_path}.tmp" "${generated_op_path}") message("copy if different ${generated_op_path}.tmp ${generated_op_path}") elseif(EXISTS "${generated_op_path}.tmp") - execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_op_path}.tmp" "${generated_op_path}") + execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_op_path}.tmp" + "${generated_op_path}") message("copy ${generated_op_path}.tmp ${generated_op_path}") else() execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f "${generated_op_path}") message("remove ${generated_op_path}") endif() - -if(EXISTS "${generated_argument_mapping_path}.tmp" AND EXISTS "${generated_argument_mapping_path}") - execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${generated_argument_mapping_path}.tmp" "${generated_argument_mapping_path}") - message("copy if different ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}") +if(EXISTS "${generated_argument_mapping_path}.tmp" + AND EXISTS "${generated_argument_mapping_path}") + execute_process( + COMMAND + ${CMAKE_COMMAND} -E copy_if_different + "${generated_argument_mapping_path}.tmp" + "${generated_argument_mapping_path}") + message( + "copy if different ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}" + ) elseif(EXISTS "${generated_argument_mapping_path}.tmp") - execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_argument_mapping_path}.tmp" "${generated_argument_mapping_path}") - message("copy ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}") + execute_process( + COMMAND ${CMAKE_COMMAND} -E copy "${generated_argument_mapping_path}.tmp" + "${generated_argument_mapping_path}") + message( + "copy ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}" + ) else() - execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f "${generated_argument_mapping_path}") + execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f + "${generated_argument_mapping_path}") message("remove ${generated_argument_mapping_path}") endif() @@ -166,26 +207,31 @@ endif() add_custom_command( OUTPUT ${api_header_file} ${api_source_file} COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml - COMMAND ${PYTHON_EXECUTABLE} ${api_gen_file} - --api_yaml_path ${api_yaml_file} ${new_api_yaml_file} - --api_header_path ${api_header_file_tmp} - --api_header_path ${api_header_file_tmp} - --api_source_path ${api_source_file_tmp} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_header_file_tmp} ${api_header_file} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_source_file_tmp} ${api_source_file} + COMMAND + ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file} + ${new_api_yaml_file} --api_header_path ${api_header_file_tmp} + --api_header_path ${api_header_file_tmp} --api_source_path + ${api_source_file_tmp} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_header_file_tmp} + ${api_header_file} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_source_file_tmp} + ${api_source_file} COMMENT "copy_if_different ${api_header_file} ${api_source_file}" DEPENDS ${api_yaml_file} ${api_gen_file} ${api_gen_base} VERBATIM) # generate backward api add_custom_command( - OUTPUT ${bw_api_header_file} ${bw_api_source_file} ${bw_api_header_file_tmp} ${bw_api_source_file_tmp} - COMMAND ${PYTHON_EXECUTABLE} ${bw_api_gen_file} - --backward_yaml_path ${bw_api_yaml_file} ${new_bw_api_yaml_file} - --backward_header_path ${bw_api_header_file_tmp} - --backward_source_path ${bw_api_source_file_tmp} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_header_file_tmp} ${bw_api_header_file} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_source_file_tmp} ${bw_api_source_file} + OUTPUT ${bw_api_header_file} ${bw_api_source_file} ${bw_api_header_file_tmp} + ${bw_api_source_file_tmp} + COMMAND + ${PYTHON_EXECUTABLE} ${bw_api_gen_file} --backward_yaml_path + ${bw_api_yaml_file} ${new_bw_api_yaml_file} --backward_header_path + ${bw_api_header_file_tmp} --backward_source_path ${bw_api_source_file_tmp} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_header_file_tmp} + ${bw_api_header_file} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_source_file_tmp} + ${bw_api_source_file} COMMENT "copy_if_different ${bw_api_header_file} ${bw_api_source_file}" DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base} VERBATIM) @@ -193,82 +239,177 @@ add_custom_command( # generate sparse api add_custom_command( OUTPUT ${sparse_api_header_file} ${sparse_api_source_file} - COMMAND ${PYTHON_EXECUTABLE} ${sparse_api_gen_file} - --api_yaml_path ${sparse_api_yaml_file} - --api_header_path ${sparse_api_header_file_tmp} - --api_source_path ${sparse_api_source_file_tmp} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp} ${sparse_api_header_file} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp} ${sparse_api_source_file} - COMMENT "copy_if_different ${sparse_api_header_file} ${sparse_sparse_api_source_file}" - DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base} ${api_gen_file} + COMMAND + ${PYTHON_EXECUTABLE} ${sparse_api_gen_file} --api_yaml_path + ${sparse_api_yaml_file} --api_header_path ${sparse_api_header_file_tmp} + --api_source_path ${sparse_api_source_file_tmp} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp} + ${sparse_api_header_file} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp} + ${sparse_api_source_file} + COMMENT + "copy_if_different ${sparse_api_header_file} ${sparse_sparse_api_source_file}" + DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base} + ${api_gen_file} VERBATIM) # generate backward sparse api add_custom_command( OUTPUT ${sparse_bw_api_header_file} ${sparse_bw_api_source_file} - COMMAND ${PYTHON_EXECUTABLE} ${sparse_bw_api_gen_file} - --api_yaml_path ${sparse_bw_api_yaml_file} - --api_header_path ${sparse_bw_api_header_file_tmp} - --api_source_path ${sparse_bw_api_source_file_tmp} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_header_file_tmp} ${sparse_bw_api_header_file} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_source_file_tmp} ${sparse_bw_api_source_file} - COMMENT "copy_if_different ${sparse_bw_api_header_file} ${sparse_bw_sparse_api_source_file}" - DEPENDS ${sparse_bw_api_yaml_file} ${sparse_bw_api_gen_file} ${api_gen_base} ${api_gen_file} ${sparse_api_gen_file} ${bw_api_gen_file} + COMMAND + ${PYTHON_EXECUTABLE} ${sparse_bw_api_gen_file} --api_yaml_path + ${sparse_bw_api_yaml_file} --api_header_path + ${sparse_bw_api_header_file_tmp} --api_source_path + ${sparse_bw_api_source_file_tmp} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_header_file_tmp} + ${sparse_bw_api_header_file} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_source_file_tmp} + ${sparse_bw_api_source_file} + COMMENT + "copy_if_different ${sparse_bw_api_header_file} ${sparse_bw_sparse_api_source_file}" + DEPENDS ${sparse_bw_api_yaml_file} ${sparse_bw_api_gen_file} ${api_gen_base} + ${api_gen_file} ${sparse_api_gen_file} ${bw_api_gen_file} VERBATIM) # generate strings api add_custom_command( OUTPUT ${strings_api_header_file} ${strings_api_source_file} - COMMAND ${PYTHON_EXECUTABLE} ${strings_api_gen_file} - --api_yaml_path ${strings_api_yaml_file} - --api_header_path ${strings_api_header_file_tmp} - --api_source_path ${strings_api_source_file_tmp} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_header_file_tmp} ${strings_api_header_file} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_source_file_tmp} ${strings_api_source_file} - COMMENT "copy_if_different ${strings_api_header_file} ${strings_strings_api_source_file}" - DEPENDS ${strings_api_yaml_file} ${strings_api_gen_file} ${api_gen_base} ${api_gen_file} + COMMAND + ${PYTHON_EXECUTABLE} ${strings_api_gen_file} --api_yaml_path + ${strings_api_yaml_file} --api_header_path ${strings_api_header_file_tmp} + --api_source_path ${strings_api_source_file_tmp} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_header_file_tmp} + ${strings_api_header_file} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_source_file_tmp} + ${strings_api_source_file} + COMMENT + "copy_if_different ${strings_api_header_file} ${strings_strings_api_source_file}" + DEPENDS ${strings_api_yaml_file} ${strings_api_gen_file} ${api_gen_base} + ${api_gen_file} VERBATIM) # generate dygraph(intermediate) api add_custom_command( OUTPUT ${dygraph_api_header_file} ${dygraph_api_source_file} - COMMAND ${PYTHON_EXECUTABLE} ${im_api_gen_file} - --api_yaml_path ${api_yaml_file} ${new_api_yaml_file} - --sparse_api_yaml_path ${sparse_api_yaml_file} - --dygraph_api_header_path ${dygraph_api_header_file_tmp} - --dygraph_api_source_path ${dygraph_api_source_file_tmp} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_header_file_tmp} ${dygraph_api_header_file} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_source_file_tmp} ${dygraph_api_source_file} - DEPENDS ${api_yaml_file} ${sparse_api_yaml_file} ${im_api_gen_file} ${api_gen_base} ${api_gen_file} + COMMAND + ${PYTHON_EXECUTABLE} ${im_api_gen_file} --api_yaml_path ${api_yaml_file} + ${new_api_yaml_file} --sparse_api_yaml_path ${sparse_api_yaml_file} + --dygraph_api_header_path ${dygraph_api_header_file_tmp} + --dygraph_api_source_path ${dygraph_api_source_file_tmp} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_header_file_tmp} + ${dygraph_api_header_file} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_source_file_tmp} + ${dygraph_api_source_file} + DEPENDS ${api_yaml_file} ${sparse_api_yaml_file} ${im_api_gen_file} + ${api_gen_base} ${api_gen_file} VERBATIM) # generate wrapped infermeta add_custom_command( OUTPUT ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} - COMMAND ${PYTHON_EXECUTABLE} ${wrapped_infermeta_gen_file} - --api_yaml_path ${api_yaml_file} ${new_api_yaml_file} - --wrapped_infermeta_header_path ${wrapped_infermeta_header_file} - --wrapped_infermeta_source_path ${wrapped_infermeta_source_file} + COMMAND + ${PYTHON_EXECUTABLE} ${wrapped_infermeta_gen_file} --api_yaml_path + ${api_yaml_file} ${new_api_yaml_file} --wrapped_infermeta_header_path + ${wrapped_infermeta_header_file} --wrapped_infermeta_source_path + ${wrapped_infermeta_source_file} DEPENDS ${api_yaml_file} ${wrapped_infermeta_gen_file} ${api_gen_base} VERBATIM) -cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw) -cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi) -cc_library(context_pool SRCS context_pool.cc DEPS phi_context phi_enforce place) +cc_library( + op_meta_info + SRCS op_meta_info.cc + DEPS phi_tensor_raw) +cc_library( + wrapped_infermeta + SRCS ${wrapped_infermeta_source_file} + DEPS phi) +cc_library( + context_pool + SRCS context_pool.cc + DEPS phi_context phi_enforce place) -cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory context_pool) -cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor) -cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel copy_kernel tensor) -cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform) -cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform tensor_copy) +cc_library( + kernel_dispatch + SRCS kernel_dispatch.cc + DEPS phi_tensor_raw phi_context kernel_factory context_pool) +cc_library( + api_gen_utils + SRCS api_gen_utils.cc + DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor) +cc_library( + phi_data_transform + SRCS data_transform.cc + DEPS phi_tensor_raw transfer_layout_kernel cast_kernel copy_kernel tensor) +cc_library( + api_custom_impl + SRCS api_custom_impl.cc + DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta + phi_data_transform) +cc_library( + sparse_api_custom_impl + SRCS sparse_api_custom_impl.cc + DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform + tensor_copy) -cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl) -cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl global_utils) -cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) -cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl) -cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform phi_function_api sparse_api) -cc_library(strings_api SRCS ${strings_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils) -cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api strings_api) -cc_library(tensor_copy SRCS tensor_copy.cc DEPS phi_tensor_raw copy_kernel kernel_dispatch api_gen_utils) -cc_library(api_scalar SRCS scalar.cc DEPS tensor_copy) -cc_library(api_int_array SRCS int_array.cc DEPS tensor_copy) +cc_library( + phi_function_api + SRCS ${api_source_file} + DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform + api_custom_impl) +cc_library( + phi_bw_function_api + SRCS ${bw_api_source_file} + DEPS phi_tensor_raw + phi + kernel_dispatch + api_gen_utils + backward_infermeta + phi_data_transform + phi_function_api + api_custom_impl + global_utils) +cc_library( + sparse_api + SRCS ${sparse_api_source_file} + DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) +cc_library( + sparse_bw_api + SRCS ${sparse_bw_api_source_file} + DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api + sparse_api_custom_impl) +cc_library( + phi_dygraph_api + SRCS ${dygraph_api_source_file} + DEPS phi_tensor_raw + phi + kernel_dispatch + api_gen_utils + phi_data_transform + phi_function_api + sparse_api) +cc_library( + strings_api + SRCS ${strings_api_source_file} + DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils) +cc_library( + phi_tensor + SRCS tensor_method.cc + DEPS phi_tensor_raw + phi_function_api + api_gen_utils + kernel_dispatch + infermeta + sparse_api + strings_api) +cc_library( + tensor_copy + SRCS tensor_copy.cc + DEPS phi_tensor_raw copy_kernel kernel_dispatch api_gen_utils) +cc_library( + api_scalar + SRCS scalar.cc + DEPS tensor_copy) +cc_library( + api_int_array + SRCS int_array.cc + DEPS tensor_copy) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 3ef7763d57e8b..5ca7f2b51edd2 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/api_custom_impl.h" +#include "glog/logging.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" @@ -28,8 +29,6 @@ limitations under the License. */ #include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/infermeta/unary.h" -#include "glog/logging.h" - namespace paddle { namespace experimental { diff --git a/paddle/phi/api/lib/backend_set.h b/paddle/phi/api/lib/backend_set.h index 2aa4f969221d9..93f8f05b74b75 100644 --- a/paddle/phi/api/lib/backend_set.h +++ b/paddle/phi/api/lib/backend_set.h @@ -32,8 +32,9 @@ class BackendSet final { public: constexpr BackendSet() : bitset_(0) {} explicit constexpr BackendSet(Backend b) - : bitset_(b == Backend::UNDEFINED ? 0 : 1ULL << (static_cast(b) - - 1)) {} + : bitset_(b == Backend::UNDEFINED + ? 0 + : 1ULL << (static_cast(b) - 1)) {} inline uint64_t bitset() const { return bitset_; } diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 12f7b8bba5870..4803616812cd0 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +// clang-format off #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" @@ -23,6 +24,7 @@ limitations under the License. */ #include "paddle/phi/kernels/transfer_layout_kernel.h" #include "paddle/fluid/framework/tensor_util.h" +// clang-format on namespace paddle { namespace experimental { diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.cc b/paddle/phi/api/lib/sparse_api_custom_impl.cc index 71ba8eaae2d36..0b93c96e7f81d 100644 --- a/paddle/phi/api/lib/sparse_api_custom_impl.cc +++ b/paddle/phi/api/lib/sparse_api_custom_impl.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/sparse_api_custom_impl.h" #include + #include "glog/logging.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index a340c0fed10d8..74364d5ab0373 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +// clang-format off #include "paddle/phi/api/include/tensor.h" #include @@ -34,6 +35,7 @@ limitations under the License. */ #include "paddle/phi/core/tensor_utils.h" #include "paddle/fluid/platform/stream/cuda_stream.h" +// clang-format off namespace paddle { namespace experimental { diff --git a/paddle/phi/api/lib/tensor_copy.cc b/paddle/phi/api/lib/tensor_copy.cc index 85de3601fd96a..5f8c2ed71e939 100644 --- a/paddle/phi/api/lib/tensor_copy.cc +++ b/paddle/phi/api/lib/tensor_copy.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/api/lib/tensor_copy.h" + #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/core/compat/convert_utils.h" diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 5285392b4a6ac..fbeeb3332eadb 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +// clang-format off #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/int_array.h" @@ -22,6 +23,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/infermeta/unary.h" +// clang-format off namespace paddle { namespace experimental { diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index 0e1cd0cb83fd4..ef99a1586285e 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,13 @@ -cc_library(phi_api_utils SRCS tensor_utils.cc DEPS -tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits string_tensor int_array scalar) +cc_library( + phi_api_utils + SRCS tensor_utils.cc + DEPS tensor_base + convert_utils + dense_tensor + lod_tensor + selected_rows_utils + place + var_type_traits + string_tensor + int_array + scalar) diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h index 36a0901bbe980..f930f5b11f64f 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.h +++ b/paddle/phi/api/lib/utils/tensor_utils.h @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/variable.h" - #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 5f61615554645..c981b625192da 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -12,7 +12,10 @@ if(WITH_XPU) add_subdirectory(xpu) endif() -cc_library(phi_context SRCS all_context.cc DEPS device_context cpu_context) +cc_library( + phi_context + SRCS all_context.cc + DEPS device_context cpu_context) if(WITH_XPU) add_dependencies(phi_context xpu_context) @@ -24,11 +27,31 @@ endif() if(WITH_CUSTOM_DEVICE) add_dependencies(phi_context custom_context) - cc_library(callback_manager SRCS callback_manager.cc DEPS enforce place) - cc_library(device_guard SRCS device_guard.cc DEPS enforce place) - cc_library(stream SRCS stream.cc DEPS callback_manager) - cc_library(event SRCS event.cc DEPS enforce place) - cc_library(device_base SRCS device_base.cc DEPS stream event callback_manager device_guard device_context flags) - cc_library(device_manager SRCS device_manager.cc DEPS custom_device) - set(GLOB_DEV_LIB device_manager custom_device CACHE INTERNAL "Global DEV library") + cc_library( + callback_manager + SRCS callback_manager.cc + DEPS enforce place) + cc_library( + device_guard + SRCS device_guard.cc + DEPS enforce place) + cc_library( + stream + SRCS stream.cc + DEPS callback_manager) + cc_library( + event + SRCS event.cc + DEPS enforce place) + cc_library( + device_base + SRCS device_base.cc + DEPS stream event callback_manager device_guard device_context flags) + cc_library( + device_manager + SRCS device_manager.cc + DEPS custom_device) + set(GLOB_DEV_LIB + device_manager custom_device + CACHE INTERNAL "Global DEV library") endif() diff --git a/paddle/phi/backends/callback_manager.cc b/paddle/phi/backends/callback_manager.cc index 4a958ef73bfc6..295f70fc65cd7 100644 --- a/paddle/phi/backends/callback_manager.cc +++ b/paddle/phi/backends/callback_manager.cc @@ -13,11 +13,12 @@ // limitations under the License. #include "paddle/phi/backends/callback_manager.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/fluid/platform/enforce.h" #include +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/enforce.h" + namespace phi { CallbackManager::CallbackManager(stream::Stream *stream) diff --git a/paddle/phi/backends/cpu/CMakeLists.txt b/paddle/phi/backends/cpu/CMakeLists.txt index 82ea42566fc1f..e32aa17758b2b 100644 --- a/paddle/phi/backends/cpu/CMakeLists.txt +++ b/paddle/phi/backends/cpu/CMakeLists.txt @@ -1,6 +1,12 @@ if(WITH_MKLDNN) # TODO(wilber): support mkldnn context. - cc_library(cpu_context SRCS cpu_context.cc DEPS phi_device_context mkldnn eigen3) + cc_library( + cpu_context + SRCS cpu_context.cc + DEPS phi_device_context mkldnn eigen3) else() - cc_library(cpu_context SRCS cpu_context.cc DEPS phi_device_context eigen3) + cc_library( + cpu_context + SRCS cpu_context.cc + DEPS phi_device_context eigen3) endif() diff --git a/paddle/phi/backends/custom/CMakeLists.txt b/paddle/phi/backends/custom/CMakeLists.txt index 5b46afb4ce9ee..d8ed6706eba22 100644 --- a/paddle/phi/backends/custom/CMakeLists.txt +++ b/paddle/phi/backends/custom/CMakeLists.txt @@ -1,5 +1,14 @@ -if (WITH_CUSTOM_DEVICE) - cc_library(custom_context SRCS custom_context.cc DEPS phi_device_context device_manager) - cc_library(custom_device SRCS custom_device.cc DEPS device_base device_context) - cc_test(custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context) +if(WITH_CUSTOM_DEVICE) + cc_library( + custom_context + SRCS custom_context.cc + DEPS phi_device_context device_manager) + cc_library( + custom_device + SRCS custom_device.cc + DEPS device_base device_context) + cc_test( + custom_device_test + SRCS custom_device_test.cc + DEPS device_manager device_context) endif() diff --git a/paddle/phi/backends/custom/custom_context.h b/paddle/phi/backends/custom/custom_context.h index 37b0ee21219b5..57be8534fa954 100644 --- a/paddle/phi/backends/custom/custom_context.h +++ b/paddle/phi/backends/custom/custom_context.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/phi/common/place.h" #include "paddle/phi/core/device_context.h" diff --git a/paddle/phi/backends/custom/custom_device_test.cc b/paddle/phi/backends/custom/custom_device_test.cc index 53b88f9b4ac79..51fa74b4dc5f3 100644 --- a/paddle/phi/backends/custom/custom_device_test.cc +++ b/paddle/phi/backends/custom/custom_device_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/phi/backends/device_base.cc b/paddle/phi/backends/device_base.cc index b72c6efd51f2c..e57653702c538 100644 --- a/paddle/phi/backends/device_base.cc +++ b/paddle/phi/backends/device_base.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/backends/device_base.h" + #include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/core/enforce.h" @@ -214,8 +215,9 @@ size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) { size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb : FLAGS_initial_gpu_memory_in_mb; size_t alloc_bytes = - (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * - FLAGS_fraction_of_gpu_memory_to_use); + (flag_mb > 0ul + ? flag_mb << 20 + : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use); PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes, phi::errors::ResourceExhausted( diff --git a/paddle/phi/backends/device_ext.h b/paddle/phi/backends/device_ext.h index 749d8d323b62d..ff58f4f35fd32 100644 --- a/paddle/phi/backends/device_ext.h +++ b/paddle/phi/backends/device_ext.h @@ -34,7 +34,9 @@ typedef enum { C_INTERNAL_ERROR // plugin error } C_Status; -typedef struct C_Device_st { int id; } * C_Device; +typedef struct C_Device_st { + int id; +} * C_Device; typedef struct C_Stream_st* C_Stream; diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index 18d51687ef121..56d99ba43bdd1 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -19,11 +19,10 @@ #include "paddle/phi/backends/device_base.h" #include "paddle/phi/backends/device_ext.h" +#include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/backends/event.h" #include "paddle/phi/backends/stream.h" #include "paddle/phi/common/place.h" - -#include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/core/utils/rw_lock.h" namespace phi { diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index bc5ef3cd5c078..91dbafe0cd38d 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -1,57 +1,94 @@ -cc_library(phi_dynamic_loader SRCS dynamic_loader.cc DEPS enforce glog gflags) +cc_library( + phi_dynamic_loader + SRCS dynamic_loader.cc + DEPS enforce glog gflags) -list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc) +list( + APPEND + CUDA_SRCS + cublas.cc + cublasLt.cc + cudnn.cc + curand.cc + cusolver.cc + cusparse.cc + nvtx.cc + cufft.cc) -if (NOT WITH_NV_JETSON) +if(NOT WITH_NV_JETSON) list(APPEND CUDA_SRCS nvjpeg.cc) endif() -if (WITH_ROCM) +if(WITH_ROCM) list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc) endif() # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on MacOS, and only do a early test on Linux and Windows. -if (NOT APPLE) +if(NOT APPLE) list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc) - if (WITH_NCCL) + if(WITH_NCCL) list(APPEND CUDA_SRCS nccl.cc) endif() - if (WITH_ROCM) + if(WITH_ROCM) list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc) - if (WITH_RCCL) + if(WITH_RCCL) list(APPEND HIP_SRCS rccl.cc) endif() endif() endif() -if (TENSORRT_FOUND) +if(TENSORRT_FOUND) list(APPEND CUDA_SRCS tensorrt.cc) endif() configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) -if (CUPTI_FOUND) +if(CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) if(WITH_ROCM) - hip_library(phi_dynload_cuda SRCS ${HIP_SRCS} DEPS phi_dynamic_loader) - cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc) -elseif (WITH_ASCEND_CL) - cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc npu_hccl) + hip_library( + phi_dynload_cuda + SRCS ${HIP_SRCS} + DEPS phi_dynamic_loader) + cc_library( + phi_dynload_warpctc + SRCS warpctc.cc + DEPS phi_dynamic_loader warpctc) +elseif(WITH_ASCEND_CL) + cc_library( + phi_dynload_warpctc + SRCS warpctc.cc + DEPS phi_dynamic_loader warpctc npu_hccl) else() - nv_library(phi_dynload_cuda SRCS ${CUDA_SRCS} DEPS phi_dynamic_loader) - cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc) + nv_library( + phi_dynload_cuda + SRCS ${CUDA_SRCS} + DEPS phi_dynamic_loader) + cc_library( + phi_dynload_warpctc + SRCS warpctc.cc + DEPS phi_dynamic_loader warpctc) endif() -if (WITH_MKLML) - cc_library(phi_dynload_mklml SRCS mklml.cc DEPS phi_dynamic_loader mklml) +if(WITH_MKLML) + cc_library( + phi_dynload_mklml + SRCS mklml.cc + DEPS phi_dynamic_loader mklml) endif() -cc_library(phi_dynload_lapack SRCS lapack.cc DEPS phi_dynamic_loader) +cc_library( + phi_dynload_lapack + SRCS lapack.cc + DEPS phi_dynamic_loader) add_dependencies(phi_dynload_lapack extern_lapack) # TODO(TJ): add iomp, mkldnn? -if (MKL_FOUND AND WITH_ONEMKL) +if(MKL_FOUND AND WITH_ONEMKL) message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}") - cc_library(phi_dynload_mklrt SRCS mklrt.cc DEPS phi_dynamic_loader) + cc_library( + phi_dynload_mklrt + SRCS mklrt.cc + DEPS phi_dynamic_loader) target_include_directories(phi_dynload_mklrt PRIVATE ${MKL_INCLUDE}) endif() diff --git a/paddle/phi/backends/dynload/cublas.h b/paddle/phi/backends/dynload/cublas.h index ee0696fb4b218..308ae2accef14 100644 --- a/paddle/phi/backends/dynload/cublas.h +++ b/paddle/phi/backends/dynload/cublas.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include // NOLINT #include diff --git a/paddle/phi/backends/dynload/cublasLt.h b/paddle/phi/backends/dynload/cublasLt.h index 4c7ac9c3f21c4..1e2a20ebdf440 100644 --- a/paddle/phi/backends/dynload/cublasLt.h +++ b/paddle/phi/backends/dynload/cublasLt.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include diff --git a/paddle/phi/backends/dynload/cuda_driver.h b/paddle/phi/backends/dynload/cuda_driver.h index f4ea70a81b91f..f743a33a1866f 100644 --- a/paddle/phi/backends/dynload/cuda_driver.h +++ b/paddle/phi/backends/dynload/cuda_driver.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/cudnn.cc b/paddle/phi/backends/dynload/cudnn.cc index 02d626d5f98f9..8aa3b623273d7 100644 --- a/paddle/phi/backends/dynload/cudnn.cc +++ b/paddle/phi/backends/dynload/cudnn.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/backends/dynload/cudnn.h" + #include "paddle/fluid/platform/enforce.h" namespace phi { diff --git a/paddle/phi/backends/dynload/cudnn.h b/paddle/phi/backends/dynload/cudnn.h index a3afb98e3e636..7b9004308e95b 100644 --- a/paddle/phi/backends/dynload/cudnn.h +++ b/paddle/phi/backends/dynload/cudnn.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #ifdef PADDLE_WITH_CUDA #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/cufft.cc b/paddle/phi/backends/dynload/cufft.cc index 596a68c1ed6aa..5a7080032d28d 100644 --- a/paddle/phi/backends/dynload/cufft.cc +++ b/paddle/phi/backends/dynload/cufft.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/backends/dynload/cufft.h" + #include "paddle/fluid/platform/enforce.h" namespace phi { diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h index 4697e335477ec..a27d7c3ab1eee 100644 --- a/paddle/phi/backends/dynload/cufft.h +++ b/paddle/phi/backends/dynload/cufft.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h index a526fbfd92639..22e21b78f4f2e 100644 --- a/paddle/phi/backends/dynload/cupti.h +++ b/paddle/phi/backends/dynload/cupti.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/curand.h b/paddle/phi/backends/dynload/curand.h index 875403b03bb81..f3c4496dc4d39 100644 --- a/paddle/phi/backends/dynload/curand.h +++ b/paddle/phi/backends/dynload/curand.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/cusolver.h b/paddle/phi/backends/dynload/cusolver.h index 40e5f183dc035..1354e31055480 100644 --- a/paddle/phi/backends/dynload/cusolver.h +++ b/paddle/phi/backends/dynload/cusolver.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/cusparse.h b/paddle/phi/backends/dynload/cusparse.h index 8f7d54d55dbc4..a7e305f98d49a 100644 --- a/paddle/phi/backends/dynload/cusparse.h +++ b/paddle/phi/backends/dynload/cusparse.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/hiprand.h b/paddle/phi/backends/dynload/hiprand.h index ccaf02d93047a..3e9502dd94d91 100644 --- a/paddle/phi/backends/dynload/hiprand.h +++ b/paddle/phi/backends/dynload/hiprand.h @@ -16,9 +16,9 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" namespace phi { namespace dynload { diff --git a/paddle/phi/backends/dynload/hiprtc.h b/paddle/phi/backends/dynload/hiprtc.h index 0404aad559394..75dd88f87bd3a 100644 --- a/paddle/phi/backends/dynload/hiprtc.h +++ b/paddle/phi/backends/dynload/hiprtc.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include + #include // NOLINT + #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/backends/dynload/port.h" diff --git a/paddle/phi/backends/dynload/lapack.cc b/paddle/phi/backends/dynload/lapack.cc index bb03beabd4ffc..9719da9775146 100644 --- a/paddle/phi/backends/dynload/lapack.cc +++ b/paddle/phi/backends/dynload/lapack.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/backends/dynload/lapack.h" + #include namespace phi { diff --git a/paddle/phi/backends/dynload/lapack.h b/paddle/phi/backends/dynload/lapack.h index c81c66c69282f..f0e1e9ad7a4c0 100644 --- a/paddle/phi/backends/dynload/lapack.h +++ b/paddle/phi/backends/dynload/lapack.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/platform/complex.h" #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/backends/dynload/port.h" diff --git a/paddle/phi/backends/dynload/miopen.cc b/paddle/phi/backends/dynload/miopen.cc index e7916873ccfde..9c58da1d6ff1a 100644 --- a/paddle/phi/backends/dynload/miopen.cc +++ b/paddle/phi/backends/dynload/miopen.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/backends/dynload/miopen.h" + #include "paddle/fluid/platform/enforce.h" namespace phi { diff --git a/paddle/phi/backends/dynload/miopen.h b/paddle/phi/backends/dynload/miopen.h index eb14bfe8ec543..eeaf8028ec312 100644 --- a/paddle/phi/backends/dynload/miopen.h +++ b/paddle/phi/backends/dynload/miopen.h @@ -14,10 +14,11 @@ limitations under the License. */ #pragma once #include - #include #include + #include // NOLINT + #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/backends/dynload/port.h" diff --git a/paddle/phi/backends/dynload/mklml.h b/paddle/phi/backends/dynload/mklml.h index 5f5520a831eb1..0f0c31f8064df 100644 --- a/paddle/phi/backends/dynload/mklml.h +++ b/paddle/phi/backends/dynload/mklml.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/mklrt.h b/paddle/phi/backends/dynload/mklrt.h index 8638d83d025bd..0267fb69a5932 100644 --- a/paddle/phi/backends/dynload/mklrt.h +++ b/paddle/phi/backends/dynload/mklrt.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/nccl.h b/paddle/phi/backends/dynload/nccl.h index b04ef0f0651eb..6c73c562caa69 100644 --- a/paddle/phi/backends/dynload/nccl.h +++ b/paddle/phi/backends/dynload/nccl.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/nvjpeg.h b/paddle/phi/backends/dynload/nvjpeg.h index 13bb8a5698f15..6e71e6b582c05 100644 --- a/paddle/phi/backends/dynload/nvjpeg.h +++ b/paddle/phi/backends/dynload/nvjpeg.h @@ -12,6 +12,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/nvrtc.h b/paddle/phi/backends/dynload/nvrtc.h index 516ca7686d253..9244e9487b250 100644 --- a/paddle/phi/backends/dynload/nvrtc.h +++ b/paddle/phi/backends/dynload/nvrtc.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/nvtx.h b/paddle/phi/backends/dynload/nvtx.h index e9fd32668dc80..a9a166b289e33 100644 --- a/paddle/phi/backends/dynload/nvtx.h +++ b/paddle/phi/backends/dynload/nvtx.h @@ -15,6 +15,7 @@ limitations under the License. */ #ifndef _WIN32 #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/port.h b/paddle/phi/backends/dynload/port.h index 981e5f5af644e..d380993c9b67a 100644 --- a/paddle/phi/backends/dynload/port.h +++ b/paddle/phi/backends/dynload/port.h @@ -28,6 +28,7 @@ #include // dladdr #include #include + #include // std::accumulate #else #ifndef NOMINMAX @@ -40,6 +41,7 @@ #include #include #include + #include // std::accumulate in msvc #ifndef S_ISDIR // windows port for sys/stat.h #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) diff --git a/paddle/phi/backends/dynload/rccl.h b/paddle/phi/backends/dynload/rccl.h index 4472684962832..2da35dc2df2db 100644 --- a/paddle/phi/backends/dynload/rccl.h +++ b/paddle/phi/backends/dynload/rccl.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include // NOLINT + #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/backends/dynload/port.h" diff --git a/paddle/phi/backends/dynload/rocblas.h b/paddle/phi/backends/dynload/rocblas.h index 18061b192e465..a9804b3d82a7d 100644 --- a/paddle/phi/backends/dynload/rocblas.h +++ b/paddle/phi/backends/dynload/rocblas.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include diff --git a/paddle/phi/backends/dynload/rocm_driver.h b/paddle/phi/backends/dynload/rocm_driver.h index 59e35b787a599..4e456db44c904 100644 --- a/paddle/phi/backends/dynload/rocm_driver.h +++ b/paddle/phi/backends/dynload/rocm_driver.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/tensorrt.cc b/paddle/phi/backends/dynload/tensorrt.cc index cc3b4e0146088..4552570102025 100644 --- a/paddle/phi/backends/dynload/tensorrt.cc +++ b/paddle/phi/backends/dynload/tensorrt.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/tensorrt.h" + #include namespace phi { diff --git a/paddle/phi/backends/event.cc b/paddle/phi/backends/event.cc index a474536f865c1..43077d280f360 100644 --- a/paddle/phi/backends/event.cc +++ b/paddle/phi/backends/event.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/backends/event.h" + #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/phi/backends/device_guard.h" #include "paddle/phi/backends/stream.h" diff --git a/paddle/phi/backends/gpu/CMakeLists.txt b/paddle/phi/backends/gpu/CMakeLists.txt index ebe8f1ca4c101..6d9f2de67d530 100644 --- a/paddle/phi/backends/gpu/CMakeLists.txt +++ b/paddle/phi/backends/gpu/CMakeLists.txt @@ -1,10 +1,22 @@ if(WITH_GPU) add_subdirectory(cuda) - nv_library(phi_gpu_info SRCS gpu_info.cc DEPS phi_cuda_info gflags glog enforce phi_dynload_cuda) + nv_library( + phi_gpu_info + SRCS gpu_info.cc + DEPS phi_cuda_info gflags glog enforce phi_dynload_cuda) elseif(WITH_ROCM) add_subdirectory(rocm) - hip_library(phi_gpu_info SRCS gpu_info.cc DEPS phi_rocm_info gflags glog enforce phi_dynload_cuda) + hip_library( + phi_gpu_info + SRCS gpu_info.cc + DEPS phi_rocm_info gflags glog enforce phi_dynload_cuda) endif() -cc_library(gpu_resources SRCS gpu_resources.cc DEPS phi_device_context phi_gpu_info) -cc_library(gpu_context SRCS gpu_context.cc DEPS phi_device_context phi_gpu_info eigen3 gpu_resources) +cc_library( + gpu_resources + SRCS gpu_resources.cc + DEPS phi_device_context phi_gpu_info) +cc_library( + gpu_context + SRCS gpu_context.cc + DEPS phi_device_context phi_gpu_info eigen3 gpu_resources) diff --git a/paddle/phi/backends/gpu/cuda/CMakeLists.txt b/paddle/phi/backends/gpu/cuda/CMakeLists.txt index a3393f97d7559..9765f5dc03b5a 100644 --- a/paddle/phi/backends/gpu/cuda/CMakeLists.txt +++ b/paddle/phi/backends/gpu/cuda/CMakeLists.txt @@ -1 +1,4 @@ -nv_library(phi_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce phi_dynload_cuda) +nv_library( + phi_cuda_info + SRCS cuda_info.cc + DEPS gflags glog enforce phi_dynload_cuda) diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h index 08670832c775f..c62addfd257ab 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_helper.h +++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h @@ -60,7 +60,7 @@ namespace gpu { * } * } * -*/ + */ #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index e8c264b884fe3..f51f287ee4a08 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include #include "glog/logging.h" - #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_info.h" diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index db9f287041dfb..5246155131dbe 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_helper.h" diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h index 443830acf4793..323565c000a1c 100644 --- a/paddle/phi/backends/gpu/gpu_info.h +++ b/paddle/phi/backends/gpu/gpu_info.h @@ -14,6 +14,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include + #include #include #include diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 888b44632ea28..2dd1431ff58bb 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -25,9 +25,11 @@ #endif #include + #include #include #include + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/enforce.h" @@ -95,9 +97,9 @@ struct GpuLaunchConfig { }; /* According to NVIDIA, if number of threads per block is 64/128/256/512, - * cuda performs better. And number of blocks should be greater (at least - * 2x~4x) than number of SMs. Hence, SM count is took into account within - * this function to determine the right number of threads per block. */ + * cuda performs better. And number of blocks should be greater (at least + * 2x~4x) than number of SMs. Hence, SM count is took into account within + * this function to determine the right number of threads per block. */ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context, int64_t numel, int vec_size = 1) { diff --git a/paddle/phi/backends/gpu/gpu_resources.h b/paddle/phi/backends/gpu/gpu_resources.h index 07ccb6215409a..7bec5eebf5886 100644 --- a/paddle/phi/backends/gpu/gpu_resources.h +++ b/paddle/phi/backends/gpu/gpu_resources.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/common/place.h" diff --git a/paddle/phi/backends/gpu/rocm/CMakeLists.txt b/paddle/phi/backends/gpu/rocm/CMakeLists.txt index 257e4cc8afbcf..730aad5d2fd2b 100644 --- a/paddle/phi/backends/gpu/rocm/CMakeLists.txt +++ b/paddle/phi/backends/gpu/rocm/CMakeLists.txt @@ -1 +1,4 @@ -hip_library(phi_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce phi_dynload_cuda) +hip_library( + phi_rocm_info + SRCS rocm_info.cc + DEPS gflags glog enforce phi_dynload_cuda) diff --git a/paddle/phi/backends/gpu/rocm/rocm_helper.h b/paddle/phi/backends/gpu/rocm/rocm_helper.h index 2d75b6ea4cb71..14e9ca660bdf9 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_helper.h +++ b/paddle/phi/backends/gpu/rocm/rocm_helper.h @@ -60,7 +60,7 @@ namespace gpu { * } * } * -*/ + */ #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \ diff --git a/paddle/phi/backends/gpu/rocm/rocm_info.cc b/paddle/phi/backends/gpu/rocm/rocm_info.cc index 23e58d34b2572..b89d5a3c1624f 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_info.cc +++ b/paddle/phi/backends/gpu/rocm/rocm_info.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/phi/backends/gpu/gpu_info.h" // TODO(phi): remove fluid headers. diff --git a/paddle/phi/backends/stream.cc b/paddle/phi/backends/stream.cc index 30939f31fcc3c..f8b15bdbd9e63 100644 --- a/paddle/phi/backends/stream.cc +++ b/paddle/phi/backends/stream.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/backends/stream.h" + #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/phi/backends/device_guard.h" #include "paddle/phi/backends/event.h" diff --git a/paddle/phi/backends/xpu/CMakeLists.txt b/paddle/phi/backends/xpu/CMakeLists.txt index 4d885757bb1a6..861b57956ba8e 100644 --- a/paddle/phi/backends/xpu/CMakeLists.txt +++ b/paddle/phi/backends/xpu/CMakeLists.txt @@ -1,2 +1,8 @@ -cc_library(phi_xpu_info SRCS xpu_info.cc DEPS enforce xpulib phi_place) -cc_library(xpu_context SRCS xpu_context.cc DEPS phi_device_context phi_xpu_info) +cc_library( + phi_xpu_info + SRCS xpu_info.cc + DEPS enforce xpulib phi_place) +cc_library( + xpu_context + SRCS xpu_context.cc + DEPS phi_device_context phi_xpu_info) diff --git a/paddle/phi/backends/xpu/enforce_xpu.h b/paddle/phi/backends/xpu/enforce_xpu.h index 29b048ead852d..30095e3a0074a 100644 --- a/paddle/phi/backends/xpu/enforce_xpu.h +++ b/paddle/phi/backends/xpu/enforce_xpu.h @@ -14,11 +14,10 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/backends/xpu/xpu_header.h" #include "xpu/bkcl.h" -#include "paddle/fluid/platform/enforce.h" - namespace phi { namespace backends { namespace xpu { diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index 7cc9eb44bc488..dbff88c0a2709 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -18,7 +18,6 @@ #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/common/place.h" - #include "xpu/runtime.h" #include "xpu/runtime_ex.h" #include "xpu/xdnn.h" @@ -86,8 +85,8 @@ struct XPUContext::Impl { void Init() { owned_ = true; backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); - LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " - << static_cast(place_.device); + LOG_FIRST_N(WARNING, 1) + << "Please NOTE: xpu device: " << static_cast(place_.device); context_ = xpu::create_context(); xpu_version_ = backends::xpu::get_xpu_version(place_.device); SetL3Cache(); diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h index b87489c567cab..d39b3c9cc1ff7 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -15,12 +15,12 @@ limitations under the License. */ #pragma once #include -#include "paddle/phi/backends/xpu/forwards.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/device_context.h" +#include "paddle/phi/backends/xpu/forwards.h" #include "paddle/phi/backends/xpu/xpu_header.h" #include "paddle/phi/backends/xpu/xpu_info.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/device_context.h" namespace xpu = baidu::xpu::api; diff --git a/paddle/phi/backends/xpu/xpu_header.h b/paddle/phi/backends/xpu/xpu_header.h index 5337f78c64207..1fe6f6d07796f 100644 --- a/paddle/phi/backends/xpu/xpu_header.h +++ b/paddle/phi/backends/xpu/xpu_header.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" - #include "xpu/runtime.h" #include "xpu/runtime_ex.h" #include "xpu/xdnn.h" diff --git a/paddle/phi/backends/xpu/xpu_info.h b/paddle/phi/backends/xpu/xpu_info.h index b1056cdc4b14b..9d5f073eaa8e6 100644 --- a/paddle/phi/backends/xpu/xpu_info.h +++ b/paddle/phi/backends/xpu/xpu_info.h @@ -12,6 +12,7 @@ limitations under the License. */ #include #include + #include "paddle/phi/common/place.h" namespace phi { diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index b1ca4d1f8a8c6..d9266bd06d278 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1,3 +1,9 @@ cc_library(phi_place SRCS place.cc) -cc_library(scalar SRCS scalar.cc DEPS phi_enforce tensor) -cc_library(int_array SRCS int_array.cc DEPS phi_enforce tensor) +cc_library( + scalar + SRCS scalar.cc + DEPS phi_enforce tensor) +cc_library( + int_array + SRCS int_array.cc + DEPS phi_enforce tensor) diff --git a/paddle/phi/common/data_type.h b/paddle/phi/common/data_type.h index 1792cb9370673..ef9b425048298 100644 --- a/paddle/phi/common/data_type.h +++ b/paddle/phi/common/data_type.h @@ -14,11 +14,10 @@ limitations under the License. */ #pragma once +#include "paddle/phi/api/ext/exception.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" - -#include "paddle/phi/api/ext/exception.h" #include "paddle/phi/common/pstring.h" namespace paddle { diff --git a/paddle/phi/common/int_array.cc b/paddle/phi/common/int_array.cc index daed2b6625a9e..81701ee010ca2 100644 --- a/paddle/phi/common/int_array.cc +++ b/paddle/phi/common/int_array.cc @@ -14,9 +14,8 @@ limitations under the License. */ #include "paddle/phi/common/int_array.h" -#include "paddle/phi/common/place.h" - #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/place.h" namespace paddle { namespace experimental { diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index 667d0a32b93da..c15a17651b18b 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "glog/logging.h" - #include "paddle/phi/api/ext/exception.h" namespace phi { diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc index 41f1c9541823d..2954af086ac4c 100644 --- a/paddle/phi/common/scalar.cc +++ b/paddle/phi/common/scalar.cc @@ -14,11 +14,10 @@ limitations under the License. */ #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/enforce.h" - #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" namespace paddle { namespace experimental { diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 41f654bfc8f30..8b180a2c2aeff 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -6,30 +6,78 @@ set(phi_enforce_deps errors flags) if(WITH_GPU) set(phi_enforce_deps ${phi_enforce_deps} external_error_proto) endif() -cc_library(phi_enforce SRCS enforce.cc DEPS ${phi_enforce_deps}) +cc_library( + phi_enforce + SRCS enforce.cc + DEPS ${phi_enforce_deps}) -cc_library(kernel_factory SRCS kernel_factory.cc DEPS phi_enforce fluid_convert_utils) -cc_library(kernel_context SRCS kernel_context.cc DEPS phi_enforce phi_context) +cc_library( + kernel_factory + SRCS kernel_factory.cc + DEPS phi_enforce fluid_convert_utils) +cc_library( + kernel_context + SRCS kernel_context.cc + DEPS phi_enforce phi_context) -cc_library(ddim SRCS ddim.cc DEPS phi_enforce) -cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS phi_enforce) -cc_library(tensor_meta SRCS tensor_meta.cc DEPS phi_enforce) -cc_library(lod_utils SRCS lod_utils.cc DEPS phi_enforce) +cc_library( + ddim + SRCS ddim.cc + DEPS phi_enforce) +cc_library( + tensor_base + SRCS tensor_base.cc allocator.cc + DEPS phi_enforce) +cc_library( + tensor_meta + SRCS tensor_meta.cc + DEPS phi_enforce) +cc_library( + lod_utils + SRCS lod_utils.cc + DEPS phi_enforce) -cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS convert_utils fluid_convert_utils tensor_meta tensor_base) -cc_library(sparse_coo_tensor SRCS sparse_coo_tensor.cc DEPS tensor_meta tensor_base) -cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_base) -cc_library(string_tensor SRCS string_tensor.cc DEPS convert_utils tensor_meta tensor_base) +cc_library( + dense_tensor + SRCS dense_tensor.cc dense_tensor_impl.cc + DEPS convert_utils fluid_convert_utils tensor_meta tensor_base) +cc_library( + sparse_coo_tensor + SRCS sparse_coo_tensor.cc + DEPS tensor_meta tensor_base) +cc_library( + sparse_csr_tensor + SRCS sparse_csr_tensor.cc + DEPS dense_tensor tensor_base) +cc_library( + string_tensor + SRCS string_tensor.cc + DEPS convert_utils tensor_meta tensor_base) -cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) -cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(selected_rows SRCS selected_rows_impl.cc selected_rows.cc DEPS tensor_base dense_tensor phi_enforce ddim memcpy) -cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) +cc_library( + meta_tensor + SRCS meta_tensor.cc + DEPS tensor_base tensor_meta dense_tensor) +cc_library( + infermeta_utils + SRCS infermeta_utils.cc + DEPS meta_tensor) +cc_library( + selected_rows + SRCS selected_rows_impl.cc selected_rows.cc + DEPS tensor_base dense_tensor phi_enforce ddim memcpy) +cc_library( + phi_device_context + SRCS device_context.cc + DEPS dense_tensor selected_rows) -cc_library(custom_kernel SRCS custom_kernel.cc DEPS kernel_factory) +cc_library( + custom_kernel + SRCS custom_kernel.cc + DEPS kernel_factory) # Will remove once we implemented MKLDNN_Tensor if(WITH_MKLDNN) - add_dependencies(dense_tensor mkldnn) - add_dependencies(tensor_base mkldnn) + add_dependencies(dense_tensor mkldnn) + add_dependencies(tensor_base mkldnn) endif() diff --git a/paddle/phi/core/compat/CMakeLists.txt b/paddle/phi/core/compat/CMakeLists.txt index 3423e380970df..3fd9b74255c1d 100644 --- a/paddle/phi/core/compat/CMakeLists.txt +++ b/paddle/phi/core/compat/CMakeLists.txt @@ -1,5 +1,11 @@ -cc_library(arg_map_context SRCS arg_map_context.cc DEPS phi_enforce) -cc_library(op_utils SRCS op_utils.cc DEPS arg_map_context enforce) +cc_library( + arg_map_context + SRCS arg_map_context.cc + DEPS phi_enforce) +cc_library( + op_utils + SRCS op_utils.cc + DEPS arg_map_context enforce) set(convert_utils_deps data_type place op_utils) @@ -13,4 +19,7 @@ endif() if(WITH_CUSTOM_DEVICE) set(convert_utils_deps ${convert_utils_deps} device_manager) endif() -cc_library(convert_utils SRCS convert_utils.cc DEPS ${convert_utils_deps}) +cc_library( + convert_utils + SRCS convert_utils.cc + DEPS ${convert_utils_deps}) diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 8eb6524e79c0f..ae3b8924ece69 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include "glog/logging.h" - #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/phi/core/ddim.h b/paddle/phi/core/ddim.h index dd13081ddafff..794d7051aee58 100644 --- a/paddle/phi/core/ddim.h +++ b/paddle/phi/core/ddim.h @@ -238,10 +238,10 @@ int arity(const DDim& ddim); std::ostream& operator<<(std::ostream&, const DDim&); /** -* \brief Flatten dim to 3d -* e.g., DDim d = mak_ddim({1, 2, 3, 4, 5, 6}) -* flatten_to_3d(d, 2, 4); ===> {1*2, 3*4, 5*6} ===> {2, 12, 30} -*/ + * \brief Flatten dim to 3d + * e.g., DDim d = mak_ddim({1, 2, 3, 4, 5, 6}) + * flatten_to_3d(d, 2, 4); ===> {1*2, 3*4, 5*6} ===> {2, 12, 30} + */ DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims); // Reshape a tensor to a matrix. The matrix's first dimension(column length) diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index 06d3e435bc110..09098705b11e4 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/core/tensor_meta.h" /* @jim19930609: Move to MKLDNN_Tensor in the future - */ + */ #ifdef PADDLE_WITH_MKLDNN #include "dnnl.hpp" #endif diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 8c97b6bf223fb..a59b910b7e006 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -12,15 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/core/dense_tensor.h" - +#include "paddle/fluid/memory/malloc.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" - #include "paddle/phi/core/compat/convert_utils.h" - -#include "paddle/fluid/memory/malloc.h" +#include "paddle/phi/core/dense_tensor.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_utils.h" diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index 0f5f22b5bd1f4..ce57f4f627baa 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/core/device_context.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/selected_rows.h" diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h index d7c2c777ca632..45e4fbf64dc04 100644 --- a/paddle/phi/core/device_context.h +++ b/paddle/phi/core/device_context.h @@ -75,17 +75,17 @@ class PADDLE_API DeviceContext { void SetHostAllocator(const Allocator*); /** - * @brief Set the zero-size Allocator object. - * - * @param allocator - */ + * @brief Set the zero-size Allocator object. + * + * @param allocator + */ void SetZeroAllocator(const Allocator*); /** - * @brief Set the zero-size Allocator object. - * - * @param allocator - */ + * @brief Set the zero-size Allocator object. + * + * @param allocator + */ void SetPinnedAllocator(const Allocator*); /** @@ -135,10 +135,10 @@ class PADDLE_API DeviceContext { virtual void Wait() const {} /** - * @brief Set the generator for special op. - * - * @param Generator - */ + * @brief Set the generator for special op. + * + * @param Generator + */ void SetGenerator(Generator*); /** * @brief Get the generator object. @@ -148,10 +148,10 @@ class PADDLE_API DeviceContext { Generator* GetGenerator() const; /** - * @brief Set the host generator for special op. - * - * @param Generator - */ + * @brief Set the host generator for special op. + * + * @param Generator + */ void SetHostGenerator(Generator*); /** * @brief Get the host generator object. diff --git a/paddle/phi/core/enforce.cc b/paddle/phi/core/enforce.cc index ae6b0135b3222..91e0316ff7558 100644 --- a/paddle/phi/core/enforce.cc +++ b/paddle/phi/core/enforce.cc @@ -14,13 +14,12 @@ limitations under the License. */ #include "paddle/phi/core/enforce.h" +#include #include #include #include #include -#include - // is not suitable to be placed in the header file, // it will introduce a large number of unnecessary includes, and these type // declarations that depend on boost are also not suitable for the phi header diff --git a/paddle/phi/core/hostdevice.h b/paddle/phi/core/hostdevice.h index 0869df143235f..decebbe66a538 100644 --- a/paddle/phi/core/hostdevice.h +++ b/paddle/phi/core/hostdevice.h @@ -20,6 +20,7 @@ #if defined(__xpu__) #include + #include "xpu/kernel/cluster_header.h" #include "xpu/kernel/debug.h" #include "xpu/kernel/math.h" diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index d479147f06ba1..d864544e10dd8 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -15,7 +15,6 @@ #include "paddle/phi/core/kernel_factory.h" #include "glog/logging.h" - #include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 41e1e2b53a9e9..65f655d50375c 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -22,13 +22,12 @@ #include #include "paddle/phi/core/custom_kernel.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/core/kernel_utils.h" #include "paddle/phi/core/macros.h" #include "paddle/phi/core/type_defs.h" -#include "paddle/phi/core/enforce.h" - namespace phi { #define BACKEND(arg__) phi::Backend::arg__ @@ -58,16 +57,13 @@ struct KernelArgsParseFunctor { for (auto arg_type : args_type) { if (arg_type == std::type_index(typeid(const CPUContext&)) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - || - arg_type == std::type_index(typeid(const GPUContext&))) { + || arg_type == std::type_index(typeid(const GPUContext&))) { #elif defined(PADDLE_WITH_XPU) - || - arg_type == std::type_index(typeid(const XPUContext&))) { + || arg_type == std::type_index(typeid(const XPUContext&))) { #elif defined(PADDLE_WITH_CUSTOM_DEVICE) - || - arg_type == std::type_index(typeid(const CustomContext&))) { + || arg_type == std::type_index(typeid(const CustomContext&))) { #else - ) { + ) { #endif // do nothing, skip context arg now } else if (arg_type == std::type_index(typeid(const DenseTensor&))) { @@ -420,93 +416,93 @@ struct KernelRegistrar { PD_CONCATENATE(_PD_KERNEL_INSTANTIATION_, N) \ (meta_kernel_fn, backend, context, __VA_ARGS__) -#define _PD_KERNEL_INSTANTIATION_1( \ - meta_kernel_fn, backend, context, cpp_dtype) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn -#define _PD_KERNEL_INSTANTIATION_2( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_1( \ +#define _PD_KERNEL_INSTANTIATION_1( \ + meta_kernel_fn, backend, context, cpp_dtype) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn +#define _PD_KERNEL_INSTANTIATION_2( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_1( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_3( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_2( \ +#define _PD_KERNEL_INSTANTIATION_3( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_2( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_4( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_3( \ +#define _PD_KERNEL_INSTANTIATION_4( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_3( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_5( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_4( \ +#define _PD_KERNEL_INSTANTIATION_5( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_4( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_6( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_5( \ +#define _PD_KERNEL_INSTANTIATION_6( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_5( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_7( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_6( \ +#define _PD_KERNEL_INSTANTIATION_7( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_6( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_8( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_7( \ +#define _PD_KERNEL_INSTANTIATION_8( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_7( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_9( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_8( \ +#define _PD_KERNEL_INSTANTIATION_9( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_8( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_10( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_9( \ +#define _PD_KERNEL_INSTANTIATION_10( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_9( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_11( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_10( \ +#define _PD_KERNEL_INSTANTIATION_11( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_10( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_12( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_11( \ +#define _PD_KERNEL_INSTANTIATION_12( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_11( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_13( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_12( \ +#define _PD_KERNEL_INSTANTIATION_13( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_12( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_14( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_13( \ +#define _PD_KERNEL_INSTANTIATION_14( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_13( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_15( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_14( \ +#define _PD_KERNEL_INSTANTIATION_15( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_14( \ meta_kernel_fn, backend, context, __VA_ARGS__)) #define PD_KERNEL_REGISTRAR_INIT(reg_type, \ @@ -569,8 +565,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -592,8 +588,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -623,8 +619,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -654,8 +650,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -685,8 +681,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -716,8 +712,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -747,8 +743,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -778,8 +774,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -809,8 +805,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -840,8 +836,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -871,8 +867,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -902,8 +898,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -933,8 +929,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -964,8 +960,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -995,8 +991,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index d4765d1c4c3b4..3b5fd0247a484 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -233,9 +233,8 @@ template struct KernelImpl { static void Compute(KernelContext* ctx) { - KernelCallHelper>::template Compute<0, 0, 0, 0>(ctx); + KernelCallHelper>:: + template Compute<0, 0, 0, 0>(ctx); } static void VariadicCompute(const DeviceContext& dev_ctx, Args... args) { diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index d277f32d8ea9a..271759161868b 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "glog/logging.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" @@ -21,8 +22,6 @@ limitations under the License. */ #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" -#include "glog/logging.h" - namespace phi { // TODO(chenweihang): add other flags if needed diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 0a4e0d6191510..20cbf3dffcb16 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/core/string_tensor.h" + #include "paddle/fluid/memory/malloc.h" namespace phi { diff --git a/paddle/phi/core/tensor_base.cc b/paddle/phi/core/tensor_base.cc index 1b3628906af09..718bf09ff7eb9 100644 --- a/paddle/phi/core/tensor_base.cc +++ b/paddle/phi/core/tensor_base.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/core/tensor_base.h" + #include "paddle/phi/core/utils/type_registry.h" namespace phi {} diff --git a/paddle/phi/core/utils/intrusive_ptr.h b/paddle/phi/core/utils/intrusive_ptr.h index 2b7580192539f..e2e6cb7060d05 100644 --- a/paddle/phi/core/utils/intrusive_ptr.h +++ b/paddle/phi/core/utils/intrusive_ptr.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "glog/logging.h" #include "paddle/phi/core/enforce.h" diff --git a/paddle/phi/infermeta/CMakeLists.txt b/paddle/phi/infermeta/CMakeLists.txt index 1a19fd003222d..92b64ab4e666a 100644 --- a/paddle/phi/infermeta/CMakeLists.txt +++ b/paddle/phi/infermeta/CMakeLists.txt @@ -1,3 +1,9 @@ -cc_library(infermeta SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc DEPS convert_utils meta_tensor infermeta_utils) -cc_library(backward_infermeta SRCS backward.cc DEPS meta_tensor convert_utils) +cc_library( + infermeta + SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc + DEPS convert_utils meta_tensor infermeta_utils) +cc_library( + backward_infermeta + SRCS backward.cc + DEPS meta_tensor convert_utils) add_subdirectory(strings) diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 521eb03fd770f..f59ea5549bd71 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -313,10 +313,10 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, } void InstanceNormGradInferMeta(const MetaTensor& x, - const MetaTensor& y_grad, const MetaTensor& scale, const MetaTensor& saved_mean, const MetaTensor& saved_variance, + const MetaTensor& y_grad, float epsilon, MetaTensor* x_grad, MetaTensor* scale_grad, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 93e2d4c43bc3f..0e7ed640d8ffb 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -145,10 +145,10 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, MetaTensor* dx); void InstanceNormGradInferMeta(const MetaTensor& x, - const MetaTensor& y_grad, const MetaTensor& scale, const MetaTensor& saved_mean, const MetaTensor& saved_variance, + const MetaTensor& y_grad, float epsilon, MetaTensor* x_grad, MetaTensor* scale_grad, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index a8d5ad564fe9b..f10fc54795ddb 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 63f0d0c1eeb28..61c57981f94b5 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/infermeta/multiary.h" + #include + #include "paddle/phi/common/layout.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/phi/infermeta/strings/CMakeLists.txt b/paddle/phi/infermeta/strings/CMakeLists.txt index 3e1a947728f51..c2f891fe712eb 100644 --- a/paddle/phi/infermeta/strings/CMakeLists.txt +++ b/paddle/phi/infermeta/strings/CMakeLists.txt @@ -1 +1,4 @@ -cc_library(string_infermeta SRCS nullary.cc unary.cc DEPS convert_utils infermeta_utils) +cc_library( + string_infermeta + SRCS nullary.cc unary.cc + DEPS convert_utils infermeta_utils) diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 3c2888cee58c7..d84cc9e6d75af 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/infermeta/ternary.h" + #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/common_shape.h" diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 437c55c840f1a..67795c2a8aa6e 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -1,7 +1,14 @@ -set(kernel_declare_file ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.tmp CACHE INTERNAL "declarations.h file") -set(kernel_declare_file_final ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h) -file(WRITE ${kernel_declare_file} "// Generated by the paddle/phi/kernels/CMakeLists.txt. DO NOT EDIT!\n\n#pragma once\n\n") -file(APPEND ${kernel_declare_file} "#include \"paddle/phi/core/kernel_registry.h\"\n\n") +set(kernel_declare_file + ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.tmp + CACHE INTERNAL "declarations.h file") +set(kernel_declare_file_final + ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h) +file( + WRITE ${kernel_declare_file} + "// Generated by the paddle/phi/kernels/CMakeLists.txt. DO NOT EDIT!\n\n#pragma once\n\n" +) +file(APPEND ${kernel_declare_file} + "#include \"paddle/phi/core/kernel_registry.h\"\n\n") # phi functors and functions called by kernels add_subdirectory(funcs) @@ -13,8 +20,25 @@ add_subdirectory(autotune) set_property(GLOBAL PROPERTY PHI_KERNELS "") # [ 1. Common kernel compilation dependencies ] -set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor selected_rows_functor) +set(COMMON_KERNEL_DEPS + dense_tensor + sparse_coo_tensor + sparse_csr_tensor + kernel_context + kernel_factory + arg_map_context + convert_utils + lod_utils + custom_kernel) +set(COMMON_KERNEL_DEPS + ${COMMON_KERNEL_DEPS} + eigen_function + blas + math_function + im2col + vol2col + concat_and_split_functor + selected_rows_functor) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) @@ -30,50 +54,105 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. -set(AUTOTUNE_KERNELS conv_kernel conv_grad_kernel conv_grad_grad_kernel conv_transpose_kernel conv_transpose_grad_kernel) -set(MANUAL_BUILD_KERNELS ${AUTOTUNE_KERNELS} cross_entropy_kernel adam_kernel adamw_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel - gumbel_softmax_kernel gumbel_softmax_grad_kernel hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel - matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel - put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel - softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel - triangular_solve_grad_kernel determinant_grad_kernel reduce_sum_kernel reduce_mean_kernel rnn_kernel rnn_grad_kernel warpctc_kernel warpctc_grad_kernel) +set(AUTOTUNE_KERNELS conv_kernel conv_grad_kernel conv_grad_grad_kernel + conv_transpose_kernel conv_transpose_grad_kernel) +set(MANUAL_BUILD_KERNELS + ${AUTOTUNE_KERNELS} + cross_entropy_kernel + adam_kernel + adamw_kernel + deformable_conv_kernel + deformable_conv_grad_kernel + eigh_kernel + gumbel_softmax_kernel + gumbel_softmax_grad_kernel + hierarchical_sigmoid_kernel + hierarchical_sigmoid_grad_kernel + matrix_power_kernel + matrix_power_grad_kernel + maxout_kernel + maxout_grad_kernel + pool_kernel + put_along_axis_kernel + put_along_axis_grad_kernel + segment_pool_kernel + segment_pool_grad_kernel + softmax_kernel + softmax_grad_kernel + take_along_axis_kernel + take_along_axis_grad_kernel + triangular_solve_grad_kernel + determinant_grad_kernel + reduce_sum_kernel + reduce_mean_kernel + rnn_kernel + rnn_grad_kernel + warpctc_kernel + warpctc_grad_kernel) foreach(src ${AUTOTUNE_KERNELS}) kernel_library(${src} DEPS ${COMMON_KERNEL_DEPS} switch_autotune) endforeach() -kernel_library(adam_kernel DEPS gflags glog flags ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper) +kernel_library( + adam_kernel + DEPS + gflags + glog + flags + ${COMMON_KERNEL_DEPS} + selected_rows_functor + threadpool + jit_kernel_helper) kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel) -kernel_library(cross_entropy_kernel DEPS ${COMMON_KERNEL_DEPS} softmax cross_entropy) -kernel_library(deformable_conv_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor) -kernel_library(deformable_conv_grad_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor) -kernel_library(determinant_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) +kernel_library(cross_entropy_kernel DEPS ${COMMON_KERNEL_DEPS} softmax + cross_entropy) +kernel_library(deformable_conv_kernel DEPS ${COMMON_KERNEL_DEPS} + deformable_conv_functor) +kernel_library(deformable_conv_grad_kernel DEPS ${COMMON_KERNEL_DEPS} + deformable_conv_functor) +kernel_library(determinant_grad_kernel DEPS ${COMMON_KERNEL_DEPS} + matrix_inverse) kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) -kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) -kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) +kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS} + matrix_bit_code) +kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS} + matrix_bit_code) kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(reduce_sum_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel) kernel_library(reduce_mean_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel) kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) -kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) +kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} + matrix_inverse) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) kernel_library(maxout_grad_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) kernel_library(pool_kernel DEPS ${COMMON_KERNEL_DEPS} pooling) -kernel_library(put_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) -kernel_library(put_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(put_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} + gather_scatter_kernel) +kernel_library(put_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} + gather_scatter_kernel) kernel_library(segment_pool_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling) -kernel_library(segment_pool_grad_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling) +kernel_library(segment_pool_grad_kernel DEPS ${COMMON_KERNEL_DEPS} + segment_pooling) kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) -kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) -kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) -kernel_library(rnn_kernel DEPS ${COMMON_KERNEL_DEPS} concat_and_split_functor lstm_compute gru_compute) -kernel_library(rnn_grad_kernel DEPS ${COMMON_KERNEL_DEPS} concat_and_split_functor lstm_compute gru_compute) -kernel_library(warpctc_kernel DEPS ${COMMON_KERNEL_DEPS} phi_dynload_warpctc sequence_padding sequence_scale) -kernel_library(warpctc_grad_kernel DEPS ${COMMON_KERNEL_DEPS} phi_dynload_warpctc sequence_padding sequence_scale) +kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} + gather_scatter_kernel) +kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} + gather_scatter_kernel) +kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} + matrix_reduce) +kernel_library(rnn_kernel DEPS ${COMMON_KERNEL_DEPS} concat_and_split_functor + lstm_compute gru_compute) +kernel_library(rnn_grad_kernel DEPS ${COMMON_KERNEL_DEPS} + concat_and_split_functor lstm_compute gru_compute) +kernel_library(warpctc_kernel DEPS ${COMMON_KERNEL_DEPS} phi_dynload_warpctc + sequence_padding sequence_scale) +kernel_library(warpctc_grad_kernel DEPS ${COMMON_KERNEL_DEPS} + phi_dynload_warpctc sequence_padding sequence_scale) # 4. auto parse and build kernel targets by cmake -register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) +register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS + ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS}) # phi sparse kernels add_subdirectory(sparse) diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index 2349bf990acd3..3d8e4db08bba1 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -14,12 +14,11 @@ #include "paddle/phi/kernels/assign_kernel.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/utils/optional.h" -#include "paddle/fluid/framework/tensor_util.h" - namespace phi { template diff --git a/paddle/phi/kernels/auc_kernel.h b/paddle/phi/kernels/auc_kernel.h index acbd17c7801e2..f58c3ce112bd7 100644 --- a/paddle/phi/kernels/auc_kernel.h +++ b/paddle/phi/kernels/auc_kernel.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt index 63dc22459446f..a7a6c2f8e4dc0 100644 --- a/paddle/phi/kernels/autotune/CMakeLists.txt +++ b/paddle/phi/kernels/autotune/CMakeLists.txt @@ -1,12 +1,33 @@ -if (WITH_GPU) - nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) - nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) -elseif (WITH_ROCM) - hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) - hip_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) +if(WITH_GPU) + nv_test( + gpu_timer_test + SRCS gpu_timer_test.cu + DEPS gtest) + nv_test( + auto_tune_test + SRCS auto_tune_test.cu + DEPS gtest) +elseif(WITH_ROCM) + hip_test( + gpu_timer_test + SRCS gpu_timer_test.cu + DEPS gtest) + hip_test( + auto_tune_test + SRCS auto_tune_test.cu + DEPS gtest) endif() -cc_library(cache SRCS cache.cc DEPS boost) -cc_library(switch_autotune SRCS switch_autotune.cc DEPS cache flags) +cc_library( + cache + SRCS cache.cc + DEPS boost) +cc_library( + switch_autotune + SRCS switch_autotune.cc + DEPS cache flags) -cc_test(cache_test SRCS cache_test.cc DEPS gtest cache) +cc_test( + cache_test + SRCS cache_test.cc + DEPS gtest cache) diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h index eaf325dad7500..e18b854cf34b3 100644 --- a/paddle/phi/kernels/autotune/auto_tune_base.h +++ b/paddle/phi/kernels/autotune/auto_tune_base.h @@ -15,6 +15,7 @@ #pragma once #include + #include "glog/logging.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/autotune/gpu_timer.h" diff --git a/paddle/phi/kernels/autotune/auto_tune_test.cu b/paddle/phi/kernels/autotune/auto_tune_test.cu index f477cd1219331..c3918b8ebe59d 100644 --- a/paddle/phi/kernels/autotune/auto_tune_test.cu +++ b/paddle/phi/kernels/autotune/auto_tune_test.cu @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "glog/logging.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/all_context.h" @@ -66,8 +67,8 @@ float Algo(const phi::GPUContext& ctx, N); #else VLOG(3) << "Vecsize is " << Vecsize; - VecSumTest<<>>( - d_in_data, d_out_data, N); + VecSumTest + <<>>(d_in_data, d_out_data, N); #endif return Vecsize; } diff --git a/paddle/phi/kernels/autotune/cache.cc b/paddle/phi/kernels/autotune/cache.cc index ef2cbe633d496..5e2c9e1c742ff 100644 --- a/paddle/phi/kernels/autotune/cache.cc +++ b/paddle/phi/kernels/autotune/cache.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/autotune/cache.h" + #include + #include "glog/logging.h" namespace phi { diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index 37c5d134e8a61..9d7f57e96e373 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -19,6 +19,7 @@ #include #include #include + #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" diff --git a/paddle/phi/kernels/autotune/cache_test.cc b/paddle/phi/kernels/autotune/cache_test.cc index f99f8bfc8b821..53574c3d0c9ac 100644 --- a/paddle/phi/kernels/autotune/cache_test.cc +++ b/paddle/phi/kernels/autotune/cache_test.cc @@ -13,9 +13,12 @@ // limitations under the License. #include "paddle/phi/kernels/autotune/cache.h" + #include + #include #include + #include "glog/logging.h" enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 }; diff --git a/paddle/phi/kernels/autotune/gpu_timer_test.cu b/paddle/phi/kernels/autotune/gpu_timer_test.cu index b6eb345885f30..d24508dfa2064 100644 --- a/paddle/phi/kernels/autotune/gpu_timer_test.cu +++ b/paddle/phi/kernels/autotune/gpu_timer_test.cu @@ -13,7 +13,9 @@ // limitations under the License. #include + #include + #include "glog/logging.h" #include "paddle/phi/kernels/autotune/gpu_timer.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" diff --git a/paddle/phi/kernels/autotune/switch_autotune.h b/paddle/phi/kernels/autotune/switch_autotune.h index 1793940542d47..de638ac4eda75 100644 --- a/paddle/phi/kernels/autotune/switch_autotune.h +++ b/paddle/phi/kernels/autotune/switch_autotune.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/kernels/autotune/cache.h" namespace phi { diff --git a/paddle/phi/kernels/batch_norm_grad_kernel.h b/paddle/phi/kernels/batch_norm_grad_kernel.h index 3de2f69f452db..afbb0c78ca981 100644 --- a/paddle/phi/kernels/batch_norm_grad_kernel.h +++ b/paddle/phi/kernels/batch_norm_grad_kernel.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h index 5d24f6684a48f..79d5b8a445b48 100644 --- a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h +++ b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/broadcast_tensors_kernel.h b/paddle/phi/kernels/broadcast_tensors_kernel.h index 22b5201b6900d..dccaebcf41ffe 100644 --- a/paddle/phi/kernels/broadcast_tensors_kernel.h +++ b/paddle/phi/kernels/broadcast_tensors_kernel.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/channel_shuffle_grad_kernel.h b/paddle/phi/kernels/channel_shuffle_grad_kernel.h index ac89f3336bc76..d75d887d0fcd8 100644 --- a/paddle/phi/kernels/channel_shuffle_grad_kernel.h +++ b/paddle/phi/kernels/channel_shuffle_grad_kernel.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/channel_shuffle_kernel.h b/paddle/phi/kernels/channel_shuffle_kernel.h index 12de25606dd96..c15e06fb552bf 100644 --- a/paddle/phi/kernels/channel_shuffle_kernel.h +++ b/paddle/phi/kernels/channel_shuffle_kernel.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/conv_kernel.cc b/paddle/phi/kernels/conv_kernel.cc index 7268384f401a1..542a4ec8a61c8 100644 --- a/paddle/phi/kernels/conv_kernel.cc +++ b/paddle/phi/kernels/conv_kernel.cc @@ -14,9 +14,8 @@ #include "paddle/phi/kernels/conv_kernel.h" -#include "paddle/phi/core/kernel_registry.h" - #include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -41,8 +40,8 @@ void ConvInferKernel(const Context& dev_ctx, dilations, data_format, /*use_addto=*/false, - /*workspace_size_MB=*/paddle::platform:: - GetDefaultConvWorkspaceSizeLimitMB(), + /*workspace_size_MB=*/ + paddle::platform::GetDefaultConvWorkspaceSizeLimitMB(), /*exhaustive_search=*/false, out); } diff --git a/paddle/phi/kernels/conv_transpose_grad_kernel.h b/paddle/phi/kernels/conv_transpose_grad_kernel.h index 2b1c0c1a934cf..00d5fb51f01ee 100644 --- a/paddle/phi/kernels/conv_transpose_grad_kernel.h +++ b/paddle/phi/kernels/conv_transpose_grad_kernel.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/conv_transpose_kernel.h b/paddle/phi/kernels/conv_transpose_kernel.h index de56f13ddf73e..e39617e0e7c0c 100644 --- a/paddle/phi/kernels/conv_transpose_kernel.h +++ b/paddle/phi/kernels/conv_transpose_kernel.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc index 9f89fc27a7167..a10e0eed64aec 100644 --- a/paddle/phi/kernels/cpu/abs_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/abs_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/accuracy_kernel.cc b/paddle/phi/kernels/cpu/accuracy_kernel.cc index 6ff8a1f755897..17246de35db22 100644 --- a/paddle/phi/kernels/cpu/accuracy_kernel.cc +++ b/paddle/phi/kernels/cpu/accuracy_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/accuracy_kernel.h" #include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 165627839a308..bd3e16d54dcad 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/activation_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/activation_functor.h" diff --git a/paddle/phi/kernels/cpu/adagrad_kernel.cc b/paddle/phi/kernels/cpu/adagrad_kernel.cc index fcd89caf7fa29..d6867deff4c15 100644 --- a/paddle/phi/kernels/cpu/adagrad_kernel.cc +++ b/paddle/phi/kernels/cpu/adagrad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/adagrad_kernel.h" + #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/allclose_kernel.cc b/paddle/phi/kernels/cpu/allclose_kernel.cc index f95ddc5621e9a..c6a512aa95cb1 100644 --- a/paddle/phi/kernels/cpu/allclose_kernel.cc +++ b/paddle/phi/kernels/cpu/allclose_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/allclose_kernel.h" #include + #include "glog/logging.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/arange_kernel.cc b/paddle/phi/kernels/cpu/arange_kernel.cc index 478251b0d3b6a..7f7e555423176 100644 --- a/paddle/phi/kernels/cpu/arange_kernel.cc +++ b/paddle/phi/kernels/cpu/arange_kernel.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/arange_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/range_function.h" diff --git a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc index 7a519aab0ad71..3bc8c853a7b42 100644 --- a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" PD_REGISTER_KERNEL(atan2_grad, CPU, diff --git a/paddle/phi/kernels/cpu/atan2_kernel.cc b/paddle/phi/kernels/cpu/atan2_kernel.cc index df6f5f59ac005..4cb96ad8b6c6c 100644 --- a/paddle/phi/kernels/cpu/atan2_kernel.cc +++ b/paddle/phi/kernels/cpu/atan2_kernel.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" PD_REGISTER_KERNEL(atan2, CPU, diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index 366a08e59fee3..beda276c8ef3a 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/batch_norm_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" - -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpu/batch_norm_utils.h" diff --git a/paddle/phi/kernels/cpu/batch_norm_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_kernel.cc index 743128e8dea99..cb8af06b540f8 100644 --- a/paddle/phi/kernels/cpu/batch_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_kernel.cc @@ -13,12 +13,12 @@ // limitations under the License. #include "paddle/phi/kernels/batch_norm_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/fluid/framework/tensor_util.h" - namespace phi { template diff --git a/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc index 6859451e8be32..fc91af3ff71bc 100644 --- a/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/bce_loss_grad_kernel.h" #include // for max + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/bce_loss_kernel.cc b/paddle/phi/kernels/cpu/bce_loss_kernel.cc index 76b9793651484..9d62fabcbe736 100644 --- a/paddle/phi/kernels/cpu/bce_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/bce_loss_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/bce_loss_kernel.h" #include // for max + #include "paddle/fluid/operators/math.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/bernoulli_kernel.cc b/paddle/phi/kernels/cpu/bernoulli_kernel.cc index 09c07d9ec9dea..6bf548154a404 100644 --- a/paddle/phi/kernels/cpu/bernoulli_kernel.cc +++ b/paddle/phi/kernels/cpu/bernoulli_kernel.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/bernoulli_kernel.h" + #include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc b/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc index 2268212316af6..ef7e8a981c520 100644 --- a/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h" -#include "paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h" PD_REGISTER_KERNEL(bilinear_tensor_product_grad, CPU, diff --git a/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc b/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc index 25bc5913865a0..d822656418261 100644 --- a/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc +++ b/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/bilinear_tensor_product_kernel.h" -#include "paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h" PD_REGISTER_KERNEL(bilinear_tensor_product, CPU, diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc index 0869cd62024dc..413638e177222 100644 --- a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h" #include + #include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc index 4cb6db8769271..3ad26164d7d8d 100644 --- a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc +++ b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/broadcast_tensors_kernel.h" -#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h" PD_REGISTER_KERNEL(broadcast_tensors, CPU, diff --git a/paddle/phi/kernels/cpu/cast_kernel.cc b/paddle/phi/kernels/cpu/cast_kernel.cc index 2132f0d5ae86c..8abfa173fd06d 100644 --- a/paddle/phi/kernels/cpu/cast_kernel.cc +++ b/paddle/phi/kernels/cpu/cast_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/cast_kernel.h" -#include "paddle/phi/kernels/cpu/cast_impl.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/cpu/cast_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/channel_shuffle_grad_kernel.cc b/paddle/phi/kernels/cpu/channel_shuffle_grad_kernel.cc index fcc91b2191673..e95b454dbf900 100644 --- a/paddle/phi/kernels/cpu/channel_shuffle_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/channel_shuffle_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/channel_shuffle_grad_kernel.h" -#include "paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h" PD_REGISTER_KERNEL(channel_shuffle_grad, CPU, diff --git a/paddle/phi/kernels/cpu/channel_shuffle_kernel.cc b/paddle/phi/kernels/cpu/channel_shuffle_kernel.cc index 95d19ec6a7746..0bac82e779c21 100644 --- a/paddle/phi/kernels/cpu/channel_shuffle_kernel.cc +++ b/paddle/phi/kernels/cpu/channel_shuffle_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/channel_shuffle_kernel.h" -#include "paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h" PD_REGISTER_KERNEL(channel_shuffle, CPU, diff --git a/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc b/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc index b6f5dd29ba2b7..612d10994cb17 100644 --- a/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h" PD_REGISTER_KERNEL(cholesky_solve_grad, CPU, diff --git a/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc b/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc index 02597560a7f51..11cb66f88c1f6 100644 --- a/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc +++ b/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" +#include "paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/clip_grad_kernel.cc b/paddle/phi/kernels/cpu/clip_grad_kernel.cc index bccdc0746d51c..89a14af10d16c 100644 --- a/paddle/phi/kernels/cpu/clip_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/clip_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/clip_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/clip_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/clip_kernel.cc b/paddle/phi/kernels/cpu/clip_kernel.cc index 5fd9aea966f8d..bcbb85279277e 100644 --- a/paddle/phi/kernels/cpu/clip_kernel.cc +++ b/paddle/phi/kernels/cpu/clip_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/clip_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/clip_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/compare_kernel.cc b/paddle/phi/kernels/cpu/compare_kernel.cc index 9006325a521ec..694b44c16d80e 100644 --- a/paddle/phi/kernels/cpu/compare_kernel.cc +++ b/paddle/phi/kernels/cpu/compare_kernel.cc @@ -13,11 +13,11 @@ // limitations under the License. #include "paddle/phi/kernels/compare_kernel.h" -#include "paddle/phi/kernels/impl/compare_kernel_impl.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/impl/compare_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/complex_grad_kernel.cc b/paddle/phi/kernels/cpu/complex_grad_kernel.cc index 5c1d50f5bf27d..11b7a05834607 100644 --- a/paddle/phi/kernels/cpu/complex_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/complex_grad_kernel.h" -#include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h" PD_REGISTER_KERNEL(real_grad, CPU, diff --git a/paddle/phi/kernels/cpu/complex_kernel.cc b/paddle/phi/kernels/cpu/complex_kernel.cc index 859d5a84527a2..bef0b7b747a42 100644 --- a/paddle/phi/kernels/cpu/complex_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/impl/complex_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/complex_kernel_impl.h" // See Note [ Why still include the fluid headers? ] #include "paddle/phi/common/complex.h" diff --git a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc index 4538ccf9433f9..3289c8f5c84d6 100644 --- a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/conv_grad_grad_kernel.h" -#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h" namespace phi { template diff --git a/paddle/phi/kernels/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_kernel.cc index 2d8a9bf1de733..880837dd7cd61 100644 --- a/paddle/phi/kernels/cpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/conv_grad_kernel.h" -#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/conv_kernel.cc b/paddle/phi/kernels/cpu/conv_kernel.cc index e0b4ee7d5776f..ec3253194930b 100644 --- a/paddle/phi/kernels/cpu/conv_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/conv_kernel.h" -#include "paddle/phi/kernels/impl/conv_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/conv_kernel_impl.h" namespace phi { template diff --git a/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc index 8d0749500695c..17fe44dea3f65 100644 --- a/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/conv_transpose_grad_kernel.h" -#include "paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/conv_transpose_kernel.cc b/paddle/phi/kernels/cpu/conv_transpose_kernel.cc index b4cacc850938e..ad9a5933f2809 100644 --- a/paddle/phi/kernels/cpu/conv_transpose_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_transpose_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/conv_transpose_kernel.h" -#include "paddle/phi/kernels/impl/conv_transpose_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/conv_transpose_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/cross_entropy_kernel.cc b/paddle/phi/kernels/cpu/cross_entropy_kernel.cc index c684fb416eaab..bd3eb3eb754c3 100644 --- a/paddle/phi/kernels/cpu/cross_entropy_kernel.cc +++ b/paddle/phi/kernels/cpu/cross_entropy_kernel.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/kernels/cross_entropy_kernel.h" +#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" @@ -21,8 +22,6 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/softmax_kernel.h" -#include "paddle/fluid/operators/math/cross_entropy.h" - namespace phi { template diff --git a/paddle/phi/kernels/cpu/cross_grad_kernel.cc b/paddle/phi/kernels/cpu/cross_grad_kernel.cc index 390420008e6ea..8dddc6f6e4e95 100644 --- a/paddle/phi/kernels/cpu/cross_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cross_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/cross_grad_kernel.h" -#include "paddle/phi/kernels/impl/cross_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/cross_grad_kernel_impl.h" PD_REGISTER_KERNEL(cross_grad, CPU, diff --git a/paddle/phi/kernels/cpu/cross_kernel.cc b/paddle/phi/kernels/cpu/cross_kernel.cc index a63f33174eacd..1f3a8fe5a3879 100644 --- a/paddle/phi/kernels/cpu/cross_kernel.cc +++ b/paddle/phi/kernels/cpu/cross_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/cross_kernel.h" -#include "paddle/phi/kernels/impl/cross_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/cross_kernel_impl.h" PD_REGISTER_KERNEL( cross, CPU, ALL_LAYOUT, phi::CrossKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/cumprod_kernel.cc b/paddle/phi/kernels/cpu/cumprod_kernel.cc index aea338027f5bb..4ecf092918418 100644 --- a/paddle/phi/kernels/cpu/cumprod_kernel.cc +++ b/paddle/phi/kernels/cpu/cumprod_kernel.cc @@ -16,6 +16,7 @@ #include #include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc index f64b1d3291f5e..a4d43ef8fbe89 100644 --- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc @@ -58,10 +58,9 @@ inline void ModulatedDeformableCol2imCPUKernel( int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; - const T* data_offset_ptr = data_offset + - (b * deformable_group + deformable_group_index) * - 2 * kernel_h * kernel_w * height_col * - width_col; + const T* data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; const int data_offset_w_ptr = @@ -75,9 +74,9 @@ inline void ModulatedDeformableCol2imCPUKernel( T cur_top_grad = data_col[thread]; if (data_mask) { - const T* data_mask_ptr = data_mask + - (b * deformable_group + deformable_group_index) * - kernel_h * kernel_w * height_col * width_col; + const T* data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; const T mask = data_mask_ptr[data_mask_hw_ptr]; cur_top_grad *= mask; } @@ -180,23 +179,20 @@ void ModulatedDeformableCol2imCoordCPUKernel( const int deformable_group_index = c / (2 * kernel_h * kernel_w); const int col_step = kernel_h * kernel_w; int cnt = 0; - const T* data_col_ptr = data_col + - deformable_group_index * - channel_per_deformable_group * batch_size * - width_col * height_col; - const T* data_im_ptr = data_im + - (b * deformable_group + deformable_group_index) * - channel_per_deformable_group / kernel_h / - kernel_w * height * width; - const T* data_offset_ptr = data_offset + - (b * deformable_group + deformable_group_index) * - 2 * kernel_h * kernel_w * height_col * - width_col; + const T* data_col_ptr = data_col + deformable_group_index * + channel_per_deformable_group * + batch_size * width_col * height_col; + const T* data_im_ptr = + data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * + height * width; + const T* data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; const T* data_mask_ptr = data_mask - ? data_mask + - (b * deformable_group + deformable_group_index) * kernel_h * - kernel_w * height_col * width_col + ? data_mask + (b * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col : nullptr; const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; diff --git a/paddle/phi/kernels/cpu/diag_grad_kernel.cc b/paddle/phi/kernels/cpu/diag_grad_kernel.cc index c56b225e2a753..616ea753ef1ba 100644 --- a/paddle/phi/kernels/cpu/diag_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/diag_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/diag_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/diag_functor.h" diff --git a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc index c3c290b4fe91e..5671e70c96e0a 100644 --- a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/diagonal_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/diagonal.h" diff --git a/paddle/phi/kernels/cpu/diagonal_kernel.cc b/paddle/phi/kernels/cpu/diagonal_kernel.cc index df17b458e1166..8ea5826ba25f7 100644 --- a/paddle/phi/kernels/cpu/diagonal_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/diagonal_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/diagonal.h" diff --git a/paddle/phi/kernels/cpu/digamma_grad_kernel.cc b/paddle/phi/kernels/cpu/digamma_grad_kernel.cc index da1b5ae556609..dc7fcaf6f92be 100644 --- a/paddle/phi/kernels/cpu/digamma_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/digamma_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/digamma_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/cpu/digamma_kernel.cc b/paddle/phi/kernels/cpu/digamma_kernel.cc index ee120a29b6061..80cbda4b7a9fc 100644 --- a/paddle/phi/kernels/cpu/digamma_kernel.cc +++ b/paddle/phi/kernels/cpu/digamma_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/digamma_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/cpu/dist_grad_kernel.cc b/paddle/phi/kernels/cpu/dist_grad_kernel.cc index 2b7f8f98f9473..c1aaa2adf7563 100644 --- a/paddle/phi/kernels/cpu/dist_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/dist_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/dist_grad_kernel.h" -#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h" PD_REGISTER_KERNEL( dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/dist_kernel.cc b/paddle/phi/kernels/cpu/dist_kernel.cc index ccf3d4be83230..0c7b5db64b38f 100644 --- a/paddle/phi/kernels/cpu/dist_kernel.cc +++ b/paddle/phi/kernels/cpu/dist_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/dist_kernel.h" -#include "paddle/phi/kernels/impl/dist_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/dist_kernel_impl.h" PD_REGISTER_KERNEL(dist, CPU, ALL_LAYOUT, phi::DistKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/dot_grad_kernel.cc b/paddle/phi/kernels/cpu/dot_grad_kernel.cc index a2abdb7c00900..883b77802217b 100644 --- a/paddle/phi/kernels/cpu/dot_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/dot_grad_kernel.cc @@ -13,12 +13,11 @@ // limitations under the License. #include "paddle/phi/kernels/dot_grad_kernel.h" -#include "paddle/phi/kernels/impl/dot_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/dot_grad_kernel_impl.h" PD_REGISTER_KERNEL(dot_grad, CPU, diff --git a/paddle/phi/kernels/cpu/dropout_grad_kernel.cc b/paddle/phi/kernels/cpu/dropout_grad_kernel.cc index b77a6c55b1471..db95656421884 100644 --- a/paddle/phi/kernels/cpu/dropout_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/dropout_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/dropout_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/cpu/dropout_kernel.cc b/paddle/phi/kernels/cpu/dropout_kernel.cc index fa12e505e4209..d9c02eff0106f 100644 --- a/paddle/phi/kernels/cpu/dropout_kernel.cc +++ b/paddle/phi/kernels/cpu/dropout_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/dropout_kernel.h" + #include "paddle/fluid/framework/generator.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/eigh_grad_kernel.cc b/paddle/phi/kernels/cpu/eigh_grad_kernel.cc index 5135778db56c5..db533416d2748 100644 --- a/paddle/phi/kernels/cpu/eigh_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/eigh_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/eigh_grad_kernel.h" -#include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h" PD_REGISTER_KERNEL(eigh_grad, CPU, diff --git a/paddle/phi/kernels/cpu/eigh_kernel.cc b/paddle/phi/kernels/cpu/eigh_kernel.cc index 92fd20ca9b825..0f0a10c837792 100644 --- a/paddle/phi/kernels/cpu/eigh_kernel.cc +++ b/paddle/phi/kernels/cpu/eigh_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/eigh_kernel.h" -#include "paddle/phi/kernels/funcs/values_vectors_functor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/values_vectors_functor.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/einsum_kernel.cc b/paddle/phi/kernels/cpu/einsum_kernel.cc index 8968542b3e0b8..401d2fd158a5d 100644 --- a/paddle/phi/kernels/cpu/einsum_kernel.cc +++ b/paddle/phi/kernels/cpu/einsum_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/einsum_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_impl.h" diff --git a/paddle/phi/kernels/cpu/elementwise.h b/paddle/phi/kernels/cpu/elementwise.h index 0f67df661136d..255dae7da014d 100644 --- a/paddle/phi/kernels/cpu/elementwise.h +++ b/paddle/phi/kernels/cpu/elementwise.h @@ -16,10 +16,9 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/common_shape.h" - -#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc index 5019b9f570628..b5e28ab39e5a6 100644 --- a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc index d380621818b35..15fe92c929194 100644 --- a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index 286b0d0ffaad9..f090ddd5bbe9a 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc b/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc index 2424a5330109c..349150373844b 100644 --- a/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc b/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc index 0e97852ac33e1..a013309233d47 100644 --- a/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/embedding_grad_kernel.cc b/paddle/phi/kernels/cpu/embedding_grad_kernel.cc index 21b3e6da8d9ef..fabb4e83d52f7 100644 --- a/paddle/phi/kernels/cpu/embedding_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/embedding_grad_kernel.cc @@ -13,11 +13,11 @@ // limitations under the License. #include "paddle/phi/kernels/embedding_grad_kernel.h" -#include "paddle/phi/kernels/funcs/embedding_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/embedding_util.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/embedding_kernel.cc b/paddle/phi/kernels/cpu/embedding_kernel.cc index 76cc3814b0567..0430f7a005221 100644 --- a/paddle/phi/kernels/cpu/embedding_kernel.cc +++ b/paddle/phi/kernels/cpu/embedding_kernel.cc @@ -13,12 +13,12 @@ // limitations under the License. #include "paddle/phi/kernels/embedding_kernel.h" -#include "paddle/phi/kernels/funcs/embedding_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/embedding_util.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/erf_grad_kernel.cc b/paddle/phi/kernels/cpu/erf_grad_kernel.cc index 3c1cd0df1531a..ae0b218bc0be3 100644 --- a/paddle/phi/kernels/cpu/erf_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/erf_grad_kernel.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/erf_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/erf_kernel.cc b/paddle/phi/kernels/cpu/erf_kernel.cc index 05ce4cab7fcef..ace9775c0b869 100644 --- a/paddle/phi/kernels/cpu/erf_kernel.cc +++ b/paddle/phi/kernels/cpu/erf_kernel.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/erf_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc index b1fe4f026ab07..2d363189936b0 100644 --- a/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/erfinv_grad_kernel.h" -#include "paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h" PD_REGISTER_KERNEL( erfinv_grad, CPU, ALL_LAYOUT, phi::ErfinvGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/erfinv_kernel.cc b/paddle/phi/kernels/cpu/erfinv_kernel.cc index 4f3a740f9d9be..f298cc358d662 100644 --- a/paddle/phi/kernels/cpu/erfinv_kernel.cc +++ b/paddle/phi/kernels/cpu/erfinv_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/erfinv_kernel.h" -#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h" PD_REGISTER_KERNEL(erfinv, CPU, ALL_LAYOUT, phi::ErfinvKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc b/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc index 6eafe9aa49dfe..c57e3a87281e0 100644 --- a/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/expand_as_grad_kernel.h" -#include "paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h" PD_REGISTER_KERNEL(expand_as_grad, CPU, diff --git a/paddle/phi/kernels/cpu/expand_as_kernel.cc b/paddle/phi/kernels/cpu/expand_as_kernel.cc index 697ea138097ee..4ec28ef8413cc 100644 --- a/paddle/phi/kernels/cpu/expand_as_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_as_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/expand_as_kernel.h" -#include "paddle/phi/kernels/impl/expand_as_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/expand_as_kernel_impl.h" PD_REGISTER_KERNEL(expand_as, CPU, diff --git a/paddle/phi/kernels/cpu/expand_grad_kernel.cc b/paddle/phi/kernels/cpu/expand_grad_kernel.cc index 4799a6aa7afdf..5cbbf253b747d 100644 --- a/paddle/phi/kernels/cpu/expand_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/expand_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/cpu/expand_kernel.cc b/paddle/phi/kernels/cpu/expand_kernel.cc index 077048976729f..2df833d0f9c30 100644 --- a/paddle/phi/kernels/cpu/expand_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/expand_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/cpu/eye_kernel.cc b/paddle/phi/kernels/cpu/eye_kernel.cc index a0d0f2c439096..ef3489d3fae0d 100644 --- a/paddle/phi/kernels/cpu/eye_kernel.cc +++ b/paddle/phi/kernels/cpu/eye_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/eye_kernel.h" -#include "paddle/phi/kernels/impl/eye_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/eye_kernel_impl.h" PD_REGISTER_KERNEL(eye, CPU, diff --git a/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc index 338be9e252da3..5434296be4dbe 100644 --- a/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/frobenius_norm_grad_kernel.h" -#include "paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h" PD_REGISTER_KERNEL(frobenius_norm_grad, CPU, diff --git a/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc b/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc index 77509b953bf39..56444ddad8d8b 100644 --- a/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/frobenius_norm_kernel.h" -#include "paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h" PD_REGISTER_KERNEL( frobenius_norm, CPU, ALL_LAYOUT, phi::FrobeniusNormKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc index 0b76425a659a0..ceb2312b53a0b 100644 --- a/paddle/phi/kernels/cpu/full_kernel.cc +++ b/paddle/phi/kernels/cpu/full_kernel.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" diff --git a/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc b/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc index b375a7ec4691c..88a288afd318e 100644 --- a/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/gather_nd_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" diff --git a/paddle/phi/kernels/cpu/gather_nd_kernel.cc b/paddle/phi/kernels/cpu/gather_nd_kernel.cc index aa32d036934e8..8ae866a1c8add 100644 --- a/paddle/phi/kernels/cpu/gather_nd_kernel.cc +++ b/paddle/phi/kernels/cpu/gather_nd_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/gather_nd_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/gather.h" diff --git a/paddle/phi/kernels/cpu/gather_tree_kernel.cc b/paddle/phi/kernels/cpu/gather_tree_kernel.cc index 25fb870d851f6..6f3cac6c4aa10 100644 --- a/paddle/phi/kernels/cpu/gather_tree_kernel.cc +++ b/paddle/phi/kernels/cpu/gather_tree_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/gather_tree_kernel.h" + #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/gaussian_random_kernel.cc index 348d24b534e3e..c600149cbbacc 100644 --- a/paddle/phi/kernels/cpu/gaussian_random_kernel.cc +++ b/paddle/phi/kernels/cpu/gaussian_random_kernel.cc @@ -14,11 +14,10 @@ #include "paddle/phi/kernels/gaussian_random_kernel.h" +#include "paddle/fluid/framework/generator.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/fluid/framework/generator.h" - namespace phi { template diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc index d7af220574565..4d23470aa4e9e 100644 --- a/paddle/phi/kernels/cpu/gelu_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/gelu_kernel.h" + #include #include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/cpu/graph_reindex_kernel.cc b/paddle/phi/kernels/cpu/graph_reindex_kernel.cc index c0a88f3222717..428bcb031704c 100644 --- a/paddle/phi/kernels/cpu/graph_reindex_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_reindex_kernel.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/graph_reindex_kernel.h" + #include #include -#include "paddle/phi/kernels/graph_reindex_kernel.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" @@ -59,11 +59,15 @@ void GraphReindexKernel(const Context& dev_ctx, src[i] = node_map[node]; } // Reindex Dst + // Add support for multi-type edges reindex + int num_edge_types = count.dims()[0] / bs; int cnt = 0; - for (int i = 0; i < bs; i++) { - for (int j = 0; j < count_data[i]; j++) { - T node = x_data[i]; - dst[cnt++] = node_map[node]; + for (int i = 0; i < num_edge_types; i++) { + for (int j = 0; j < bs; j++) { + for (int k = 0; k < count_data[i * bs + j]; k++) { + T node = x_data[j]; + dst[cnt++] = node_map[node]; + } } } diff --git a/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc b/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc index 70aac053417b8..1ef5373d6310b 100644 --- a/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include - #include "paddle/phi/kernels/graph_sample_neighbors_kernel.h" +#include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc index 6ea65d005c1ad..ad04bd258e141 100644 --- a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc @@ -13,12 +13,12 @@ // limitations under the License. #include "paddle/phi/kernels/graph_send_recv_grad_kernel.h" -#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" #include #include #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc index 8f71ba12cc4fa..e4034230c7866 100644 --- a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/phi/kernels/graph_send_recv_kernel.h" -#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" #include #include @@ -22,6 +21,7 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc index 923cb8424115e..32fa0d5aafefe 100644 --- a/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc @@ -73,8 +73,9 @@ static inline void ClipWithMask(const CPUContext& ctx, .cwiseMin(static_cast(max_val)); auto in_bound = (clipped == reflected).template cast(); grid_scale_t.device(place) = - grid_scale_t * ((is_neg == one_more_flip).template cast() - - (is_neg != one_more_flip).template cast()) * + grid_scale_t * + ((is_neg == one_more_flip).template cast() - + (is_neg != one_more_flip).template cast()) * in_bound; grid_slice_t.device(place) = clipped; } diff --git a/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc index a4c131e72b59a..832df98e0f3f6 100644 --- a/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/gumbel_softmax_grad_kernel.h" -#include "paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h" PD_REGISTER_KERNEL(gumbel_softmax_grad, CPU, diff --git a/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc index eb406665c5f4f..7638ca3aa7ee6 100644 --- a/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc +++ b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc @@ -13,11 +13,11 @@ // limitations under the License. #include "paddle/phi/kernels/gumbel_softmax_kernel.h" -#include "paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/axis_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/histogram_kernel.cc b/paddle/phi/kernels/cpu/histogram_kernel.cc index 82b88f868d8a7..d9c41508efde0 100644 --- a/paddle/phi/kernels/cpu/histogram_kernel.cc +++ b/paddle/phi/kernels/cpu/histogram_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/histogram_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc index 654f2c9400af0..b52a587070af6 100644 --- a/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/huber_loss_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/huber_loss_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_kernel.cc index 702c0589057af..2c4d8941ab87b 100644 --- a/paddle/phi/kernels/cpu/huber_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/huber_loss_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/huber_loss_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/huber_loss_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc index d060e8c9b2837..fe8ca4e432e21 100644 --- a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/index_sample_grad_kernel.h" + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc index b895e4aa7c0e7..faa6953704e80 100644 --- a/paddle/phi/kernels/cpu/index_sample_kernel.cc +++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc @@ -13,12 +13,14 @@ // limitations under the License. #include "paddle/phi/kernels/index_sample_kernel.h" + #include #include #include #include #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" diff --git a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc index 340d2907a7909..45ef003410926 100644 --- a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" @@ -42,10 +43,10 @@ using EigenVectorArrayMap = Eigen::Map>; template void InstanceNormGradKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& d_y, const paddle::optional& scale, const DenseTensor& saved_mean, const DenseTensor& saved_variance, + const DenseTensor& d_y, float epsilon, DenseTensor* d_x, DenseTensor* d_scale, @@ -142,12 +143,11 @@ void InstanceNormGradKernel(const Context& dev_ctx, dx_arr.device(*place) = scale_arr.broadcast(bcast_param) * inv_var_arr.broadcast(bcast) * (dy_arr - dy_mean - - tmp * - (dy_arr * tmp) - .mean(mean_rdims) - .reshape(NxC_shape) - .eval() - .broadcast(bcast)); + tmp * (dy_arr * tmp) + .mean(mean_rdims) + .reshape(NxC_shape) + .eval() + .broadcast(bcast)); } template diff --git a/paddle/phi/kernels/cpu/instance_norm_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_kernel.cc index 5eac473effa0e..4deced5499ecb 100644 --- a/paddle/phi/kernels/cpu/instance_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/instance_norm_kernel.cc @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc index d4e13aa3b24fe..edd41b2c7a31d 100644 --- a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/interpolate_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/isclose_kernel.cc b/paddle/phi/kernels/cpu/isclose_kernel.cc index 633c6ba093e42..dca21494b3ee9 100644 --- a/paddle/phi/kernels/cpu/isclose_kernel.cc +++ b/paddle/phi/kernels/cpu/isclose_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/isclose_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/isclose_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc index f9399d38d711f..9f6e2573e33e5 100644 --- a/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/kldiv_loss_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc index c462b8ec32c89..ecb1915cf420e 100644 --- a/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/kldiv_loss_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc b/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc index 74664fb270b2d..1a900b4bc2aff 100644 --- a/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/label_smooth_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/cpu/label_smooth_kernel.cc b/paddle/phi/kernels/cpu/label_smooth_kernel.cc index af9548e8186bc..cdeed73310d24 100644 --- a/paddle/phi/kernels/cpu/label_smooth_kernel.cc +++ b/paddle/phi/kernels/cpu/label_smooth_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/label_smooth_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc index a30f54fd4b60e..081a32b4f245b 100644 --- a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/layer_norm_grad_kernel.h" + #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/funcs/layer_norm_util.h" #if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc index 52722468e16bd..dbc3da0ca15ac 100644 --- a/paddle/phi/kernels/cpu/layer_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/layer_norm_kernel.h" + #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/funcs/layer_norm_util.h" #if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ diff --git a/paddle/phi/kernels/cpu/lerp_grad_kernel.cc b/paddle/phi/kernels/cpu/lerp_grad_kernel.cc index d74919011ec5d..ae98cb9d03aee 100644 --- a/paddle/phi/kernels/cpu/lerp_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/lerp_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/lerp_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/lerp_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/lerp_kernel.cc b/paddle/phi/kernels/cpu/lerp_kernel.cc index 7adfc35bfa321..d02e706d8d600 100644 --- a/paddle/phi/kernels/cpu/lerp_kernel.cc +++ b/paddle/phi/kernels/cpu/lerp_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/lerp_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/lerp_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc index 116fa3f8d3f6a..a87c01214a93d 100644 --- a/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/lgamma_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/lgamma_kernel.cc b/paddle/phi/kernels/cpu/lgamma_kernel.cc index f849322174d29..4979ad0b30bcd 100644 --- a/paddle/phi/kernels/cpu/lgamma_kernel.cc +++ b/paddle/phi/kernels/cpu/lgamma_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/lgamma_kernel.h" #include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc index 5f344b9cc3fe0..d3e5e90fd17a3 100644 --- a/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc @@ -55,10 +55,9 @@ struct LogSoftmaxGradFunctor { Eigen::DSizes one_axis(1, axis_dim); dx.device(*context.eigen_device()) = - dy - - (y.exp()) * (dy.reshape(batch_axis_remain) - .sum(along_class) - .broadcast(one_axis)); + dy - (y.exp()) * (dy.reshape(batch_axis_remain) + .sum(along_class) + .broadcast(one_axis)); } }; diff --git a/paddle/phi/kernels/cpu/log_softmax_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_kernel.cc index 241742378cc5d..510eb7a6ca97a 100644 --- a/paddle/phi/kernels/cpu/log_softmax_kernel.cc +++ b/paddle/phi/kernels/cpu/log_softmax_kernel.cc @@ -72,34 +72,31 @@ struct LogSoftmaxFunctor { // axis == -1, axis and class in same dimension, calculate along // class dimension directly for higher performance log_softmax.device(*context.eigen_device()) = - (logits - - logits.maximum(along_axis) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)) + (logits - logits.maximum(along_axis) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) .unaryExpr(ValueClip()); } else { // axis != -1, class dimension split into (axis, remain), max and sum // should be calculated along axis dimension log_softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - - logits.reshape(batch_axis_remain) - .maximum(along_axis) - .eval() - .reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) + (logits.reshape(batch_axis_remain) - logits.reshape(batch_axis_remain) + .maximum(along_axis) + .eval() + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) .unaryExpr(ValueClip()); } log_softmax.device(*context.eigen_device()) = - log_softmax - - log_softmax.exp() - .eval() - .reshape(batch_axis_remain) - .sum(along_axis) - .log() - .broadcast(one_axis); + log_softmax - log_softmax.exp() + .eval() + .reshape(batch_axis_remain) + .sum(along_axis) + .log() + .broadcast(one_axis); } }; diff --git a/paddle/phi/kernels/cpu/logsumexp_kernel.cc b/paddle/phi/kernels/cpu/logsumexp_kernel.cc index 06e0b30a9ca65..f1fecdfbe9e66 100644 --- a/paddle/phi/kernels/cpu/logsumexp_kernel.cc +++ b/paddle/phi/kernels/cpu/logsumexp_kernel.cc @@ -16,7 +16,6 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/kernels/impl/logsumexp_kernel_impl.h" PD_REGISTER_KERNEL( diff --git a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc index aba519ff04849..e3cd8fff8a50e 100644 --- a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" PD_REGISTER_KERNEL(matmul_grad, diff --git a/paddle/phi/kernels/cpu/matmul_kernel.cc b/paddle/phi/kernels/cpu/matmul_kernel.cc index 8aa25c0da07d9..c75a50130db76 100644 --- a/paddle/phi/kernels/cpu/matmul_kernel.cc +++ b/paddle/phi/kernels/cpu/matmul_kernel.cc @@ -15,9 +15,8 @@ limitations under the License. */ #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/matmul_kernel_impl.h" PD_REGISTER_KERNEL(matmul, diff --git a/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc b/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc index ae3b4d2b45582..0f60f8da71a8b 100644 --- a/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/matrix_power_grad_kernel.h" -#include "paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h" PD_REGISTER_KERNEL(matrix_power_grad, CPU, diff --git a/paddle/phi/kernels/cpu/matrix_power_kernel.cc b/paddle/phi/kernels/cpu/matrix_power_kernel.cc index f40e1e616f526..08ee7cbc865df 100644 --- a/paddle/phi/kernels/cpu/matrix_power_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_power_kernel.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/matrix_power_kernel.h" -#include "paddle/phi/kernels/impl/matrix_power_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/matrix_power_kernel_impl.h" PD_REGISTER_KERNEL( matrix_power, CPU, ALL_LAYOUT, phi::MatrixPowerKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc index 5e13abe8aed2c..f56bd3d6dbe8a 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/matrix_rank_kernel.h" -#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index 3bfc07319e98d..af9b7728389ba 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -16,6 +16,7 @@ #include #include + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/paddle/phi/kernels/cpu/maxout_grad_kernel.cc b/paddle/phi/kernels/cpu/maxout_grad_kernel.cc index 429344a362b1c..dad4e96b5a8b1 100644 --- a/paddle/phi/kernels/cpu/maxout_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/maxout_grad_kernel.cc @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h" - #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h" PD_REGISTER_KERNEL( maxout_grad, CPU, ALL_LAYOUT, phi::MaxOutGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/maxout_kernel.cc b/paddle/phi/kernels/cpu/maxout_kernel.cc index e7cd3ab07ff59..cc1d21d310b1f 100644 --- a/paddle/phi/kernels/cpu/maxout_kernel.cc +++ b/paddle/phi/kernels/cpu/maxout_kernel.cc @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/maxout_kernel_impl.h" - #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/maxout_kernel_impl.h" PD_REGISTER_KERNEL(maxout, CPU, ALL_LAYOUT, phi::MaxOutKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc b/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc index 159d109255381..5b43fb02b5117 100644 --- a/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/meshgrid_grad_kernel.h" -#include "paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h" PD_REGISTER_KERNEL(meshgrid_grad, CPU, diff --git a/paddle/phi/kernels/cpu/meshgrid_kernel.cc b/paddle/phi/kernels/cpu/meshgrid_kernel.cc index c201103b3dac4..35e43f7bbc85e 100644 --- a/paddle/phi/kernels/cpu/meshgrid_kernel.cc +++ b/paddle/phi/kernels/cpu/meshgrid_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/meshgrid_kernel.h" -#include "paddle/phi/kernels/impl/meshgrid_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/meshgrid_kernel_impl.h" PD_REGISTER_KERNEL(meshgrid, CPU, diff --git a/paddle/phi/kernels/cpu/momentum_kernel.cc b/paddle/phi/kernels/cpu/momentum_kernel.cc index 63cc5592ef422..7a4ea9f19e5c2 100644 --- a/paddle/phi/kernels/cpu/momentum_kernel.cc +++ b/paddle/phi/kernels/cpu/momentum_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/momentum_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/momentum_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc b/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc index 2cd75404be821..f6b07584ce44e 100644 --- a/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/multi_dot_grad_kernel.h" -#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" PD_REGISTER_KERNEL( multi_dot_grad, CPU, ALL_LAYOUT, phi::MultiDotGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/multi_dot_kernel.cc b/paddle/phi/kernels/cpu/multi_dot_kernel.cc index a4249a98e46dd..00cf425a038a1 100644 --- a/paddle/phi/kernels/cpu/multi_dot_kernel.cc +++ b/paddle/phi/kernels/cpu/multi_dot_kernel.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/multi_dot_kernel.h" -#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" PD_REGISTER_KERNEL( multi_dot, CPU, ALL_LAYOUT, phi::MultiDotKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc index f5a426e93db2c..12ba6dadde304 100644 --- a/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc @@ -15,7 +15,6 @@ #include "paddle/phi/kernels/multiplex_grad_kernel.h" #include "paddle/fluid/memory/memcpy.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/cpu/mv_kernel.cc b/paddle/phi/kernels/cpu/mv_kernel.cc index 7f76ddda6dde5..408eda34e1c00 100644 --- a/paddle/phi/kernels/cpu/mv_kernel.cc +++ b/paddle/phi/kernels/cpu/mv_kernel.cc @@ -16,7 +16,6 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/kernels/impl/mv_kernel_impl.h" PD_REGISTER_KERNEL(mv, CPU, ALL_LAYOUT, phi::MvKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc b/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc index 156124c214895..f8639a0d10fee 100644 --- a/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/phi/kernels/nanmedian_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/impl/nanmedian_grad_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/nanmedian_kernel.cc b/paddle/phi/kernels/cpu/nanmedian_kernel.cc index ed38405c9179f..03d7fe304be3e 100644 --- a/paddle/phi/kernels/cpu/nanmedian_kernel.cc +++ b/paddle/phi/kernels/cpu/nanmedian_kernel.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/nanmedian_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/nanmedian_kernel_impl.h" #include "paddle/phi/kernels/top_k_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc index dd2b09ee39acb..9048e87d04989 100644 --- a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/operators/math.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/nll_loss_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_kernel.cc index 92cb6a1ad17de..c966e91a9a6e9 100644 --- a/paddle/phi/kernels/cpu/nll_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/nll_loss_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/nll_loss_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/norm_grad_kernel.cc b/paddle/phi/kernels/cpu/norm_grad_kernel.cc index bd05e2c4c6ec1..92ca51b499c7a 100644 --- a/paddle/phi/kernels/cpu/norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_grad_kernel.cc @@ -13,15 +13,13 @@ // limitations under the License. #include "paddle/phi/kernels/norm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/kernels/funcs/common_shape.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace phi { template diff --git a/paddle/phi/kernels/cpu/norm_kernel.cc b/paddle/phi/kernels/cpu/norm_kernel.cc index 50906d9c3bb94..f69d03b66b1b5 100644 --- a/paddle/phi/kernels/cpu/norm_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/norm_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/common_shape.h" diff --git a/paddle/phi/kernels/cpu/one_hot_kernel.cc b/paddle/phi/kernels/cpu/one_hot_kernel.cc index fc7979e41d938..f408c9f036152 100644 --- a/paddle/phi/kernels/cpu/one_hot_kernel.cc +++ b/paddle/phi/kernels/cpu/one_hot_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/one_hot_kernel.h" + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/cpu/p_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/p_norm_grad_kernel.cc index 44ab050408653..32905ab087883 100644 --- a/paddle/phi/kernels/cpu/p_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/p_norm_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/p_norm_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/cpu/p_norm_kernel.cc b/paddle/phi/kernels/cpu/p_norm_kernel.cc index 9da7fdbb297c2..597939953b277 100644 --- a/paddle/phi/kernels/cpu/p_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/p_norm_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/p_norm_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/common_shape.h" diff --git a/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc b/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc index b32065d4f0a14..0e2bfd04b620e 100644 --- a/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/pixel_shuffle_grad_kernel.h" -#include "paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h" PD_REGISTER_KERNEL(pixel_shuffle_grad, CPU, diff --git a/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc b/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc index 80f8fa7b50efb..44dcb8b59f77c 100644 --- a/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc +++ b/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/pixel_shuffle_kernel.h" -#include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h" PD_REGISTER_KERNEL( pixel_shuffle, CPU, ALL_LAYOUT, phi::PixelShuffleKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc b/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc index ef61fca35957e..cbcbf1e129d20 100644 --- a/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/pixel_unshuffle_grad_kernel.h" -#include "paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h" PD_REGISTER_KERNEL(pixel_unshuffle_grad, CPU, diff --git a/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc b/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc index 9f4bc747f3209..837378972c69a 100644 --- a/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc +++ b/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/pixel_unshuffle_kernel.h" -#include "paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h" PD_REGISTER_KERNEL(pixel_unshuffle, CPU, diff --git a/paddle/phi/kernels/cpu/poisson_kernel.cc b/paddle/phi/kernels/cpu/poisson_kernel.cc index 6a3e32c2f0785..8ba1afe229eee 100644 --- a/paddle/phi/kernels/cpu/poisson_kernel.cc +++ b/paddle/phi/kernels/cpu/poisson_kernel.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/poisson_kernel.h" + #include #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/poisson_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/pool_grad_kernel.cc b/paddle/phi/kernels/cpu/pool_grad_kernel.cc index bb97694d8fc38..68cd57c52277b 100644 --- a/paddle/phi/kernels/cpu/pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/pool_grad_kernel.cc @@ -14,9 +14,8 @@ #include "paddle/phi/kernels/pool_grad_kernel.h" -#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" - #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" PD_REGISTER_KERNEL( pool2d_grad, CPU, ALL_LAYOUT, phi::Pool2dGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/pool_kernel.cc b/paddle/phi/kernels/cpu/pool_kernel.cc index 1d57e282c3c8a..3d3880692c0c8 100644 --- a/paddle/phi/kernels/cpu/pool_kernel.cc +++ b/paddle/phi/kernels/cpu/pool_kernel.cc @@ -14,9 +14,8 @@ #include "paddle/phi/kernels/pool_kernel.h" -#include "paddle/phi/kernels/impl/pool_kernel_impl.h" - #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pool_kernel_impl.h" PD_REGISTER_KERNEL(pool2d, CPU, ALL_LAYOUT, phi::Pool2dKernel, float, double) {} PD_REGISTER_KERNEL(max_pool2d_with_index, diff --git a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc index b68c3ad545d33..202baddd713a4 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/psroi_pool_grad_kernel.h" #include + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc index 4f7925ad00f5a..82eff70b75643 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/psroi_pool_kernel.h" #include + #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/qr_kernel.cc b/paddle/phi/kernels/cpu/qr_kernel.cc index b0e82cedb6b8b..6a5551d95571b 100644 --- a/paddle/phi/kernels/cpu/qr_kernel.cc +++ b/paddle/phi/kernels/cpu/qr_kernel.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include - #include "paddle/phi/kernels/qr_kernel.h" +#include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h index 35395dccca1af..dad288cff2c1a 100644 --- a/paddle/phi/kernels/cpu/reduce.h +++ b/paddle/phi/kernels/cpu/reduce.h @@ -17,10 +17,9 @@ #include #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/cast_kernel.h" - -#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" // See Note [ Why still include the fluid headers? ] diff --git a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc index 66ae5e02ffc75..abc18b1c578a8 100644 --- a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc @@ -111,4 +111,3 @@ PD_REGISTER_KERNEL(sum_grad, int64_t, phi::dtype::complex, phi::dtype::complex) {} - diff --git a/paddle/phi/kernels/cpu/rmsprop_kernel.cc b/paddle/phi/kernels/cpu/rmsprop_kernel.cc index fa1e1a2eed345..1d60823d75949 100644 --- a/paddle/phi/kernels/cpu/rmsprop_kernel.cc +++ b/paddle/phi/kernels/cpu/rmsprop_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/rmsprop_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/rmsprop_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/rnn_functor.h b/paddle/phi/kernels/cpu/rnn_functor.h index ab6f98ffcd5d6..911814647d6c0 100644 --- a/paddle/phi/kernels/cpu/rnn_functor.h +++ b/paddle/phi/kernels/cpu/rnn_functor.h @@ -14,6 +14,8 @@ #pragma once +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/operators/utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -21,9 +23,6 @@ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/operators/utils.h" - namespace phi { #define DEFINE_MODE_DETECTOR(MODE_NAME, MODE_STR) \ @@ -252,9 +251,12 @@ inline std::vector Unbind(const DenseTensor& in) { } template class LayerT, - template class SingleLayerT, - template class BidirLayerT, + template + class LayerT, + template + class SingleLayerT, + template + class BidirLayerT, typename T, typename Context> void RnnFunc(const Context& dev_ctx, diff --git a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc index 4dd1894320af7..1cd4add7d50e6 100644 --- a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc @@ -16,7 +16,6 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/cpu/rnn_functor.h" #include "paddle/phi/kernels/funcs/activation_functor.h" @@ -962,8 +961,10 @@ void dropout_cpu_grad_function_inplace(const CPUContext& dev_ctx, } template class SingleGradLayerT, - template class BidirGradLayerT, + template + class SingleGradLayerT, + template + class BidirGradLayerT, typename T> void RnnGradFunc(const CPUContext& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/cpu/rnn_kernel.cc b/paddle/phi/kernels/cpu/rnn_kernel.cc index 80c521918ed07..e2e784b2943cc 100644 --- a/paddle/phi/kernels/cpu/rnn_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_kernel.cc @@ -49,7 +49,8 @@ struct Cell { }; template class EigenActivationFunctor, + template + class EigenActivationFunctor, funcs::detail::ActivationType act_type> struct SimpleRNNCell : Cell { void operator()(const CPUContext* dev_ctx, diff --git a/paddle/phi/kernels/cpu/roi_align_kernel.cc b/paddle/phi/kernels/cpu/roi_align_kernel.cc index cd779b72e7a84..cf0dc47f47bd3 100644 --- a/paddle/phi/kernels/cpu/roi_align_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc @@ -79,16 +79,12 @@ std::vector> GetIndexesAndRatios( for (std::size_t px = 0; px < pooled_width; px++) { for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) { // calculate x of sample points - auto y = - roi_ymin + - bin_h * (py + - static_cast(iy + .5f) / static_cast(roi_bin_grid_h)); + auto y = roi_ymin + bin_h * (py + static_cast(iy + .5f) / + static_cast(roi_bin_grid_h)); for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) { // calculate x of sample points - auto x = roi_xmin + - bin_w * (px + - static_cast(ix + .5f) / - static_cast(roi_bin_grid_w)); + auto x = roi_xmin + bin_w * (px + static_cast(ix + .5f) / + static_cast(roi_bin_grid_w)); // deal with elements out of map if (y < -1.0 || y > height || x < -1.0 || x > width) { diff --git a/paddle/phi/kernels/cpu/scatter_grad_kernel.cc b/paddle/phi/kernels/cpu/scatter_grad_kernel.cc index 62fd58704c4fe..f09015f24a136 100644 --- a/paddle/phi/kernels/cpu/scatter_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/scatter_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/scatter_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" diff --git a/paddle/phi/kernels/cpu/scatter_kernel.cc b/paddle/phi/kernels/cpu/scatter_kernel.cc index d48ceaf29a08c..7032c3bb5a335 100644 --- a/paddle/phi/kernels/cpu/scatter_kernel.cc +++ b/paddle/phi/kernels/cpu/scatter_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/scatter_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" diff --git a/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc b/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc index cc143ba8d0e45..7c3665c5d2e2e 100644 --- a/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/scatter_nd_add_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" diff --git a/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc b/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc index 04ae10f5e8b5d..31e2f4c716122 100644 --- a/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc +++ b/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/scatter_nd_add_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" diff --git a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc index a5c9dc4c55e49..744ec7805fa60 100644 --- a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/segment_pool_grad_kernel.h" -#include "paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h" PD_REGISTER_KERNEL(segment_pool_grad, CPU, diff --git a/paddle/phi/kernels/cpu/segment_pool_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_kernel.cc index ad76a7a86bcb2..541ccd3436548 100644 --- a/paddle/phi/kernels/cpu/segment_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/segment_pool_kernel.h" -#include "paddle/phi/kernels/impl/segment_pool_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/segment_pool_kernel_impl.h" PD_REGISTER_KERNEL(segment_pool, CPU, diff --git a/paddle/phi/kernels/cpu/selu_grad_kernel.cc b/paddle/phi/kernels/cpu/selu_grad_kernel.cc index 32101b1913282..9f83e39a363d3 100644 --- a/paddle/phi/kernels/cpu/selu_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/selu_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/selu_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/selu_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/sgd_kernel.cc b/paddle/phi/kernels/cpu/sgd_kernel.cc index 214fd82bef358..055c44d38e4b2 100644 --- a/paddle/phi/kernels/cpu/sgd_kernel.cc +++ b/paddle/phi/kernels/cpu/sgd_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/sgd_kernel.h" + #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/sign_kernel.cc b/paddle/phi/kernels/cpu/sign_kernel.cc index 5fe11ffbd6d5c..9ded252c5c592 100644 --- a/paddle/phi/kernels/cpu/sign_kernel.cc +++ b/paddle/phi/kernels/cpu/sign_kernel.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/sign_kernel.h" -#include "paddle/phi/kernels/impl/sign_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/sign_kernel_impl.h" // See Note [ Why still include the fluid headers? ] #include "paddle/phi/common/bfloat16.h" diff --git a/paddle/phi/kernels/cpu/size_kernel.cc b/paddle/phi/kernels/cpu/size_kernel.cc index 71ebf9cdc09f7..ca8373b84889d 100644 --- a/paddle/phi/kernels/cpu/size_kernel.cc +++ b/paddle/phi/kernels/cpu/size_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/size_kernel.h" -#include "paddle/phi/kernels/impl/size_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/size_kernel_impl.h" PD_REGISTER_KERNEL(size, CPU, diff --git a/paddle/phi/kernels/cpu/slice_grad_kernel.cc b/paddle/phi/kernels/cpu/slice_grad_kernel.cc index 5c2cb3ea80e87..7e3efd217511f 100644 --- a/paddle/phi/kernels/cpu/slice_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/slice_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/slice_grad_kernel.h" -#include "paddle/phi/kernels/impl/slice_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/slice_grad_kernel_impl.h" PD_REGISTER_KERNEL(slice_grad, CPU, diff --git a/paddle/phi/kernels/cpu/slice_kernel.cc b/paddle/phi/kernels/cpu/slice_kernel.cc index 736540609dd72..0f2fe98a85323 100644 --- a/paddle/phi/kernels/cpu/slice_kernel.cc +++ b/paddle/phi/kernels/cpu/slice_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/slice_kernel.h" -#include "paddle/phi/kernels/impl/slice_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/slice_kernel_impl.h" PD_REGISTER_KERNEL(slice, CPU, diff --git a/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc b/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc index d78477073ad03..d296aba66503b 100644 --- a/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc @@ -13,12 +13,12 @@ // limitations under the License. #include "paddle/phi/kernels/sparse_weight_embedding_grad_kernel.h" -#include "paddle/phi/kernels/funcs/embedding_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/embedding_util.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc b/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc index c0f95d03888b8..cfdccb5c8d9ba 100644 --- a/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc +++ b/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc @@ -12,14 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/embedding_kernel.h" -#include "paddle/phi/kernels/funcs/embedding_util.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/embedding_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/embedding_util.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index 56d872922490a..288cdd235aede 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -17,7 +17,6 @@ #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc index 400f7e8783932..2aff156819748 100644 --- a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/temporal_shift_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc index 6721117992dd5..29be487131964 100644 --- a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc +++ b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/temporal_shift_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc index 9dbcf575f33c1..dee69222e6dc0 100644 --- a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/transpose_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc index 14aca258a2c71..660254fef86f6 100644 --- a/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" PD_REGISTER_KERNEL(tril_triu_grad, CPU, diff --git a/paddle/phi/kernels/cpu/tril_triu_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_kernel.cc index a3d20e55e21fb..f3599bb92b97b 100644 --- a/paddle/phi/kernels/cpu/tril_triu_kernel.cc +++ b/paddle/phi/kernels/cpu/tril_triu_kernel.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" PD_REGISTER_KERNEL(tril_triu, CPU, diff --git a/paddle/phi/kernels/cpu/trunc_grad_kernel.cc b/paddle/phi/kernels/cpu/trunc_grad_kernel.cc index 4d85dd609e2d1..24fc389256222 100644 --- a/paddle/phi/kernels/cpu/trunc_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/trunc_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/trunc_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/trunc_kernel.cc b/paddle/phi/kernels/cpu/trunc_kernel.cc index babae6ce7c931..5fe33ec6a4b2e 100644 --- a/paddle/phi/kernels/cpu/trunc_kernel.cc +++ b/paddle/phi/kernels/cpu/trunc_kernel.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/trunc_kernel.h" + #include #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/trunc_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/unfold_grad_kernel.cc b/paddle/phi/kernels/cpu/unfold_grad_kernel.cc index c97005dd84547..6ba4ba49b9af9 100644 --- a/paddle/phi/kernels/cpu/unfold_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/unfold_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/unfold_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unfold_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/unfold_kernel.cc b/paddle/phi/kernels/cpu/unfold_kernel.cc index e38d8acd09820..f15201542e6c1 100644 --- a/paddle/phi/kernels/cpu/unfold_kernel.cc +++ b/paddle/phi/kernels/cpu/unfold_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/unfold_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unfold_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/uniform_random_kernel.cc b/paddle/phi/kernels/cpu/uniform_random_kernel.cc index c95a8f4ded6dc..a09812363f1d8 100644 --- a/paddle/phi/kernels/cpu/uniform_random_kernel.cc +++ b/paddle/phi/kernels/cpu/uniform_random_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/uniform_random_kernel.h" + #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/unique_kernel.cc b/paddle/phi/kernels/cpu/unique_kernel.cc index 853b401315d22..834f05f73e228 100644 --- a/paddle/phi/kernels/cpu/unique_kernel.cc +++ b/paddle/phi/kernels/cpu/unique_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/unique_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" diff --git a/paddle/phi/kernels/cpu/unstack_grad_kernel.cc b/paddle/phi/kernels/cpu/unstack_grad_kernel.cc index 9c2dce808dca7..c494cbc965eff 100644 --- a/paddle/phi/kernels/cpu/unstack_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/unstack_grad_kernel.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/unstack_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/unstack_kernel.cc b/paddle/phi/kernels/cpu/unstack_kernel.cc index 3d233e9ec405f..4bc8d1b2c93b2 100644 --- a/paddle/phi/kernels/cpu/unstack_kernel.cc +++ b/paddle/phi/kernels/cpu/unstack_kernel.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/unstack_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unstack_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc b/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc index fab49f5416048..c98a098aa0e6f 100644 --- a/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc +++ b/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc @@ -109,7 +109,8 @@ struct Gather { }; template typename CompareFunctor, + template + typename CompareFunctor, typename T> struct GetMask { void operator()(const Context& dev_ctx, @@ -122,7 +123,8 @@ struct GetMask { }; template typename BinaryFunctor, + template + typename BinaryFunctor, typename T> struct BinaryOperation { void operator()(const Context& dev_ctx, diff --git a/paddle/phi/kernels/cpu/warpctc_grad_kernel.cc b/paddle/phi/kernels/cpu/warpctc_grad_kernel.cc index 0b29336335481..7d70d825250ee 100644 --- a/paddle/phi/kernels/cpu/warpctc_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/warpctc_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/warpctc_grad_kernel.h" -#include "paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h" PD_REGISTER_KERNEL( warpctc_grad, CPU, ALL_LAYOUT, phi::WarpctcGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/warpctc_kernel.cc b/paddle/phi/kernels/cpu/warpctc_kernel.cc index 4b87202c11e92..239c6cb0cbe04 100644 --- a/paddle/phi/kernels/cpu/warpctc_kernel.cc +++ b/paddle/phi/kernels/cpu/warpctc_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/warpctc_kernel.h" -#include "paddle/phi/kernels/impl/warpctc_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/warpctc_kernel_impl.h" PD_REGISTER_KERNEL( warpctc, CPU, ALL_LAYOUT, phi::WarpctcKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/yolo_box_kernel.cc b/paddle/phi/kernels/cpu/yolo_box_kernel.cc index a83bc019fc3af..6b882ad289512 100644 --- a/paddle/phi/kernels/cpu/yolo_box_kernel.cc +++ b/paddle/phi/kernels/cpu/yolo_box_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/yolo_box_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/yolo_box_util.h" diff --git a/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc index 383009229f9a1..655106e9cb44d 100644 --- a/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/yolov3_loss_grad_kernel.h" + #include #include -#include "paddle/phi/kernels/yolov3_loss_grad_kernel.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/yolov3_loss_functor.h" diff --git a/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc b/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc index 8a190ab25a7b2..75b2e3c5c4a0e 100644 --- a/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/yolov3_loss_kernel.h" + #include #include -#include "paddle/phi/kernels/yolov3_loss_kernel.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/yolov3_loss_functor.h" diff --git a/paddle/phi/kernels/cumprod_grad_kernel.h b/paddle/phi/kernels/cumprod_grad_kernel.h index b3cb17b28e07f..7610cad31e327 100644 --- a/paddle/phi/kernels/cumprod_grad_kernel.h +++ b/paddle/phi/kernels/cumprod_grad_kernel.h @@ -25,4 +25,4 @@ void CumprodGradKernel(const Context& dev_ctx, const DenseTensor& dout, int dim, DenseTensor* dx); -} // phi +} // namespace phi diff --git a/paddle/phi/kernels/cumprod_kernel.h b/paddle/phi/kernels/cumprod_kernel.h index 96d76cb0f4370..bb8b1427b30c4 100644 --- a/paddle/phi/kernels/cumprod_kernel.h +++ b/paddle/phi/kernels/cumprod_kernel.h @@ -23,4 +23,4 @@ void CumprodKernel(const Context& dev_ctx, const DenseTensor& x, int dim, DenseTensor* out); -} // phi +} // namespace phi diff --git a/paddle/phi/kernels/diagonal_kernel.h b/paddle/phi/kernels/diagonal_kernel.h index 7cf7282307a4b..10afd7dbe920a 100644 --- a/paddle/phi/kernels/diagonal_kernel.h +++ b/paddle/phi/kernels/diagonal_kernel.h @@ -25,4 +25,4 @@ void DiagonalKernel(const Context& dev_ctx, int axis1, int axis2, DenseTensor* out); -} // phi +} // namespace phi diff --git a/paddle/phi/kernels/digamma_grad_kernel.h b/paddle/phi/kernels/digamma_grad_kernel.h index ae5346080d30d..abd8634518d2c 100644 --- a/paddle/phi/kernels/digamma_grad_kernel.h +++ b/paddle/phi/kernels/digamma_grad_kernel.h @@ -24,4 +24,4 @@ void DigammaGradKernel(const Context& ctx, const DenseTensor& out_grad, DenseTensor* x_grad); -} // namepsace phi +} // namespace phi diff --git a/paddle/phi/kernels/digamma_kernel.h b/paddle/phi/kernels/digamma_kernel.h index ce25f2e148e96..3cf1eae67cc3e 100644 --- a/paddle/phi/kernels/digamma_kernel.h +++ b/paddle/phi/kernels/digamma_kernel.h @@ -21,4 +21,4 @@ namespace phi { template void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); -} // namepsace phi +} // namespace phi diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 06d258a8a4e80..d8cf0bd2ef90d 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -14,9 +14,8 @@ #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/expand_kernel.h b/paddle/phi/kernels/expand_kernel.h index 3b44c46e4dd7c..930240db6ccca 100644 --- a/paddle/phi/kernels/expand_kernel.h +++ b/paddle/phi/kernels/expand_kernel.h @@ -26,4 +26,4 @@ void ExpandKernel(const Context& ctx, const IntArray& shape, DenseTensor* out); -} // namepsace phi +} // namespace phi diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 83f96c1f9f521..54279fca6e429 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/flatten_grad_kernel.h" + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index f304e7706add4..dd000896073c7 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/flatten_kernel.h" + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/phi/kernels/frobenius_norm_grad_kernel.h b/paddle/phi/kernels/frobenius_norm_grad_kernel.h index cfe8192d1a69b..65db8dd9e0a10 100644 --- a/paddle/phi/kernels/frobenius_norm_grad_kernel.h +++ b/paddle/phi/kernels/frobenius_norm_grad_kernel.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/frobenius_norm_kernel.h b/paddle/phi/kernels/frobenius_norm_kernel.h index f5f37ee0c0fa5..30122cb416094 100644 --- a/paddle/phi/kernels/frobenius_norm_kernel.h +++ b/paddle/phi/kernels/frobenius_norm_kernel.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/full_kernel.h b/paddle/phi/kernels/full_kernel.h index d5785f2eedafa..228e862a09c79 100644 --- a/paddle/phi/kernels/full_kernel.h +++ b/paddle/phi/kernels/full_kernel.h @@ -19,7 +19,6 @@ #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" - #include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/kernels/empty_kernel.h" diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index f80117ccec799..f481821a7bfcc 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -15,14 +15,14 @@ #pragma once #include + #include +#include #include #include #include #include #include - -#include #ifndef _USE_MATH_DEFINES #define _USE_MATH_DEFINES #endif @@ -986,9 +986,9 @@ struct BReluGradFunctor : public BaseActivationFunctor { typename dOut, typename dX> void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((x > static_cast(t_min)) * (x < static_cast(t_max))) - .template cast(); + dx.device(d) = + dout * ((x > static_cast(t_min)) * (x < static_cast(t_max))) + .template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } @@ -1054,11 +1054,10 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor { GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad")); auto ddout = EigenVector::Flatten( GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad")); - ddout.device(*d) = - ddx * - ((x > static_cast(0)).template cast() + - static_cast(alpha) * (x <= static_cast(0)).template cast()) - .template cast(); + ddout.device(*d) = ddx * ((x > static_cast(0)).template cast() + + static_cast(alpha) * + (x <= static_cast(0)).template cast()) + .template cast(); } } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } @@ -1290,11 +1289,10 @@ struct ELUGradGradFunctor : public BaseActivationFunctor { if (ddOut) { auto ddout = EigenVector::Flatten( GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad")); - ddout.device(*d) = ddx * - ((x > static_cast(0)).template cast() + - static_cast(alpha) * x.exp() * - (x <= static_cast(0)).template cast()) - .template cast(); + ddout.device(*d) = ddx * ((x > static_cast(0)).template cast() + + static_cast(alpha) * x.exp() * + (x <= static_cast(0)).template cast()) + .template cast(); } } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } @@ -1980,11 +1978,10 @@ struct CELUGradGradFunctor : public BaseActivationFunctor { if (ddOut) { auto ddout = EigenVector::Flatten( GET_DATA_SAFELY(ddOut, "Output", "DDOut", "CELUGradGrad")); - ddout.device(*d) = ddx * - ((x > static_cast(0)).template cast() + - (x / static_cast(alpha)).exp() * - (x <= static_cast(0)).template cast()) - .template cast(); + ddout.device(*d) = ddx * ((x > static_cast(0)).template cast() + + (x / static_cast(alpha)).exp() * + (x <= static_cast(0)).template cast()) + .template cast(); } } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } diff --git a/paddle/phi/kernels/funcs/adam_functors.h b/paddle/phi/kernels/funcs/adam_functors.h index 2f706f0ef1c36..b14ee7f072e4e 100644 --- a/paddle/phi/kernels/funcs/adam_functors.h +++ b/paddle/phi/kernels/funcs/adam_functors.h @@ -14,6 +14,7 @@ #pragma once #include // for sqrt in CPU and CUDA + #include #include "paddle/phi/kernels/funcs/algorithm.h" @@ -169,9 +170,8 @@ class AdamFunctor { moment1_out = beta1_ * mom1 + (1 - beta1_) * g; moment2_out = beta2_ * mom2 + (1 - beta2_) * g * g; - param_out = param - - lr * (moment1_out / - (moment2_out.sqrt() + epsilon_ * sqrt(1 - beta2_pow))); + param_out = param - lr * (moment1_out / (moment2_out.sqrt() + + epsilon_ * sqrt(1 - beta2_pow))); } }; diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h index 14a9560b841fa..70f75d5352ac5 100644 --- a/paddle/phi/kernels/funcs/aligned_vector.h +++ b/paddle/phi/kernels/funcs/aligned_vector.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/phi/core/hostdevice.h" #if defined(__xpu__) #define CHAR_BIT 8 @@ -45,11 +46,11 @@ HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { } /* -* Only the address of input data is the multiplier of 1,2,4, vectorized load -* with corresponding multiplier-value is possible. Moreover, the maximum length -* of vectorized load is 128 bits once. Hence, valid length of vectorized load -* shall be determined under both former constraints. -*/ + * Only the address of input data is the multiplier of 1,2,4, vectorized load + * with corresponding multiplier-value is possible. Moreover, the maximum length + * of vectorized load is 128 bits once. Hence, valid length of vectorized load + * shall be determined under both former constraints. + */ template int GetVectorizedSize(const T* pointer) { constexpr int max_load_bits = 128; @@ -60,11 +61,11 @@ int GetVectorizedSize(const T* pointer) { constexpr int vec2 = std::alignment_of>::value; // NOLINT if (address % vec8 == 0) { /* - * Currently, decide to deal with no more than 4 data once while adopting - * vectorization load/store, if performance test shows that dealing with - * 8 data once in vectorization load/store does get optimized, return code - * below can be changed into " return std::min(8, valid_vec_size); " . - */ + * Currently, decide to deal with no more than 4 data once while adopting + * vectorization load/store, if performance test shows that dealing with + * 8 data once in vectorization load/store does get optimized, return code + * below can be changed into " return std::min(8, valid_vec_size); " . + */ return std::min(4, valid_vec_size); } else if (address % vec4 == 0) { return std::min(4, valid_vec_size); diff --git a/paddle/phi/kernels/funcs/blas/CMakeLists.txt b/paddle/phi/kernels/funcs/blas/CMakeLists.txt index cb054cc76e1d7..732114f2a6e80 100644 --- a/paddle/phi/kernels/funcs/blas/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/blas/CMakeLists.txt @@ -1 +1,4 @@ -cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) +cc_library( + blas + SRCS blas.cc + DEPS cblas framework_proto device_context) diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index e2b16a1eb7ff1..3e197a18f96b9 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -14,11 +14,10 @@ #pragma once -#include "paddle/fluid/platform/dynload/cublas.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/funcs/math_function.h" DECLARE_bool(enable_cublas_tensor_op_math); DECLARE_bool(gemm_use_half_precision_compute_type); diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index ecdfa7abcfd42..88b87c07c7615 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -456,21 +456,16 @@ void LaunchBroadcastKernel( read_lens * gpu_config.GetBlockSize(); int tail_tid = numel % (read_lens * gpu_config.GetBlockSize()); #endif - VectorizedBroadcastKernel<<>>( - ins_data, - outs_data, - use_broadcast, - numel, - configs, - main_offset, - tail_tid, - read_lens, - func); + VectorizedBroadcastKernel + <<>>(ins_data, + outs_data, + use_broadcast, + numel, + configs, + main_offset, + tail_tid, + read_lens, + func); } template dims().size()); } - axis = axis == -1 - ? *std::max_element(dims_size.begin(), dims_size.end()) - - *std::min_element(dims_size.begin(), dims_size.end()) - : axis; + axis = axis == -1 ? *std::max_element(dims_size.begin(), dims_size.end()) - + *std::min_element(dims_size.begin(), dims_size.end()) + : axis; BroadcastKernelForDifferentVecSize( ctx, ins, outs, axis, func); } diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 06be592dd9375..5abaf6c2ffa87 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" - #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc index ea256e93bba75..48858fa59390e 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc @@ -60,14 +60,12 @@ inline void ModulatedDeformableIm2colCPUKernel( const T* data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const T* data_offset_ptr = - data_offset + - (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * - kernel_w * height_col * width_col; + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; const T* data_mask_ptr = data_mask - ? data_mask + - (b_col * deformable_group + deformable_group_index) * - kernel_h * kernel_w * height_col * width_col + ? data_mask + (b_col * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col : nullptr; for (int i = 0; i < kernel_h; ++i) { diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu index 8bfb46c6636e9..bebea5dcb74ca 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/funcs/deformable_conv_functor.h" namespace phi { namespace funcs { @@ -70,14 +69,12 @@ __global__ void ModulatedDeformableIm2colGpuKernel( const T* data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const T* data_offset_ptr = - data_offset + - (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * - kernel_w * height_col * width_col; + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; const T* data_mask_ptr = data_mask - ? data_mask + - (b_col * deformable_group + deformable_group_index) * - kernel_h * kernel_w * height_col * width_col + ? data_mask + (b_col * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col : nullptr; for (int i = 0; i < kernel_h; ++i) { @@ -129,28 +126,28 @@ void ModulatedDeformableIm2col(const Context& dev_ctx, int blocks = NumBlocks(num_kernels); int threads = kNumCUDAThreads; - ModulatedDeformableIm2colGpuKernel< - T><<>>(num_kernels, - data_im, - data_offset, - data_mask, - im_shape[1], - im_shape[2], - filter_shape[2], - filter_shape[3], - paddings[0], - paddings[1], - strides[0], - strides[1], - dilations[0], - dilations[1], - channel_per_deformable_group, - col_shape[1], - im_shape[0], - deformable_groups, - col_shape[2], - col_shape[3], - data_col); + ModulatedDeformableIm2colGpuKernel + <<>>(num_kernels, + data_im, + data_offset, + data_mask, + im_shape[1], + im_shape[2], + filter_shape[2], + filter_shape[3], + paddings[0], + paddings[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + channel_per_deformable_group, + col_shape[1], + im_shape[0], + deformable_groups, + col_shape[2], + col_shape[3], + data_col); } template void ModulatedDeformableIm2col( diff --git a/paddle/phi/kernels/funcs/detail/activation_functions.h b/paddle/phi/kernels/funcs/detail/activation_functions.h index 475557f164210..d41dca33f7571 100644 --- a/paddle/phi/kernels/funcs/detail/activation_functions.h +++ b/paddle/phi/kernels/funcs/detail/activation_functions.h @@ -14,8 +14,10 @@ limitations under the License. */ #pragma once #include + #include #include + #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/core/hostdevice.h" diff --git a/paddle/phi/kernels/funcs/detail/avx_mathfun.h b/paddle/phi/kernels/funcs/detail/avx_mathfun.h index e5e7388d51dff..75e4922648c20 100644 --- a/paddle/phi/kernels/funcs/detail/avx_mathfun.h +++ b/paddle/phi/kernels/funcs/detail/avx_mathfun.h @@ -356,11 +356,11 @@ v8sf sin256_ps(v8sf x) { // any x /* scale by 4/Pi */ y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); -/* - Here we start a series of integer operations, which are in the - realm of AVX2. - If we don't have AVX, let's perform them using SSE2 directives -*/ + /* + Here we start a series of integer operations, which are in the + realm of AVX2. + If we don't have AVX, let's perform them using SSE2 directives + */ #ifdef __AVX2__ /* store the integer part of y in mm0 */ diff --git a/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h index 0016bfb64c96e..0fdf490c5534d 100644 --- a/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/phi/kernels/funcs/activation_functor.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" diff --git a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h index 6657417beac8d..93232d8f7f434 100644 --- a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" diff --git a/paddle/phi/kernels/funcs/detail/gru_kernel.h b/paddle/phi/kernels/funcs/detail/gru_kernel.h index db53fc4576daa..9e2aef1940619 100644 --- a/paddle/phi/kernels/funcs/detail/gru_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_kernel.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" diff --git a/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h index ed8e749f7fdad..02fddc57b313a 100644 --- a/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/phi/kernels/funcs/activation_functor.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" diff --git a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h index 6d4c430d9e648..5d06dddd9645b 100644 --- a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h @@ -249,27 +249,27 @@ void gpu_lstm_forward(const paddle::platform::DeviceContext& context, if (batch_size == 1) { KeLstmForward<<>>( - op, - value, - frame_size, - batch_size, - cell_clip, - active_node, - active_gate, - active_state); + /* is_batch= */ false> + <<>>(op, + value, + frame_size, + batch_size, + cell_clip, + active_node, + active_gate, + active_state); } else { KeLstmForward<<>>( - op, - value, - frame_size, - batch_size, - cell_clip, - active_node, - active_gate, - active_state); + /* is_batch= */ true> + <<>>(op, + value, + frame_size, + batch_size, + cell_clip, + active_node, + active_gate, + active_state); } } @@ -303,29 +303,29 @@ void gpu_lstm_backward(const paddle::platform::DeviceContext& context, if (batch_size == 1) { KeLstmBackward<<>>( - op, - value, - grad, - frame_size, - batch_size, - cell_clip, - active_node, - active_gate, - active_state); + /* is_batch= */ false> + <<>>(op, + value, + grad, + frame_size, + batch_size, + cell_clip, + active_node, + active_gate, + active_state); } else { KeLstmBackward<<>>( - op, - value, - grad, - frame_size, - batch_size, - cell_clip, - active_node, - active_gate, - active_state); + /* is_batch= */ true> + <<>>(op, + value, + grad, + frame_size, + batch_size, + cell_clip, + active_node, + active_gate, + active_state); } } diff --git a/paddle/phi/kernels/funcs/detail/lstm_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_kernel.h index 8b42926412525..0846f05a0c2c5 100644 --- a/paddle/phi/kernels/funcs/detail/lstm_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_kernel.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h index 19a93970d090a..81525cb25449e 100644 --- a/paddle/phi/kernels/funcs/diagonal.h +++ b/paddle/phi/kernels/funcs/diagonal.h @@ -17,6 +17,7 @@ #if defined(__NVCC__) || defined(__HIPCC__) #include #include + #include "paddle/phi/kernels/primitive/kernel_primitives.h" #endif diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h index 68e986c334ecb..0e6b3a3f9d733 100644 --- a/paddle/phi/kernels/funcs/distribution_helper.h +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -319,10 +319,9 @@ void distribution_and_transform(const GPUContext &ctx, uint64_t seed = seed_offset.first; uint64_t offset = seed_offset.second; - DistributionKernel<<>>( - size, seed, offset, dist, trans, out_data, total_thread); + DistributionKernel + <<>>( + size, seed, offset, dist, trans, out_data, total_thread); } #endif diff --git a/paddle/phi/kernels/funcs/eigen/CMakeLists.txt b/paddle/phi/kernels/funcs/eigen/CMakeLists.txt index 8b64e35b93526..de771f12fbfe2 100644 --- a/paddle/phi/kernels/funcs/eigen/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/eigen/CMakeLists.txt @@ -1,9 +1,24 @@ -file(GLOB EIGEN_CC_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -file(GLOB EIGEN_CU_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu") +file( + GLOB EIGEN_CC_SOURCES + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "*.cc") +file( + GLOB EIGEN_CU_SOURCES + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "*.cu") if(WITH_GPU) - nv_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3) + nv_library( + eigen_function + SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} + DEPS eigen3) elseif(WITH_ROCM) - hip_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3) + hip_library( + eigen_function + SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} + DEPS eigen3) else() - cc_library(eigen_function SRCS ${EIGEN_CC_SOURCES} DEPS eigen3) + cc_library( + eigen_function + SRCS ${EIGEN_CC_SOURCES} + DEPS eigen3) endif() diff --git a/paddle/phi/kernels/funcs/eigen/extensions.h b/paddle/phi/kernels/funcs/eigen/extensions.h index fbb9d8e3d2ef5..c724564417b19 100644 --- a/paddle/phi/kernels/funcs/eigen/extensions.h +++ b/paddle/phi/kernels/funcs/eigen/extensions.h @@ -20,7 +20,6 @@ #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/hostdevice.h" - #include "unsupported/Eigen/CXX11/Tensor" namespace Eigen { diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 1093bdfa726c8..71dfbc206a191 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -494,7 +494,7 @@ template